diff options
Diffstat (limited to 'net')
442 files changed, 9966 insertions, 8130 deletions
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c index 7008d53e455c..e61679bf0908 100644 --- a/net/6lowpan/nhc.c +++ b/net/6lowpan/nhc.c @@ -18,7 +18,7 @@ #include "nhc.h" static struct rb_root rb_root = RB_ROOT; -static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX]; +static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX + 1]; static DEFINE_SPINLOCK(lowpan_nhc_lock); static int lowpan_nhc_insert(struct lowpan_nhc *nhc) diff --git a/net/802/garp.c b/net/802/garp.c index b38ee6dcba45..5239b8f244e7 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -206,6 +206,19 @@ static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr kfree(attr); } +static void garp_attr_destroy_all(struct garp_applicant *app) +{ + struct rb_node *node, *next; + struct garp_attr *attr; + + for (node = rb_first(&app->gid); + next = node ? rb_next(node) : NULL, node != NULL; + node = next) { + attr = rb_entry(node, struct garp_attr, node); + garp_attr_destroy(app, attr); + } +} + static int garp_pdu_init(struct garp_applicant *app) { struct sk_buff *skb; @@ -612,6 +625,7 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl spin_lock_bh(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); + garp_attr_destroy_all(app); garp_pdu_queue(app); spin_unlock_bh(&app->lock); diff --git a/net/802/mrp.c b/net/802/mrp.c index 72db2785ef2c..4ee3af3d400b 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -295,6 +295,19 @@ static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) kfree(attr); } +static void mrp_attr_destroy_all(struct mrp_applicant *app) +{ + struct rb_node *node, *next; + struct mrp_attr *attr; + + for (node = rb_first(&app->mad); + next = node ? rb_next(node) : NULL, node != NULL; + node = next) { + attr = rb_entry(node, struct mrp_attr, node); + mrp_attr_destroy(app, attr); + } +} + static int mrp_pdu_init(struct mrp_applicant *app) { struct sk_buff *skb; @@ -900,6 +913,7 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); + mrp_attr_destroy_all(app); mrp_pdu_queue(app); spin_unlock_bh(&app->lock); diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index cc1557978066..ecdfeaafba9c 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -109,6 +109,7 @@ int vlan_check_real_dev(struct net_device *real_dev, void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); +void vlan_dev_uninit(struct net_device *dev); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 5139c4ebb96b..22f4e5292278 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -606,7 +606,8 @@ static int vlan_dev_init(struct net_device *dev) return 0; } -static void vlan_dev_uninit(struct net_device *dev) +/* Note: this function might be called multiple times for the same device. */ +void vlan_dev_uninit(struct net_device *dev) { struct vlan_priority_tci_mapping *pm; struct vlan_dev_priv *vlan = vlan_dev_priv(dev); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index c92b52f37d38..7c95a16c1cef 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -92,11 +92,13 @@ static int vlan_changelink(struct net_device *dev, struct ifla_vlan_flags *flags; struct ifla_vlan_qos_mapping *m; struct nlattr *attr; - int rem; + int rem, err; if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); - vlan_dev_change_flags(dev, flags->flags, flags->mask); + err = vlan_dev_change_flags(dev, flags->flags, flags->mask); + if (err) + return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { @@ -107,7 +109,9 @@ static int vlan_changelink(struct net_device *dev, if (data[IFLA_VLAN_EGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { m = nla_data(attr); - vlan_dev_set_egress_priority(dev, m->from, m->to); + err = vlan_dev_set_egress_priority(dev, m->from, m->to); + if (err) + return err; } } return 0; @@ -150,10 +154,11 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; err = vlan_changelink(dev, tb, data); - if (err < 0) - return err; - - return register_vlan_dev(dev); + if (!err) + err = register_vlan_dev(dev); + if (err) + vlan_dev_uninit(dev); + return err; } static inline size_t vlan_qos_map_size(unsigned int n) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 2f68ffda3715..6f8e84844bb2 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -793,20 +793,28 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd) return -ENOMEM; ts->rd = fget(rfd); + if (!ts->rd) + goto out_free_ts; + if (!(ts->rd->f_mode & FMODE_READ)) + goto out_put_rd; ts->wr = fget(wfd); - if (!ts->rd || !ts->wr) { - if (ts->rd) - fput(ts->rd); - if (ts->wr) - fput(ts->wr); - kfree(ts); - return -EIO; - } + if (!ts->wr) + goto out_put_rd; + if (!(ts->wr->f_mode & FMODE_WRITE)) + goto out_put_wr; client->trans = ts; client->status = Connected; return 0; + +out_put_wr: + fput(ts->wr); +out_put_rd: + fput(ts->rd); +out_free_ts: + kfree(ts); + return -EIO; } static int p9_socket_open(struct p9_client *client, struct socket *csocket) @@ -983,7 +991,7 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) csocket = NULL; - if (addr == NULL) + if (!addr || !strlen(addr)) return -EINVAL; if (strlen(addr) >= UNIX_PATH_MAX) { diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 5892bd1457d4..252a4c22898e 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -605,7 +605,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); if (!chan->vc_wq) { err = -ENOMEM; - goto out_free_tag; + goto out_remove_file; } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; @@ -623,6 +623,8 @@ static int p9_virtio_probe(struct virtio_device *vdev) return 0; +out_remove_file: + sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr); out_free_tag: kfree(tag); out_free_vq: diff --git a/net/Kconfig b/net/Kconfig index d9da78da8cd9..539a7a01383f 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -270,6 +270,7 @@ config XPS config CGROUP_NET_PRIO bool "Network priority cgroup" depends on CGROUPS + select SOCK_CGROUP_DATA ---help--- Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis. @@ -277,6 +278,7 @@ config CGROUP_NET_PRIO config CGROUP_NET_CLASSID bool "Network classid cgroup" depends on CGROUPS + select SOCK_CGROUP_DATA ---help--- Cgroup subsystem for use as general purpose socket classid marker that is being used in cls_cgroup and for netfilter matching. @@ -293,14 +295,17 @@ config BQL config BPF_JIT bool "enable BPF Just In Time compiler" - depends on HAVE_BPF_JIT + depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT depends on MODULES ---help--- Berkeley Packet Filter filtering capabilities are normally handled by an interpreter. This option allows kernel to generate a native code when filter is loaded in memory. This should speedup - packet sniffing (libpcap/tcpdump). Note : Admin should enable - this feature changing /proc/sys/net/core/bpf_jit_enable + packet sniffing (libpcap/tcpdump). + + Note, admin should enable this feature changing: + /proc/sys/net/core/bpf_jit_enable + /proc/sys/net/core/bpf_jit_harden (optional) config NET_FLOW_LIMIT bool @@ -417,9 +422,15 @@ config DST_CACHE endif # if NET -# Used by archs to tell that they support BPF_JIT -config HAVE_BPF_JIT +# Used by archs to tell that they support BPF JIT compiler plus which flavour. +# Only one of the two can be selected for a specific arch since eBPF JIT supersedes +# the cBPF JIT. + +# Classic BPF JIT (cBPF) +config HAVE_CBPF_JIT bool +# Extended BPF JIT (eBPF) config HAVE_EBPF_JIT bool + diff --git a/net/Makefile b/net/Makefile index e700aa62b1af..8b139678bb0b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ -obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_UNIX_SCM) += unix/ obj-$(CONFIG_NET) += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 8ad3ec2610b6..b9e85a4751a6 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -879,15 +879,24 @@ static struct notifier_block aarp_notifier = { static unsigned char aarp_snap_id[] = { 0x00, 0x00, 0x00, 0x80, 0xF3 }; -void __init aarp_proto_init(void) +int __init aarp_proto_init(void) { + int rc; + aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv); - if (!aarp_dl) + if (!aarp_dl) { printk(KERN_CRIT "Unable to register AARP with SNAP.\n"); + return -ENOMEM; + } setup_timer(&aarp_timer, aarp_expire_timeout, 0); aarp_timer.expires = jiffies + sysctl_aarp_expiry_time; add_timer(&aarp_timer); - register_netdevice_notifier(&aarp_notifier); + rc = register_netdevice_notifier(&aarp_notifier); + if (rc) { + del_timer_sync(&aarp_timer); + unregister_snap_client(aarp_dl); + } + return rc; } /* Remove the AARP entries associated with a device. */ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index e23bf739492c..1048cddcc9a3 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1575,8 +1575,8 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct sk_buff *skb; struct net_device *dev; struct ddpehdr *ddp; - int size; - struct atalk_route *rt; + int size, hard_header_len; + struct atalk_route *rt, *rt_lo = NULL; int err; if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) @@ -1639,7 +1639,22 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n", sk, size, dev->name); - size += dev->hard_header_len; + hard_header_len = dev->hard_header_len; + /* Leave room for loopback hardware header if necessary */ + if (usat->sat_addr.s_node == ATADDR_BCAST && + (dev->flags & IFF_LOOPBACK || !(rt->flags & RTF_GATEWAY))) { + struct atalk_addr at_lo; + + at_lo.s_node = 0; + at_lo.s_net = 0; + + rt_lo = atrtr_find(&at_lo); + + if (rt_lo && rt_lo->dev->hard_header_len > hard_header_len) + hard_header_len = rt_lo->dev->hard_header_len; + } + + size += hard_header_len; release_sock(sk); skb = sock_alloc_send_skb(sk, size, (flags & MSG_DONTWAIT), &err); lock_sock(sk); @@ -1647,7 +1662,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) goto out; skb_reserve(skb, ddp_dl->header_length); - skb_reserve(skb, dev->hard_header_len); + skb_reserve(skb, hard_header_len); skb->dev = dev; SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk); @@ -1698,18 +1713,12 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) /* loop back */ skb_orphan(skb); if (ddp->deh_dnode == ATADDR_BCAST) { - struct atalk_addr at_lo; - - at_lo.s_node = 0; - at_lo.s_net = 0; - - rt = atrtr_find(&at_lo); - if (!rt) { + if (!rt_lo) { kfree_skb(skb); err = -ENETUNREACH; goto out; } - dev = rt->dev; + dev = rt_lo->dev; skb->dev = dev; } ddp_dl->request(ddp_dl, skb, dev->dev_addr); @@ -1911,9 +1920,6 @@ static unsigned char ddp_snap_id[] = { 0x08, 0x00, 0x07, 0x80, 0x9B }; EXPORT_SYMBOL(atrtr_get_dev); EXPORT_SYMBOL(atalk_find_dev_addr); -static const char atalk_err_snap[] __initconst = - KERN_CRIT "Unable to register DDP with SNAP.\n"; - /* Called by proto.c on kernel start up */ static int __init atalk_init(void) { @@ -1928,17 +1934,23 @@ static int __init atalk_init(void) goto out_proto; ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv); - if (!ddp_dl) - printk(atalk_err_snap); + if (!ddp_dl) { + pr_crit("Unable to register DDP with SNAP.\n"); + rc = -ENOMEM; + goto out_sock; + } dev_add_pack(<alk_packet_type); dev_add_pack(&ppptalk_packet_type); rc = register_netdevice_notifier(&ddp_notifier); if (rc) - goto out_sock; + goto out_snap; + + rc = aarp_proto_init(); + if (rc) + goto out_dev; - aarp_proto_init(); rc = atalk_proc_init(); if (rc) goto out_aarp; @@ -1952,11 +1964,13 @@ out_proc: atalk_proc_exit(); out_aarp: aarp_cleanup_module(); +out_dev: unregister_netdevice_notifier(&ddp_notifier); -out_sock: +out_snap: dev_remove_pack(&ppptalk_packet_type); dev_remove_pack(<alk_packet_type); unregister_snap_client(ddp_dl); +out_sock: sock_unregister(PF_APPLETALK); out_proto: proto_unregister(&ddp_proto); diff --git a/net/atm/lec.c b/net/atm/lec.c index e4afac94ff15..a38680e19443 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1290,6 +1290,12 @@ static void lec_arp_clear_vccs(struct lec_arp_table *entry) entry->vcc = NULL; } if (entry->recv_vcc) { + struct atm_vcc *vcc = entry->recv_vcc; + struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); + + kfree(vpriv); + vcc->user_back = NULL; + entry->recv_vcc->push = entry->old_recv_push; vcc_release_async(entry->recv_vcc, -EPIPE); entry->recv_vcc = NULL; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index de55a3f001dc..f4c8567e91b3 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -88,8 +88,10 @@ static void ax25_kill_by_device(struct net_device *dev) again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { - s->ax25_dev = NULL; spin_unlock_bh(&ax25_list_lock); + lock_sock(s->sk); + s->ax25_dev = NULL; + release_sock(s->sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); @@ -639,8 +641,10 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; case SO_BINDTODEVICE: - if (optlen > IFNAMSIZ) - optlen = IFNAMSIZ; + if (optlen > IFNAMSIZ - 1) + optlen = IFNAMSIZ - 1; + + memset(devname, 0, sizeof(devname)); if (copy_from_user(devname, optval, optlen)) { res = -EFAULT; @@ -1189,7 +1193,10 @@ static int __must_check ax25_connect(struct socket *sock, if (addr_len > sizeof(struct sockaddr_ax25) && fsa->fsa_ax25.sax25_ndigis != 0) { /* Valid number of digipeaters ? */ - if (fsa->fsa_ax25.sax25_ndigis < 1 || fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS) { + if (fsa->fsa_ax25.sax25_ndigis < 1 || + fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS || + addr_len < sizeof(struct sockaddr_ax25) + + sizeof(ax25_address) * fsa->fsa_ax25.sax25_ndigis) { err = -EINVAL; goto out_release; } @@ -1508,7 +1515,10 @@ static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax; /* Valid number of digipeaters ? */ - if (usax->sax25_ndigis < 1 || usax->sax25_ndigis > AX25_MAX_DIGIS) { + if (usax->sax25_ndigis < 1 || + usax->sax25_ndigis > AX25_MAX_DIGIS || + addr_len < sizeof(struct sockaddr_ax25) + + sizeof(ax25_address) * usax->sax25_ndigis) { err = -EINVAL; goto out; } diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 912d9c36fb1c..6f8d2fe114f6 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -135,7 +135,7 @@ static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node) * Returns 0 on success, a negative error code otherwise. */ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, - int max_if_num) + unsigned int max_if_num) { void *data_ptr; size_t old_size; @@ -155,10 +155,8 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, orig_node->bat_iv.bcast_own = data_ptr; data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); - if (!data_ptr) { - kfree(orig_node->bat_iv.bcast_own); + if (!data_ptr) goto unlock; - } memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, (max_if_num - 1) * sizeof(u8)); @@ -183,9 +181,11 @@ unlock: * Returns 0 on success, a negative error code otherwise. */ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, - int max_if_num, int del_if_num) + unsigned int max_if_num, + unsigned int del_if_num) { - int chunk_size, ret = -ENOMEM, if_offset; + int ret = -ENOMEM; + size_t chunk_size, if_offset; void *data_ptr = NULL; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); @@ -203,8 +203,9 @@ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, memcpy(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size); /* copy second part */ + if_offset = (del_if_num + 1) * chunk_size; memcpy((char *)data_ptr + del_if_num * chunk_size, - orig_node->bat_iv.bcast_own + ((del_if_num + 1) * chunk_size), + (uint8_t *)orig_node->bat_iv.bcast_own + if_offset, (max_if_num - del_if_num) * chunk_size); free_bcast_own: @@ -252,7 +253,8 @@ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; - int size, hash_added; + int hash_added; + size_t size; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) @@ -314,14 +316,18 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) unsigned char *ogm_buff; u32 random_seqno; + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno); hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); - if (!ogm_buff) + if (!ogm_buff) { + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return -ENOMEM; + } hard_iface->bat_iv.ogm_buff = ogm_buff; @@ -333,36 +339,60 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); + return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + kfree(hard_iface->bat_iv.ogm_buff); hard_iface->bat_iv.ogm_buff = NULL; + + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; - unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff; + void *ogm_buff; - batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + + ogm_buff = hard_iface->bat_iv.ogm_buff; + if (!ogm_buff) + goto unlock; + + batadv_ogm_packet = ogm_buff; ether_addr_copy(batadv_ogm_packet->orig, hard_iface->net_dev->dev_addr); ether_addr_copy(batadv_ogm_packet->prev_sender, hard_iface->net_dev->dev_addr); + +unlock: + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; - unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff; + void *ogm_buff; - batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + + ogm_buff = hard_iface->bat_iv.ogm_buff; + if (!ogm_buff) + goto unlock; + + batadv_ogm_packet = ogm_buff; batadv_ogm_packet->flags = BATADV_PRIMARIES_FIRST_HOP; batadv_ogm_packet->ttl = BATADV_TTL; + +unlock: + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /* when do we schedule our own ogm to be sent */ @@ -395,14 +425,19 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) return new_tq; } -/* is there another aggregated packet here? */ -static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, - __be16 tvlv_len) +static bool +batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, + const struct batadv_ogm_packet *ogm_packet) { int next_buff_pos = 0; - next_buff_pos += buff_pos + BATADV_OGM_HLEN; - next_buff_pos += ntohs(tvlv_len); + /* check if there is enough space for the header */ + next_buff_pos += buff_pos + sizeof(*ogm_packet); + if (next_buff_pos > packet_len) + return false; + + /* check if there is enough space for the optional TVLV */ + next_buff_pos += ntohs(ogm_packet->tvlv_len); return (next_buff_pos <= packet_len) && (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); @@ -430,7 +465,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, /* adjust all flags and log packets */ while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len, - batadv_ogm_packet->tvlv_len)) { + batadv_ogm_packet)) { /* we might have aggregated direct link packets with an * ordinary base packet */ @@ -491,8 +526,10 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) if (WARN_ON(!forw_packet->if_outgoing)) goto out; - if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface)) + if (forw_packet->if_outgoing->soft_iface != soft_iface) { + pr_warn("%s: soft interface switch for queued OGM\n", __func__); goto out; + } if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) goto out; @@ -871,7 +908,7 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) u32 i; size_t word_index; u8 *w; - int if_num; + unsigned int if_num; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -892,7 +929,11 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) } } -static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) +/** + * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer + * @hard_iface: interface whose ogm buffer should be transmitted + */ +static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff; @@ -903,6 +944,12 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) u16 tvlv_len = 0; unsigned long send_time; + lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); + + /* interface already disabled by batadv_iv_ogm_iface_disable */ + if (!*ogm_buff) + return; + primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { @@ -954,6 +1001,17 @@ out: batadv_hardif_free_ref(primary_if); } +static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) +{ + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || + hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) + return; + + mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); + batadv_iv_ogm_schedule_buff(hard_iface); + mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); +} + /** * batadv_iv_ogm_orig_update - use OGM to update corresponding data in an * originator @@ -982,7 +1040,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_node_tmp; - int if_num; + unsigned int if_num; u8 sum_orig, sum_neigh; u8 *neigh_addr; u8 tq_avg; @@ -1140,9 +1198,10 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; - int tq_asym_penalty, inv_asym_penalty, if_num, ret = 0; + int if_num, ret = 0; + unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; - int tq_iface_penalty; + unsigned int tq_iface_penalty; /* find corresponding one hop neighbor */ rcu_read_lock(); @@ -1179,7 +1238,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); if_num = if_incoming->if_num; orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num]; neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); @@ -1189,7 +1248,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, } else { neigh_rq_count = 0; } - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) @@ -1646,9 +1705,9 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, if (is_my_orig) { unsigned long *word; - int offset; + size_t offset; s32 bit_pos; - s16 if_num; + unsigned int if_num; u8 *weight; orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, @@ -1748,7 +1807,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, /* unpack the aggregated packets and process them one by one */ while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb), - ogm_packet->tvlv_len)) { + ogm_packet)) { batadv_iv_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM_HLEN; diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index c5208136e3fc..355a18d373e6 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -73,11 +73,12 @@ static inline u32 batadv_choose_claim(const void *data, u32 size) /* return the index of the backbone gateway */ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) { - const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; + const struct batadv_bla_backbone_gw *gw; u32 hash = 0; - hash = jhash(&claim->addr, sizeof(claim->addr), hash); - hash = jhash(&claim->vid, sizeof(claim->vid), hash); + gw = (struct batadv_bla_backbone_gw *)data; + hash = jhash(&gw->orig, sizeof(gw->orig), hash); + hash = jhash(&gw->vid, sizeof(gw->vid), hash); return hash % size; } @@ -129,7 +130,19 @@ batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw) /* finally deinitialize the claim */ static void batadv_claim_release(struct batadv_bla_claim *claim) { - batadv_backbone_gw_free_ref(claim->backbone_gw); + struct batadv_bla_backbone_gw *old_backbone_gw; + + spin_lock_bh(&claim->backbone_lock); + old_backbone_gw = claim->backbone_gw; + claim->backbone_gw = NULL; + spin_unlock_bh(&claim->backbone_lock); + + spin_lock_bh(&old_backbone_gw->crc_lock); + old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&old_backbone_gw->crc_lock); + + batadv_backbone_gw_free_ref(old_backbone_gw); + kfree_rcu(claim, rcu); } @@ -256,7 +269,9 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) } /* all claims gone, initialize CRC */ + spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc = BATADV_BLA_CRC_INIT; + spin_unlock_bh(&backbone_gw->crc_lock); } /** @@ -352,9 +367,12 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, break; } - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb) + goto out; + } skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, soft_iface); @@ -363,7 +381,10 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, skb->len + ETH_HLEN); soft_iface->last_rx = jiffies; - netif_rx(skb); + if (in_interrupt()) + netif_rx(skb); + else + netif_rx_ni(skb); out: if (primary_if) batadv_hardif_free_ref(primary_if); @@ -404,6 +425,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, entry->lasttime = jiffies; entry->crc = BATADV_BLA_CRC_INIT; entry->bat_priv = bat_priv; + spin_lock_init(&entry->crc_lock); atomic_set(&entry->request_sent, 0); atomic_set(&entry->wait_periods, 0); ether_addr_copy(entry->orig, orig); @@ -553,7 +575,9 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, __be16 crc; memcpy(mac, batadv_announce_mac, 4); + spin_lock_bh(&backbone_gw->crc_lock); crc = htons(backbone_gw->crc); + spin_unlock_bh(&backbone_gw->crc_lock); memcpy(&mac[4], &crc, 2); batadv_bla_send_claim(bat_priv, mac, backbone_gw->vid, @@ -571,8 +595,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { + struct batadv_bla_backbone_gw *old_backbone_gw; struct batadv_bla_claim *claim; struct batadv_bla_claim search_claim; + bool remove_crc = false; int hash_added; ether_addr_copy(search_claim.addr, mac); @@ -586,8 +612,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, return; ether_addr_copy(claim->addr, mac); + spin_lock_init(&claim->backbone_lock); claim->vid = vid; claim->lasttime = jiffies; + atomic_inc(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; atomic_set(&claim->refcount, 2); @@ -614,20 +642,55 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, "bla_add_claim(): changing ownership for %pM, vid %d\n", mac, BATADV_PRINT_VID(vid)); - claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); - batadv_backbone_gw_free_ref(claim->backbone_gw); + remove_crc = true; } - /* set (new) backbone gw */ + + /* replace backbone_gw atomically and adjust reference counters */ + spin_lock_bh(&claim->backbone_lock); + old_backbone_gw = claim->backbone_gw; atomic_inc(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; + spin_unlock_bh(&claim->backbone_lock); + + if (remove_crc) { + /* remove claim address from old backbone_gw */ + spin_lock_bh(&old_backbone_gw->crc_lock); + old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&old_backbone_gw->crc_lock); + } + + batadv_backbone_gw_free_ref(old_backbone_gw); + /* add claim address to new backbone_gw */ + spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&backbone_gw->crc_lock); backbone_gw->lasttime = jiffies; claim_free_ref: batadv_claim_free_ref(claim); } +/** + * batadv_bla_claim_get_backbone_gw - Get valid reference for backbone_gw of + * claim + * @claim: claim whose backbone_gw should be returned + * + * Return: valid reference to claim::backbone_gw + */ +static struct batadv_bla_backbone_gw * +batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim) +{ + struct batadv_bla_backbone_gw *backbone_gw; + + spin_lock_bh(&claim->backbone_lock); + backbone_gw = claim->backbone_gw; + atomic_inc(&backbone_gw->refcount); + spin_unlock_bh(&claim->backbone_lock); + + return backbone_gw; +} + /* Delete a claim from the claim hash which has the * given mac address and vid. */ @@ -635,6 +698,8 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid) { struct batadv_bla_claim search_claim, *claim; + struct batadv_bla_claim *claim_removed_entry; + struct hlist_node *claim_removed_node; ether_addr_copy(search_claim.addr, mac); search_claim.vid = vid; @@ -645,12 +710,18 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_del_claim(): %pM, vid %d\n", mac, BATADV_PRINT_VID(vid)); - batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim, - batadv_choose_claim, claim); - batadv_claim_free_ref(claim); /* reference from the hash is gone */ + claim_removed_node = batadv_hash_remove(bat_priv->bla.claim_hash, + batadv_compare_claim, + batadv_choose_claim, claim); + if (!claim_removed_node) + goto free_claim; - claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + /* reference from the hash is gone */ + claim_removed_entry = hlist_entry(claim_removed_node, + struct batadv_bla_claim, hash_entry); + batadv_claim_free_ref(claim_removed_entry); +free_claim: /* don't need the reference from hash_find() anymore */ batadv_claim_free_ref(claim); } @@ -660,7 +731,7 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, u8 *backbone_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; - u16 crc; + u16 backbone_crc, crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) return 0; @@ -679,12 +750,16 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, "handle_announce(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", BATADV_PRINT_VID(vid), backbone_gw->orig, crc); - if (backbone_gw->crc != crc) { + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); + + if (backbone_crc != crc) { batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "handle_announce(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n", backbone_gw->orig, BATADV_PRINT_VID(backbone_gw->vid), - backbone_gw->crc, crc); + backbone_crc, crc); batadv_bla_send_request(backbone_gw); } else { @@ -1056,6 +1131,7 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, int now) { + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct hlist_head *head; struct batadv_hashtable *hash; @@ -1070,14 +1146,17 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; - if (!batadv_compare_eth(claim->backbone_gw->orig, + + if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) - continue; + goto skip; + if (!batadv_has_timed_out(claim->lasttime, BATADV_BLA_CLAIM_TIMEOUT)) - continue; + goto skip; batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_purge_claims(): %pM, vid %d, time out\n", @@ -1085,8 +1164,10 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, purge_now: batadv_handle_unclaim(bat_priv, primary_if, - claim->backbone_gw->orig, + backbone_gw->orig, claim->addr, claim->vid); +skip: + batadv_backbone_gw_free_ref(backbone_gw); } rcu_read_unlock(); } @@ -1265,10 +1346,14 @@ int batadv_bla_init(struct batadv_priv *bat_priv) return 0; bat_priv->bla.claim_hash = batadv_hash_new(128); - bat_priv->bla.backbone_hash = batadv_hash_new(32); + if (!bat_priv->bla.claim_hash) + return -ENOMEM; - if (!bat_priv->bla.claim_hash || !bat_priv->bla.backbone_hash) + bat_priv->bla.backbone_hash = batadv_hash_new(32); + if (!bat_priv->bla.backbone_hash) { + batadv_hash_destroy(bat_priv->bla.claim_hash); return -ENOMEM; + } batadv_hash_set_lock_class(bat_priv->bla.claim_hash, &batadv_claim_hash_lock_class_key); @@ -1285,31 +1370,32 @@ int batadv_bla_init(struct batadv_priv *bat_priv) } /** - * batadv_bla_check_bcast_duplist + * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the soft interface information - * @skb: contains the bcast_packet to be checked + * @skb: contains the multicast packet to be checked + * @payload_ptr: pointer to position inside the head buffer of the skb + * marking the start of the data to be CRC'ed + * @orig: originator mac address, NULL if unknown * - * check if it is on our broadcast list. Another gateway might - * have sent the same packet because it is connected to the same backbone, - * so we have to remove this duplicate. + * Check if it is on our broadcast list. Another gateway might have sent the + * same packet because it is connected to the same backbone, so we have to + * remove this duplicate. * * This is performed by checking the CRC, which will tell us * with a good chance that it is the same packet. If it is furthermore * sent by another host, drop it. We allow equal packets from * the same host however as this might be intended. */ -int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, - struct sk_buff *skb) +static int batadv_bla_check_duplist(struct batadv_priv *bat_priv, + struct sk_buff *skb, u8 *payload_ptr, + const u8 *orig) { int i, curr, ret = 0; __be32 crc; - struct batadv_bcast_packet *bcast_packet; struct batadv_bcast_duplist_entry *entry; - bcast_packet = (struct batadv_bcast_packet *)skb->data; - /* calculate the crc ... */ - crc = batadv_skb_crc32(skb, (u8 *)(bcast_packet + 1)); + crc = batadv_skb_crc32(skb, payload_ptr); spin_lock_bh(&bat_priv->bla.bcast_duplist_lock); @@ -1328,8 +1414,21 @@ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, if (entry->crc != crc) continue; - if (batadv_compare_eth(entry->orig, bcast_packet->orig)) - continue; + /* are the originators both known and not anonymous? */ + if (orig && !is_zero_ether_addr(orig) && + !is_zero_ether_addr(entry->orig)) { + /* If known, check if the new frame came from + * the same originator: + * We are safe to take identical frames from the + * same orig, if known, as multiplications in + * the mesh are detected via the (orig, seqno) pair. + * So we can be a bit more liberal here and allow + * identical frames from the same orig which the source + * host might have sent multiple times on purpose. + */ + if (batadv_compare_eth(entry->orig, orig)) + continue; + } /* this entry seems to match: same crc, not too old, * and from another gw. therefore return 1 to forbid it. @@ -1345,7 +1444,14 @@ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, entry = &bat_priv->bla.bcast_duplist[curr]; entry->crc = crc; entry->entrytime = jiffies; - ether_addr_copy(entry->orig, bcast_packet->orig); + + /* known originator */ + if (orig) + ether_addr_copy(entry->orig, orig); + /* anonymous originator */ + else + eth_zero_addr(entry->orig); + bat_priv->bla.bcast_duplist_curr = curr; out: @@ -1355,6 +1461,48 @@ out: } /** + * batadv_bla_check_ucast_duplist() - Check if a frame is in the broadcast dup. + * @bat_priv: the bat priv with all the soft interface information + * @skb: contains the multicast packet to be checked, decapsulated from a + * unicast_packet + * + * Check if it is on our broadcast list. Another gateway might have sent the + * same packet because it is connected to the same backbone, so we have to + * remove this duplicate. + * + * Return: true if a packet is in the duplicate list, false otherwise. + */ +static bool batadv_bla_check_ucast_duplist(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + return batadv_bla_check_duplist(bat_priv, skb, (u8 *)skb->data, NULL); +} + +/** + * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup. + * @bat_priv: the bat priv with all the soft interface information + * @skb: contains the bcast_packet to be checked + * + * Check if it is on our broadcast list. Another gateway might have sent the + * same packet because it is connected to the same backbone, so we have to + * remove this duplicate. + * + * Return: true if a packet is in the duplicate list, false otherwise. + */ +int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + struct batadv_bcast_packet *bcast_packet; + u8 *payload_ptr; + + bcast_packet = (struct batadv_bcast_packet *)skb->data; + payload_ptr = (u8 *)(bcast_packet + 1); + + return batadv_bla_check_duplist(bat_priv, skb, payload_ptr, + bcast_packet->orig); +} + +/** * batadv_bla_is_backbone_gw_orig * @bat_priv: the bat priv with all the soft interface information * @orig: originator mac address @@ -1457,7 +1605,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame - * @is_bcast: the packet came in a broadcast packet type. + * @packet_type: the batman packet type this frame came in * * bla_rx avoidance checks if: * * we have to race for a claim @@ -1468,11 +1616,13 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * process the skb. */ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid, bool is_bcast) + unsigned short vid, int packet_type) { + struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; + bool own_claim; int ret; ethhdr = eth_hdr(skb); @@ -1485,9 +1635,32 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, goto allow; if (unlikely(atomic_read(&bat_priv->bla.num_requests))) - /* don't allow broadcasts while requests are in flight */ - if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast) - goto handled; + /* don't allow multicast packets while requests are in flight */ + if (is_multicast_ether_addr(ethhdr->h_dest)) + /* Both broadcast flooding or multicast-via-unicasts + * delivery might send to multiple backbone gateways + * sharing the same LAN and therefore need to coordinate + * which backbone gateway forwards into the LAN, + * by claiming the payload source address. + * + * Broadcast flooding and multicast-via-unicasts + * delivery use the following two batman packet types. + * Note: explicitly exclude BATADV_UNICAST_4ADDR, + * as the DHCP gateway feature will send explicitly + * to only one BLA gateway, so the claiming process + * should be avoided there. + */ + if (packet_type == BATADV_BCAST || + packet_type == BATADV_UNICAST) + goto handled; + + /* potential duplicates from foreign BLA backbone gateways via + * multicast-in-unicast packets + */ + if (is_multicast_ether_addr(ethhdr->h_dest) && + packet_type == BATADV_UNICAST && + batadv_bla_check_ucast_duplist(bat_priv, skb)) + goto handled; ether_addr_copy(search_claim.addr, ethhdr->h_source); search_claim.vid = vid; @@ -1504,20 +1677,25 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, } /* if it is our own claim ... */ - if (batadv_compare_eth(claim->backbone_gw->orig, - primary_if->net_dev->dev_addr)) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + own_claim = batadv_compare_eth(backbone_gw->orig, + primary_if->net_dev->dev_addr); + batadv_backbone_gw_free_ref(backbone_gw); + + if (own_claim) { /* ... allow it in any case */ claim->lasttime = jiffies; goto allow; } - /* if it is a broadcast ... */ - if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast) { + /* if it is a multicast ... */ + if (is_multicast_ether_addr(ethhdr->h_dest) && + (packet_type == BATADV_BCAST || packet_type == BATADV_UNICAST)) { /* ... drop it. the responsible gateway is in charge. * - * We need to check is_bcast because with the gateway + * We need to check packet type because with the gateway * feature, broadcasts (like DHCP requests) may be sent - * using a unicast packet type. + * using a unicast 4 address packet type. See comment above. */ goto handled; } else { @@ -1568,7 +1746,9 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hard_iface *primary_if; + bool client_roamed; int ret = 0; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -1598,8 +1778,12 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, goto allow; /* check if we are responsible. */ - if (batadv_compare_eth(claim->backbone_gw->orig, - primary_if->net_dev->dev_addr)) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + client_roamed = batadv_compare_eth(backbone_gw->orig, + primary_if->net_dev->dev_addr); + batadv_backbone_gw_free_ref(backbone_gw); + + if (client_roamed) { /* if yes, the client has roamed and we have * to unclaim it. */ @@ -1652,9 +1836,11 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) struct net_device *net_dev = (struct net_device *)seq->private; struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hashtable *hash = bat_priv->bla.claim_hash; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct batadv_hard_iface *primary_if; struct hlist_head *head; + u16 backbone_crc; u32 i; bool is_own; u8 *primary_addr; @@ -1675,13 +1861,21 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { - is_own = batadv_compare_eth(claim->backbone_gw->orig, + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); + + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n", claim->addr, BATADV_PRINT_VID(claim->vid), - claim->backbone_gw->orig, + backbone_gw->orig, (is_own ? 'x' : ' '), - claim->backbone_gw->crc); + backbone_crc); + + batadv_backbone_gw_free_ref(backbone_gw); } rcu_read_unlock(); } @@ -1700,6 +1894,7 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) struct batadv_hard_iface *primary_if; struct hlist_head *head; int secs, msecs; + u16 backbone_crc; u32 i; bool is_own; u8 *primary_addr; @@ -1730,10 +1925,14 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) if (is_own) continue; + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); + seq_printf(seq, " * %pM on %5d %4i.%03is (%#.4x)\n", backbone_gw->orig, BATADV_PRINT_VID(backbone_gw->vid), secs, - msecs, backbone_gw->crc); + msecs, backbone_crc); } rcu_read_unlock(); } diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 025152b34282..d1553c46df8c 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -27,7 +27,7 @@ struct sk_buff; #ifdef CONFIG_BATMAN_ADV_BLA int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid, bool is_bcast); + unsigned short vid, int packet_type); int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid); int batadv_bla_is_backbone_gw(struct sk_buff *skb, @@ -50,7 +50,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv); static inline int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, - bool is_bcast) + int packet_type) { return 0; } diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index c4c1e8030ba0..b905763dc2e7 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -19,6 +19,7 @@ #include "main.h" #include <linux/compiler.h> +#include <linux/dcache.h> #include <linux/debugfs.h> #include <linux/device.h> #include <linux/errno.h> @@ -213,6 +214,7 @@ static const struct file_operations batadv_log_fops = { .read = batadv_log_read, .poll = batadv_log_poll, .llseek = no_llseek, + .owner = THIS_MODULE, }; static int batadv_debug_log_setup(struct batadv_priv *bat_priv) @@ -507,6 +509,25 @@ out: } /** + * batadv_debugfs_rename_hardif() - Fix debugfs path for renamed hardif + * @hard_iface: hard interface which was renamed + */ +void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) +{ + const char *name = hard_iface->net_dev->name; + struct dentry *dir; + struct dentry *d; + + dir = hard_iface->debug_dir; + if (!dir) + return; + + d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); + if (!d) + pr_err("Can't rename debugfs dir to %s\n", name); +} + +/** * batadv_debugfs_del_hardif - delete the base directory for a hard interface * in debugfs. * @hard_iface: hard interface which is deleted. @@ -561,6 +582,26 @@ out: return -ENOMEM; } +/** + * batadv_debugfs_rename_meshif() - Fix debugfs path for renamed softif + * @dev: net_device which was renamed + */ +void batadv_debugfs_rename_meshif(struct net_device *dev) +{ + struct batadv_priv *bat_priv = netdev_priv(dev); + const char *name = dev->name; + struct dentry *dir; + struct dentry *d; + + dir = bat_priv->debug_dir; + if (!dir) + return; + + d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); + if (!d) + pr_err("Can't rename debugfs dir to %s\n", name); +} + void batadv_debugfs_del_meshif(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 80ab8d6f0ab3..347f793a18b2 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -31,8 +31,10 @@ struct net_device; void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); +void batadv_debugfs_rename_meshif(struct net_device *dev); void batadv_debugfs_del_meshif(struct net_device *dev); int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); +void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); #else @@ -50,6 +52,10 @@ static inline int batadv_debugfs_add_meshif(struct net_device *dev) return 0; } +static inline void batadv_debugfs_rename_meshif(struct net_device *dev) +{ +} + static inline void batadv_debugfs_del_meshif(struct net_device *dev) { } @@ -61,6 +67,11 @@ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) } static inline +void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) +{ +} + +static inline void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) { } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index c2dff7c6e960..769683da8d9d 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -226,6 +226,7 @@ static u32 batadv_hash_dat(const void *data, u32 size) u32 hash = 0; const struct batadv_dat_entry *dat = data; const unsigned char *key; + __be16 vid; u32 i; key = (const unsigned char *)&dat->ip; @@ -235,7 +236,8 @@ static u32 batadv_hash_dat(const void *data, u32 size) hash ^= (hash >> 6); } - key = (const unsigned char *)&dat->vid; + vid = htons(dat->vid); + key = (__force const unsigned char *)&vid; for (i = 0; i < sizeof(dat->vid); i++) { hash += key[i]; hash += (hash << 10); @@ -991,15 +993,19 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, if (!skb_new) goto out; - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb_new) + goto out; + } skb_reset_mac_header(skb_new); skb_new->protocol = eth_type_trans(skb_new, bat_priv->soft_iface); - bat_priv->stats.rx_packets++; - bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size; + batadv_inc_counter(bat_priv, BATADV_CNT_RX); + batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, + skb->len + ETH_HLEN + hdr_size); bat_priv->soft_iface->last_rx = jiffies; netif_rx(skb_new); @@ -1071,9 +1077,12 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, */ skb_reset_mac_header(skb_new); - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb_new) + goto out; + } /* To preserve backwards compatibility, the node has choose the outgoing * format based on the incoming request packet type. The assumption is diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index d50c3b003dc9..371f50804fc2 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -233,8 +233,10 @@ err_unlock: spin_unlock_bh(&chain->lock); err: - if (!ret) + if (!ret) { kfree(frag_entry_new); + kfree_skb(skb); + } return ret; } @@ -329,9 +331,9 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb, goto out_err; out: - *skb = skb_out; ret = true; out_err: + *skb = skb_out; return ret; } @@ -392,9 +394,10 @@ out: /** * batadv_frag_create - create a fragment from skb + * @net_dev: outgoing device for fragment * @skb: skb to create fragment from * @frag_head: header to use in new fragment - * @mtu: size of new fragment + * @fragment_size: size of new fragment * * Split the passed skb into two fragments: A new one with size matching the * passed mtu and the old one with the rest. The new skb contains data from the @@ -402,22 +405,25 @@ out: * * Returns the new fragment, NULL on error. */ -static struct sk_buff *batadv_frag_create(struct sk_buff *skb, +static struct sk_buff *batadv_frag_create(struct net_device *net_dev, + struct sk_buff *skb, struct batadv_frag_packet *frag_head, - unsigned int mtu) + unsigned int fragment_size) { + unsigned int ll_reserved = LL_RESERVED_SPACE(net_dev); + unsigned int tailroom = net_dev->needed_tailroom; struct sk_buff *skb_fragment; unsigned header_size = sizeof(*frag_head); - unsigned fragment_size = mtu - header_size; + unsigned mtu = fragment_size + header_size; - skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN); + skb_fragment = dev_alloc_skb(ll_reserved + mtu + tailroom); if (!skb_fragment) goto err; skb->priority = TC_PRIO_CONTROL; /* Eat the last mtu-bytes of the skb */ - skb_reserve(skb_fragment, header_size + ETH_HLEN); + skb_reserve(skb_fragment, ll_reserved + header_size); skb_split(skb, skb_fragment, skb->len - fragment_size); /* Add the header */ @@ -440,13 +446,14 @@ bool batadv_frag_send_packet(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node) { + struct net_device *net_dev = neigh_node->if_incoming->net_dev; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_frag_packet frag_header; struct sk_buff *skb_fragment; - unsigned mtu = neigh_node->if_incoming->net_dev->mtu; + unsigned mtu = net_dev->mtu; unsigned header_size = sizeof(frag_header); - unsigned max_fragment_size, max_packet_size; + unsigned max_fragment_size, num_fragments; bool ret = false; /* To avoid merge and refragmentation at next-hops we never send @@ -454,10 +461,15 @@ bool batadv_frag_send_packet(struct sk_buff *skb, */ mtu = min_t(unsigned, mtu, BATADV_FRAG_MAX_FRAG_SIZE); max_fragment_size = mtu - header_size; - max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS; + + if (skb->len == 0 || max_fragment_size == 0) + goto out_err; + + num_fragments = (skb->len - 1) / max_fragment_size + 1; + max_fragment_size = (skb->len - 1) / num_fragments + 1; /* Don't even try to fragment, if we need more than 16 fragments */ - if (skb->len > max_packet_size) + if (num_fragments > BATADV_FRAG_MAX_FRAGMENTS) goto out_err; bat_priv = orig_node->bat_priv; @@ -478,7 +490,12 @@ bool batadv_frag_send_packet(struct sk_buff *skb, /* Eat and send fragments from the tail of skb */ while (skb->len > max_fragment_size) { - skb_fragment = batadv_frag_create(skb, &frag_header, mtu); + /* The initial check in this function should cover this case */ + if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) + goto out_err; + + skb_fragment = batadv_frag_create(net_dev, skb, &frag_header, + max_fragment_size); if (!skb_fragment) goto out_err; @@ -488,17 +505,15 @@ bool batadv_frag_send_packet(struct sk_buff *skb, batadv_send_skb_packet(skb_fragment, neigh_node->if_incoming, neigh_node->addr); frag_header.no++; - - /* The initial check in this function should cover this case */ - if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) - goto out_err; } - /* Make room for the fragment header. */ - if (batadv_skb_head_push(skb, header_size) < 0 || - pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) + /* make sure that there is at least enough head for the fragmentation + * and ethernet headers + */ + if (skb_cow_head(skb, ETH_HLEN + header_size) < 0) goto out_err; + skb_push(skb, header_size); memcpy(skb->data, &frag_header, header_size); /* Send the last fragment */ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 6abfba1e227f..5fdb88f72b68 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -29,6 +29,7 @@ #include <linux/ipv6.h> #include <linux/kernel.h> #include <linux/list.h> +#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -413,6 +414,9 @@ out: * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information + * + * Has to be called with the appropriate locks being acquired + * (gw.list_lock). */ static void batadv_gw_node_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -420,6 +424,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, { struct batadv_gw_node *gw_node; + lockdep_assert_held(&bat_priv->gw.list_lock); + if (gateway->bandwidth_down == 0) return; @@ -438,9 +444,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); atomic_set(&gw_node->refcount, 1); - spin_lock_bh(&bat_priv->gw.list_lock); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list); - spin_unlock_bh(&bat_priv->gw.list_lock); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", @@ -493,11 +497,14 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, { struct batadv_gw_node *gw_node, *curr_gw = NULL; + spin_lock_bh(&bat_priv->gw.list_lock); gw_node = batadv_gw_node_get(bat_priv, orig_node); if (!gw_node) { batadv_gw_node_add(bat_priv, orig_node, gateway); + spin_unlock_bh(&bat_priv->gw.list_lock); goto out; } + spin_unlock_bh(&bat_priv->gw.list_lock); if ((gw_node->bandwidth_down == ntohl(gateway->bandwidth_down)) && (gw_node->bandwidth_up == ntohl(gateway->bandwidth_up))) @@ -527,11 +534,12 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, * gets dereferenced. */ spin_lock_bh(&bat_priv->gw.list_lock); - hlist_del_init_rcu(&gw_node->list); + if (!hlist_unhashed(&gw_node->list)) { + hlist_del_init_rcu(&gw_node->list); + batadv_gw_node_free_ref(gw_node); + } spin_unlock_bh(&bat_priv->gw.list_lock); - batadv_gw_node_free_ref(gw_node); - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); @@ -749,8 +757,10 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET; /* store the client address if the message is going to a client */ - if (ret == BATADV_DHCP_TO_CLIENT && - pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) { + if (ret == BATADV_DHCP_TO_CLIENT) { + if (!pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) + return BATADV_DHCP_NO; + /* check if the DHCP packet carries an Ethernet DHCP */ p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET; if (*p != BATADV_DHCP_HTYPE_ETHERNET) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3c8d8142e8c6..0bd7c9e6c9a0 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -26,6 +26,7 @@ #include <linux/if.h> #include <linux/kernel.h> #include <linux/list.h> +#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> @@ -45,13 +46,16 @@ #include "sysfs.h" #include "translation-table.h" -void batadv_hardif_free_rcu(struct rcu_head *rcu) +/** + * batadv_hardif_release - release hard interface from lists and queue for + * free after rcu grace period + * @hard_iface: the hard interface to free + */ +void batadv_hardif_release(struct batadv_hard_iface *hard_iface) { - struct batadv_hard_iface *hard_iface; - - hard_iface = container_of(rcu, struct batadv_hard_iface, rcu); dev_put(hard_iface->net_dev); - kfree(hard_iface); + + kfree_rcu(hard_iface, rcu); } struct batadv_hard_iface * @@ -74,6 +78,28 @@ out: } /** + * batadv_mutual_parents - check if two devices are each others parent + * @dev1: 1st net_device + * @dev2: 2nd net_device + * + * veth devices come in pairs and each is the parent of the other! + * + * Return: true if the devices are each others parent, otherwise false + */ +static bool batadv_mutual_parents(const struct net_device *dev1, + const struct net_device *dev2) +{ + int dev1_parent_iflink = dev_get_iflink(dev1); + int dev2_parent_iflink = dev_get_iflink(dev2); + + if (!dev1_parent_iflink || !dev2_parent_iflink) + return false; + + return (dev1_parent_iflink == dev2->ifindex) && + (dev2_parent_iflink == dev1->ifindex); +} + +/** * batadv_is_on_batman_iface - check if a device is a batman iface descendant * @net_dev: the device to check * @@ -108,6 +134,9 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return false; } + if (batadv_mutual_parents(net_dev, parent_dev)) + return false; + ret = batadv_is_on_batman_iface(parent_dev); return ret; @@ -287,6 +316,9 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); needed_headroom += batadv_max_header_len(); + /* fragmentation headers don't strip the unicast/... header */ + needed_headroom += sizeof(struct batadv_frag_packet); + soft_iface->needed_headroom = needed_headroom; soft_iface->needed_tailroom = lower_tailroom; } @@ -465,6 +497,11 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); + if (bat_priv->num_ifaces >= UINT_MAX) { + ret = -ENOSPC; + goto err_dev; + } + ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface); if (ret) goto err_dev; @@ -537,8 +574,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if = NULL; - if (hard_iface->if_status == BATADV_IF_ACTIVE) - batadv_hardif_deactivate_interface(hard_iface); + batadv_hardif_deactivate_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; @@ -573,7 +609,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); /* nobody uses this interface anymore */ - if (!bat_priv->num_ifaces) { + if (bat_priv->num_ifaces == 0) { batadv_gw_check_client_stop(bat_priv); if (autodel == BATADV_IF_CLEANUP_AUTO) @@ -629,7 +665,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (ret) goto free_if; - hard_iface->if_num = -1; + hard_iface->if_num = 0; hard_iface->net_dev = net_dev; hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; @@ -639,6 +675,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) goto free_sysfs; INIT_LIST_HEAD(&hard_iface->list); + mutex_init(&hard_iface->bat_iv.ogm_buff_mutex); INIT_WORK(&hard_iface->cleanup_work, batadv_hardif_remove_interface_finish); @@ -693,6 +730,32 @@ void batadv_hardif_remove_interfaces(void) rtnl_unlock(); } +/** + * batadv_hard_if_event_softif() - Handle events for soft interfaces + * @event: NETDEV_* event to handle + * @net_dev: net_device which generated an event + * + * Return: NOTIFY_* result + */ +static int batadv_hard_if_event_softif(unsigned long event, + struct net_device *net_dev) +{ + struct batadv_priv *bat_priv; + + switch (event) { + case NETDEV_REGISTER: + batadv_sysfs_add_meshif(net_dev); + bat_priv = netdev_priv(net_dev); + batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); + break; + case NETDEV_CHANGENAME: + batadv_debugfs_rename_meshif(net_dev); + break; + } + + return NOTIFY_DONE; +} + static int batadv_hard_if_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -701,12 +764,8 @@ static int batadv_hard_if_event(struct notifier_block *this, struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; - if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) { - batadv_sysfs_add_meshif(net_dev); - bat_priv = netdev_priv(net_dev); - batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); - return NOTIFY_DONE; - } + if (batadv_softif_is_valid(net_dev)) + return batadv_hard_if_event_softif(event, net_dev); hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface && event == NETDEV_REGISTER) @@ -748,6 +807,9 @@ static int batadv_hard_if_event(struct notifier_block *this, if (hard_iface == primary_if) batadv_primary_if_update_addr(bat_priv, NULL); break; + case NETDEV_CHANGENAME: + batadv_debugfs_rename_hardif(hard_iface); + break; default: break; } diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 7b12ea8ea29d..4d74c0415911 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -61,18 +61,18 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, void batadv_hardif_remove_interfaces(void); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); -void batadv_hardif_free_rcu(struct rcu_head *rcu); +void batadv_hardif_release(struct batadv_hard_iface *hard_iface); /** * batadv_hardif_free_ref - decrement the hard interface refcounter and - * possibly free it + * possibly release it * @hard_iface: the hard interface to free */ static inline void batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) { if (atomic_dec_and_test(&hard_iface->refcount)) - call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu); + batadv_hardif_release(hard_iface); } static inline struct batadv_hard_iface * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d7f17c1aa4a4..8ba7b86579d4 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -159,24 +159,34 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); ret = batadv_originator_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_orig; + } ret = batadv_tt_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_tt; + } ret = batadv_bla_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_bla; + } ret = batadv_dat_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_dat; + } ret = batadv_nc_mesh_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_nc; + } batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); @@ -186,8 +196,18 @@ int batadv_mesh_init(struct net_device *soft_iface) return 0; -err: - batadv_mesh_free(soft_iface); +err_nc: + batadv_dat_free(bat_priv); +err_dat: + batadv_bla_free(bat_priv); +err_bla: + batadv_tt_free(bat_priv); +err_tt: + batadv_originator_free(bat_priv); +err_orig: + batadv_purge_outstanding_packets(bat_priv, NULL); + atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); + return ret; } @@ -747,7 +767,7 @@ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, struct batadv_tvlv_container *tvlv) { - lockdep_assert_held(&bat_priv->tvlv.handler_list_lock); + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); if (!tvlv) return; @@ -1079,15 +1099,20 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, { struct batadv_tvlv_handler *tvlv_handler; + spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_free_ref(tvlv_handler); return; } tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); - if (!tvlv_handler) + if (!tvlv_handler) { + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); return; + } tvlv_handler->ogm_handler = optr; tvlv_handler->unicast_handler = uptr; @@ -1097,7 +1122,6 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, atomic_set(&tvlv_handler->refcount, 1); INIT_HLIST_NODE(&tvlv_handler->list); - spin_lock_bh(&bat_priv->tvlv.handler_list_lock); hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); } diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 8aa2d65df86f..44965f71ad73 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -44,7 +44,9 @@ #include <net/addrconf.h> #include <net/ipv6.h> +#include "bridge_loop_avoidance.h" #include "packet.h" +#include "send.h" #include "translation-table.h" /** @@ -806,6 +808,35 @@ void batadv_mcast_free(struct batadv_priv *bat_priv) } /** + * batadv_mcast_forw_send_orig() - send a multicast packet to an originator + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to send + * @vid: the vlan identifier + * @orig_node: the originator to send the packet to + * + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + */ +int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned short vid, + struct batadv_orig_node *orig_node) +{ + /* Avoid sending multicast-in-unicast packets to other BLA + * gateways - they already got the frame from the LAN side + * we share with them. + * TODO: Refactor to take BLA into account earlier, to avoid + * reducing the mcast_fanout count. + */ + if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) { + dev_kfree_skb(skb); + return NET_XMIT_SUCCESS; + } + + return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, + orig_node, vid); +} + +/** * batadv_mcast_purge_orig - reset originator global mcast state modifications * @orig: the originator which is going to get purged */ diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 8f3cb04b9f13..dd83ef07e2f2 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -44,6 +44,11 @@ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node **mcast_single_orig); +int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned short vid, + struct batadv_orig_node *orig_node); + void batadv_mcast_init(struct batadv_priv *bat_priv); void batadv_mcast_free(struct batadv_priv *bat_priv); @@ -68,6 +73,16 @@ static inline int batadv_mcast_init(struct batadv_priv *bat_priv) return 0; } +static inline int +batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned short vid, + struct batadv_orig_node *orig_node) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} + static inline void batadv_mcast_free(struct batadv_priv *bat_priv) { } diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index d0956f726547..9317d872b9c0 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -159,8 +159,10 @@ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) &batadv_nc_coding_hash_lock_class_key); bat_priv->nc.decoding_hash = batadv_hash_new(128); - if (!bat_priv->nc.decoding_hash) + if (!bat_priv->nc.decoding_hash) { + batadv_hash_destroy(bat_priv->nc.coding_hash); goto err; + } batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, &batadv_nc_decoding_hash_lock_class_key); @@ -828,19 +830,29 @@ static struct batadv_nc_node spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ struct list_head *list; + /* Select ingoing or outgoing coding node */ + if (in_coding) { + lock = &orig_neigh_node->in_coding_list_lock; + list = &orig_neigh_node->in_coding_list; + } else { + lock = &orig_neigh_node->out_coding_list_lock; + list = &orig_neigh_node->out_coding_list; + } + + spin_lock_bh(lock); + /* Check if nc_node is already added */ nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); /* Node found */ if (nc_node) - return nc_node; + goto unlock; nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); if (!nc_node) - return NULL; + goto unlock; - if (!atomic_inc_not_zero(&orig_neigh_node->refcount)) - goto free; + atomic_inc(&orig_neigh_node->refcount); /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); @@ -848,28 +860,15 @@ static struct batadv_nc_node nc_node->orig_node = orig_neigh_node; atomic_set(&nc_node->refcount, 2); - /* Select ingoing or outgoing coding node */ - if (in_coding) { - lock = &orig_neigh_node->in_coding_list_lock; - list = &orig_neigh_node->in_coding_list; - } else { - lock = &orig_neigh_node->out_coding_list_lock; - list = &orig_neigh_node->out_coding_list; - } - batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); /* Add nc_node to orig_node */ - spin_lock_bh(lock); list_add_tail_rcu(&nc_node->list, list); +unlock: spin_unlock_bh(lock); return nc_node; - -free: - kfree(nc_node); - return NULL; } /** @@ -994,15 +993,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, */ static u8 batadv_nc_random_weight_tq(u8 tq) { - u8 rand_val, rand_tq; - - get_random_bytes(&rand_val, sizeof(rand_val)); - /* randomize the estimated packet loss (max TQ - estimated TQ) */ - rand_tq = rand_val * (BATADV_TQ_MAX_VALUE - tq); - - /* normalize the randomized packet loss */ - rand_tq /= BATADV_TQ_MAX_VALUE; + u8 rand_tq = prandom_u32_max(BATADV_TQ_MAX_VALUE + 1 - tq); /* convert to (randomized) estimated tq again */ return BATADV_TQ_MAX_VALUE - rand_tq; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 6282f021ddfb..b3013fbc417e 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -462,6 +462,8 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, { struct batadv_neigh_node *neigh_node; + spin_lock_bh(&orig_node->neigh_list_lock); + neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); if (neigh_node) goto out; @@ -483,19 +485,20 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, ether_addr_copy(neigh_node->addr, neigh_addr); neigh_node->if_incoming = hard_iface; neigh_node->orig_node = orig_node; + neigh_node->last_seen = jiffies; /* extra reference for return */ atomic_set(&neigh_node->refcount, 2); - spin_lock_bh(&orig_node->neigh_list_lock); hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); - spin_unlock_bh(&orig_node->neigh_list_lock); batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", neigh_addr, orig_node->orig, hard_iface->net_dev->name); out: + spin_unlock_bh(&orig_node->neigh_list_lock); + return neigh_node; } @@ -561,6 +564,8 @@ static void batadv_orig_node_release(struct batadv_orig_node *orig_node) struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_orig_ifinfo *orig_ifinfo; + struct batadv_orig_node_vlan *vlan; + struct batadv_orig_ifinfo *last_candidate; spin_lock_bh(&orig_node->neigh_list_lock); @@ -576,8 +581,21 @@ static void batadv_orig_node_release(struct batadv_orig_node *orig_node) hlist_del_rcu(&orig_ifinfo->list); batadv_orig_ifinfo_free_ref(orig_ifinfo); } + + last_candidate = orig_node->last_bonding_candidate; + orig_node->last_bonding_candidate = NULL; spin_unlock_bh(&orig_node->neigh_list_lock); + if (last_candidate) + batadv_orig_ifinfo_free_ref(last_candidate); + + spin_lock_bh(&orig_node->vlan_list_lock); + hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { + hlist_del_rcu(&vlan->list); + batadv_orig_node_vlan_free_ref(vlan); + } + spin_unlock_bh(&orig_node->vlan_list_lock); + /* Free nc_nodes */ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); @@ -1085,7 +1103,7 @@ out: } int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, - int max_if_num) + unsigned int max_if_num) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; @@ -1121,7 +1139,7 @@ err: } int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, - int max_if_num) + unsigned int max_if_num) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index a5c37882b409..65824d892a6a 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -67,9 +67,9 @@ void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo); int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, - int max_if_num); + unsigned int max_if_num); int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, - int max_if_num); + unsigned int max_if_num); struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid); diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index d8a2f33e60e5..e470410abb44 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -359,6 +359,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, if (skb_cow(skb, ETH_HLEN) < 0) goto out; + ethhdr = eth_hdr(skb); icmph = (struct batadv_icmp_header *)skb->data; icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph; if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN) @@ -439,6 +440,52 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, } /** + * batadv_last_bonding_get - Get last_bonding_candidate of orig_node + * @orig_node: originator node whose last bonding candidate should be retrieved + * + * Return: last bonding candidate of router or NULL if not found + * + * The object is returned with refcounter increased by 1. + */ +static struct batadv_orig_ifinfo * +batadv_last_bonding_get(struct batadv_orig_node *orig_node) +{ + struct batadv_orig_ifinfo *last_bonding_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + last_bonding_candidate = orig_node->last_bonding_candidate; + + if (last_bonding_candidate) + atomic_inc(&last_bonding_candidate->refcount); + spin_unlock_bh(&orig_node->neigh_list_lock); + + return last_bonding_candidate; +} + +/** + * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node + * @orig_node: originator node whose bonding candidates should be replaced + * @new_candidate: new bonding candidate or NULL + */ +static void +batadv_last_bonding_replace(struct batadv_orig_node *orig_node, + struct batadv_orig_ifinfo *new_candidate) +{ + struct batadv_orig_ifinfo *old_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + old_candidate = orig_node->last_bonding_candidate; + + if (new_candidate) + atomic_inc(&new_candidate->refcount); + orig_node->last_bonding_candidate = new_candidate; + spin_unlock_bh(&orig_node->neigh_list_lock); + + if (old_candidate) + batadv_orig_ifinfo_free_ref(old_candidate); +} + +/** * batadv_find_router - find a suitable router for this originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: the destination node @@ -485,7 +532,7 @@ batadv_find_router(struct batadv_priv *bat_priv, * router - obviously there are no other candidates. */ rcu_read_lock(); - last_candidate = orig_node->last_bonding_candidate; + last_candidate = batadv_last_bonding_get(orig_node); if (last_candidate) last_cand_router = rcu_dereference(last_candidate->router); @@ -545,10 +592,6 @@ next: } rcu_read_unlock(); - /* last_bonding_candidate is reset below, remove the old reference. */ - if (orig_node->last_bonding_candidate) - batadv_orig_ifinfo_free_ref(orig_node->last_bonding_candidate); - /* After finding candidates, handle the three cases: * 1) there is a next candidate, use that * 2) there is no next candidate, use the first of the list @@ -557,23 +600,33 @@ next: if (next_candidate) { batadv_neigh_node_free_ref(router); - /* remove references to first candidate, we don't need it. */ - if (first_candidate) { - batadv_neigh_node_free_ref(first_candidate_router); - batadv_orig_ifinfo_free_ref(first_candidate); - } + atomic_inc(&next_candidate_router->refcount); router = next_candidate_router; - orig_node->last_bonding_candidate = next_candidate; + batadv_last_bonding_replace(orig_node, next_candidate); } else if (first_candidate) { batadv_neigh_node_free_ref(router); - /* refcounting has already been done in the loop above. */ + atomic_inc(&first_candidate_router->refcount); router = first_candidate_router; - orig_node->last_bonding_candidate = first_candidate; + batadv_last_bonding_replace(orig_node, first_candidate); } else { - orig_node->last_bonding_candidate = NULL; + batadv_last_bonding_replace(orig_node, NULL); + } + + /* cleanup of candidates */ + if (first_candidate) { + batadv_neigh_node_free_ref(first_candidate_router); + batadv_orig_ifinfo_free_ref(first_candidate); } + if (next_candidate) { + batadv_neigh_node_free_ref(next_candidate_router); + batadv_orig_ifinfo_free_ref(next_candidate); + } + + if (last_candidate) + batadv_orig_ifinfo_free_ref(last_candidate); + return router; } @@ -585,6 +638,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_unicast_packet *unicast_packet; struct ethhdr *ethhdr = eth_hdr(skb); int res, hdr_len, ret = NET_RX_DROP; + unsigned int len; unicast_packet = (struct batadv_unicast_packet *)skb->data; @@ -625,6 +679,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, if (hdr_len > 0) batadv_skb_set_priority(skb, hdr_len); + len = skb->len; res = batadv_send_skb_to_orig(skb, orig_node, recv_if); /* translate transmit result into receive result */ @@ -632,7 +687,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, /* skb was transmitted and consumed */ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, - skb->len + ETH_HLEN); + len + ETH_HLEN); ret = NET_RX_SUCCESS; } else if (res == NET_XMIT_POLICED) { @@ -649,6 +704,7 @@ out: /** * batadv_reroute_unicast_packet - update the unicast header for re-routing * @bat_priv: the bat priv with all the soft interface information + * @skb: unicast packet to process * @unicast_packet: the unicast header to be updated * @dst_addr: the payload destination * @vid: VLAN identifier @@ -660,7 +716,7 @@ out: * Returns true if the packet header has been updated, false otherwise */ static bool -batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, +batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_unicast_packet *unicast_packet, u8 *dst_addr, unsigned short vid) { @@ -689,8 +745,10 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, } /* update the packet header */ + skb_postpull_rcsum(skb, unicast_packet, sizeof(*unicast_packet)); ether_addr_copy(unicast_packet->dest, orig_addr); unicast_packet->ttvn = orig_ttvn; + skb_postpush_rcsum(skb, unicast_packet, sizeof(*unicast_packet)); ret = true; out: @@ -724,13 +782,17 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, vid = batadv_get_vid(skb, hdr_len); ethhdr = (struct ethhdr *)(skb->data + hdr_len); + /* do not reroute multicast frames in a unicast header */ + if (is_multicast_ether_addr(ethhdr->h_dest)) + return true; + /* check if the destination client was served by this node and it is now * roaming. In this case, it means that the node has got a ROAM_ADV * message and that it knows the new destination in the mesh to re-route * the packet to */ if (batadv_tt_local_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) { - if (batadv_reroute_unicast_packet(bat_priv, unicast_packet, + if (batadv_reroute_unicast_packet(bat_priv, skb, unicast_packet, ethhdr->h_dest, vid)) batadv_dbg_ratelimited(BATADV_DBG_TT, bat_priv, @@ -776,7 +838,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * destination can possibly be updated and forwarded towards the new * target host */ - if (batadv_reroute_unicast_packet(bat_priv, unicast_packet, + if (batadv_reroute_unicast_packet(bat_priv, skb, unicast_packet, ethhdr->h_dest, vid)) { batadv_dbg_ratelimited(BATADV_DBG_TT, bat_priv, "Rerouting unicast packet to %pM (dst=%pM): TTVN mismatch old_ttvn=%u new_ttvn=%u\n", @@ -799,12 +861,14 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, if (!primary_if) return 0; + /* update the packet header */ + skb_postpull_rcsum(skb, unicast_packet, sizeof(*unicast_packet)); ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr); + unicast_packet->ttvn = curr_ttvn; + skb_postpush_rcsum(skb, unicast_packet, sizeof(*unicast_packet)); batadv_hardif_free_ref(primary_if); - unicast_packet->ttvn = curr_ttvn; - return 1; } @@ -849,7 +913,6 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, bool is4addr; unicast_packet = (struct batadv_unicast_packet *)skb->data; - unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; is4addr = unicast_packet->packet_type == BATADV_UNICAST_4ADDR; /* the caller function should have already pulled 2 bytes */ @@ -870,9 +933,13 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size)) return NET_RX_DROP; + unicast_packet = (struct batadv_unicast_packet *)skb->data; + /* packet for me */ if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) { if (is4addr) { + unicast_4addr_packet = + (struct batadv_unicast_4addr_packet *)skb->data; subtype = unicast_4addr_packet->subtype; batadv_dat_inc_counter(bat_priv, subtype); @@ -998,6 +1065,12 @@ int batadv_recv_frag_packet(struct sk_buff *skb, batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_RX); batadv_add_counter(bat_priv, BATADV_CNT_FRAG_RX_BYTES, skb->len); + /* batadv_frag_skb_buffer will always consume the skb and + * the caller should therefore never try to free the + * skb after this point + */ + ret = NET_RX_SUCCESS; + /* Add fragment to buffer and merge if possible. */ if (!batadv_frag_skb_buffer(&skb, orig_node_src)) goto out; diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 0e0c3b8ed927..11fbfb222c49 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -381,8 +381,8 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node *orig_node; orig_node = batadv_gw_get_selected_orig(bat_priv); - return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, - orig_node, vid); + return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, + BATADV_P_DATA, orig_node, vid); } void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 4812123e0a2c..5105e860d3aa 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -353,9 +353,8 @@ send: goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); } else if (mcast_single_orig) { - ret = batadv_send_skb_unicast(bat_priv, skb, - BATADV_UNICAST, 0, - mcast_single_orig, vid); + ret = batadv_mcast_forw_send_orig(bat_priv, skb, vid, + mcast_single_orig); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) @@ -394,10 +393,10 @@ void batadv_interface_rx(struct net_device *soft_iface, struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; - bool is_bcast; + int packet_type; batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; - is_bcast = (batadv_bcast_packet->packet_type == BATADV_BCAST); + packet_type = batadv_bcast_packet->packet_type; /* check if enough space is available for pulling, and pull */ if (!pskb_may_pull(skb, hdr_size)) @@ -445,7 +444,7 @@ void batadv_interface_rx(struct net_device *soft_iface, /* Let the bridge loop avoidance check the packet. If will * not handle it, we can safely push it up. */ - if (batadv_bla_rx(bat_priv, skb, vid, is_bcast)) + if (batadv_bla_rx(bat_priv, skb, vid, packet_type)) goto out; if (orig_node) @@ -539,15 +538,20 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) struct batadv_softif_vlan *vlan; int err; + spin_lock_bh(&bat_priv->softif_vlan_list_lock); + vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_free_ref(vlan); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); - if (!vlan) + if (!vlan) { + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; + } vlan->bat_priv = bat_priv; vlan->vid = vid; @@ -555,16 +559,19 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) atomic_set(&vlan->ap_isolation, 0); + hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + + /* batadv_sysfs_add_vlan cannot be in the spinlock section due to the + * sleeping behavior of the sysfs functions and the fs_reclaim lock + */ err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); if (err) { - kfree(vlan); + /* ref for the list */ + batadv_softif_vlan_free_ref(vlan); return err; } - spin_lock_bh(&bat_priv->softif_vlan_list_lock); - hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); - /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ @@ -1000,7 +1007,9 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface) static void batadv_softif_destroy_netlink(struct net_device *soft_iface, struct list_head *head) { + struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *hard_iface; + struct batadv_softif_vlan *vlan; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface == soft_iface) @@ -1008,6 +1017,13 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, BATADV_IF_CLEANUP_KEEP); } + /* destroy the "untagged" VLAN */ + vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); + if (vlan) { + batadv_softif_destroy_vlan(bat_priv, vlan); + batadv_softif_vlan_free_ref(vlan); + } + batadv_sysfs_del_meshif(soft_iface); unregister_netdevice_queue(soft_iface, head); } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index ffd49b40e76a..1e71e0c9b47b 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -197,8 +197,11 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, static void batadv_tt_local_entry_free_ref(struct batadv_tt_local_entry *tt_local_entry) { - if (atomic_dec_and_test(&tt_local_entry->common.refcount)) + if (atomic_dec_and_test(&tt_local_entry->common.refcount)) { + batadv_softif_vlan_free_ref(tt_local_entry->vlan); + kfree_rcu(tt_local_entry, common.rcu); + } } /** @@ -303,9 +306,11 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); - hlist_del_init_rcu(&vlan->list); + if (!hlist_unhashed(&vlan->list)) { + hlist_del_init_rcu(&vlan->list); + batadv_orig_node_vlan_free_ref(vlan); + } spin_unlock_bh(&orig_node->vlan_list_lock); - batadv_orig_node_vlan_free_ref(vlan); } batadv_orig_node_vlan_free_ref(vlan); @@ -503,14 +508,26 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global, const char *message) { + struct batadv_tt_global_entry *tt_removed_entry; + struct hlist_node *tt_removed_node; + batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, BATADV_PRINT_VID(tt_global->common.vid), message); - batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, - batadv_choose_tt, &tt_global->common); - batadv_tt_global_entry_free_ref(tt_global); + tt_removed_node = batadv_hash_remove(bat_priv->tt.global_hash, + batadv_compare_tt, + batadv_choose_tt, + &tt_global->common); + if (!tt_removed_node) + return; + + /* drop reference of remove hash entry */ + tt_removed_entry = hlist_entry(tt_removed_node, + struct batadv_tt_global_entry, + common.hash_entry); + batadv_tt_global_entry_free_ref(tt_removed_entry); } /** @@ -597,8 +614,10 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, /* increase the refcounter of the related vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); - if (WARN(!vlan, "adding TT local entry %pM to non-existent VLAN %d", - addr, BATADV_PRINT_VID(vid))) { + if (!vlan) { + net_ratelimited_function(batadv_info, soft_iface, + "adding TT local entry %pM to non-existent VLAN %d\n", + addr, BATADV_PRINT_VID(vid)); kfree(tt_local); tt_local = NULL; goto out; @@ -636,7 +655,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_local_entry_free_ref(tt_local); - batadv_softif_vlan_free_ref(vlan); goto out; } @@ -740,7 +758,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, struct batadv_orig_node_vlan *vlan; u8 *tt_change_ptr; - rcu_read_lock(); + spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { num_vlan++; num_entries += atomic_read(&vlan->tt.num_entries); @@ -778,7 +796,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: - rcu_read_unlock(); + spin_unlock_bh(&orig_node->vlan_list_lock); return tvlv_len; } @@ -809,15 +827,20 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_softif_vlan *vlan; u16 num_vlan = 0; - u16 num_entries = 0; + u16 vlan_entries = 0; + u16 total_entries = 0; u16 tvlv_len; u8 *tt_change_ptr; int change_offset; - rcu_read_lock(); + spin_lock_bh(&bat_priv->softif_vlan_list_lock); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { + vlan_entries = atomic_read(&vlan->tt.num_entries); + if (vlan_entries < 1) + continue; + num_vlan++; - num_entries += atomic_read(&vlan->tt.num_entries); + total_entries += vlan_entries; } change_offset = sizeof(**tt_data); @@ -825,7 +848,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) - *tt_len = batadv_tt_len(num_entries); + *tt_len = batadv_tt_len(total_entries); tvlv_len = *tt_len; tvlv_len += change_offset; @@ -842,8 +865,13 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { + vlan_entries = atomic_read(&vlan->tt.num_entries); + if (vlan_entries < 1) + continue; + tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); + tt_vlan->reserved = 0; tt_vlan++; } @@ -852,7 +880,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: - rcu_read_unlock(); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return tvlv_len; } @@ -940,7 +968,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; struct batadv_hard_iface *primary_if; - struct batadv_softif_vlan *vlan; struct hlist_head *head; unsigned short vid; u32 i; @@ -977,13 +1004,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) no_purge = tt_common_entry->flags & np_flag; - vlan = batadv_softif_vlan_get(bat_priv, vid); - if (!vlan) { - seq_printf(seq, "Cannot retrieve VLAN %d\n", - BATADV_PRINT_VID(vid)); - continue; - } - seq_printf(seq, " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", tt_common_entry->addr, @@ -1001,9 +1021,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), no_purge ? 0 : last_seen_secs, no_purge ? 0 : last_seen_msecs, - vlan->tt.crc); - - batadv_softif_vlan_free_ref(vlan); + tt_local->vlan->tt.crc); } rcu_read_unlock(); } @@ -1046,10 +1064,10 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming) { + struct batadv_tt_local_entry *tt_removed_entry; struct batadv_tt_local_entry *tt_local_entry; u16 flags, curr_flags = BATADV_NO_FLAGS; - struct batadv_softif_vlan *vlan; - void *tt_entry_exists; + struct hlist_node *tt_removed_node; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) @@ -1078,23 +1096,18 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, */ batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL); - tt_entry_exists = batadv_hash_remove(bat_priv->tt.local_hash, + tt_removed_node = batadv_hash_remove(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local_entry->common); - if (!tt_entry_exists) - goto out; - - /* extra call to free the local tt entry */ - batadv_tt_local_entry_free_ref(tt_local_entry); - - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, vid); - if (!vlan) + if (!tt_removed_node) goto out; - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + /* drop reference of remove hash entry */ + tt_removed_entry = hlist_entry(tt_removed_node, + struct batadv_tt_local_entry, + common.hash_entry); + batadv_tt_local_entry_free_ref(tt_removed_entry); out: if (tt_local_entry) @@ -1168,7 +1181,6 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; - struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; @@ -1190,14 +1202,6 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) struct batadv_tt_local_entry, common); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, - tt_common_entry->vid); - if (vlan) { - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); - } - batadv_tt_local_entry_free_ref(tt_local); } spin_unlock_bh(list_lock); @@ -1273,7 +1277,8 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, */ static bool batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, - const struct batadv_orig_node *orig_node) + const struct batadv_orig_node *orig_node, + u8 *flags) { struct batadv_tt_orig_list_entry *orig_entry; bool found = false; @@ -1281,25 +1286,64 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node); if (orig_entry) { found = true; + + if (flags) + *flags = orig_entry->flags; + batadv_tt_orig_list_entry_free_ref(orig_entry); } return found; } +/** + * batadv_tt_global_sync_flags - update TT sync flags + * @tt_global: the TT global entry to update sync flags in + * + * Updates the sync flag bits in the tt_global flag attribute with a logical + * OR of all sync flags from any of its TT orig entries. + */ +static void +batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global) +{ + struct batadv_tt_orig_list_entry *orig_entry; + const struct hlist_head *head; + u16 flags = BATADV_NO_FLAGS; + + rcu_read_lock(); + head = &tt_global->orig_list; + hlist_for_each_entry_rcu(orig_entry, head, list) + flags |= orig_entry->flags; + rcu_read_unlock(); + + flags |= tt_global->common.flags & (~BATADV_TT_SYNC_MASK); + tt_global->common.flags = flags; +} + +/** + * batadv_tt_global_orig_entry_add - add or update a TT orig entry + * @tt_global: the TT global entry to add an orig entry in + * @orig_node: the originator to add an orig entry for + * @ttvn: translation table version number of this changeset + * @flags: TT sync flags + */ static void batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, - struct batadv_orig_node *orig_node, int ttvn) + struct batadv_orig_node *orig_node, int ttvn, + u8 flags) { struct batadv_tt_orig_list_entry *orig_entry; + spin_lock_bh(&tt_global->list_lock); + orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (orig_entry) { /* refresh the ttvn: the current value could be a bogus one that * was added during a "temporary client detection" */ orig_entry->ttvn = ttvn; - goto out; + orig_entry->flags = flags; + goto sync_flags; } orig_entry = kzalloc(sizeof(*orig_entry), GFP_ATOMIC); @@ -1311,17 +1355,20 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; + orig_entry->flags = flags; atomic_set(&orig_entry->refcount, 2); - spin_lock_bh(&tt_global->list_lock); hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); - spin_unlock_bh(&tt_global->list_lock); atomic_inc(&tt_global->orig_list_count); +sync_flags: + batadv_tt_global_sync_flags(tt_global); out: if (orig_entry) batadv_tt_orig_list_entry_free_ref(orig_entry); + + spin_unlock_bh(&tt_global->list_lock); } /** @@ -1379,7 +1426,9 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, ether_addr_copy(common->addr, tt_addr); common->vid = vid; - common->flags = flags; + if (!is_multicast_ether_addr(common->addr)) + common->flags = flags & (~BATADV_TT_SYNC_MASK); + tt_global_entry->roam_at = 0; /* node must store current time in case of roaming. This is * needed to purge this entry out on timeout (if nobody claims @@ -1420,7 +1469,7 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, if (!(common->flags & BATADV_TT_CLIENT_TEMP)) goto out; if (batadv_tt_global_entry_has_orig(tt_global_entry, - orig_node)) + orig_node, NULL)) goto out_remove; batadv_tt_global_del_orig_list(tt_global_entry); goto add_orig_entry; @@ -1441,7 +1490,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, * TT_CLIENT_WIFI, therefore they have to be copied in the * client entry */ - tt_global_entry->common.flags |= flags; + if (!is_multicast_ether_addr(common->addr)) + tt_global_entry->common.flags |= flags & (~BATADV_TT_SYNC_MASK); /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a @@ -1458,7 +1508,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, } add_orig_entry: /* add the new orig_entry (if needed) or update it */ - batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn); + batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn, + flags & BATADV_TT_SYNC_MASK); batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new global tt entry: %pM (vid: %d, via %pM)\n", @@ -2111,6 +2162,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; + struct batadv_tt_orig_list_entry *tt_orig; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; @@ -2149,8 +2201,9 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, /* find out if this global entry is announced by this * originator */ - if (!batadv_tt_global_entry_has_orig(tt_global, - orig_node)) + tt_orig = batadv_tt_global_orig_entry_find(tt_global, + orig_node); + if (!tt_orig) continue; /* use network order to read the VID: this ensures that @@ -2162,10 +2215,12 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, /* compute the CRC on flags that have to be kept in sync * among nodes */ - flags = tt_common->flags & BATADV_TT_SYNC_MASK; + flags = tt_orig->flags; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); + + batadv_tt_orig_list_entry_free_ref(tt_orig); } rcu_read_unlock(); } @@ -2230,6 +2285,29 @@ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, return crc; } +/** + * batadv_tt_req_node_release - free tt_req node entry + * @ref: kref pointer of the tt req_node entry + */ +static void batadv_tt_req_node_release(struct kref *ref) +{ + struct batadv_tt_req_node *tt_req_node; + + tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount); + + kfree(tt_req_node); +} + +/** + * batadv_tt_req_node_put - decrement the tt_req_node refcounter and + * possibly release it + * @tt_req_node: tt_req_node to be free'd + */ +static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node) +{ + kref_put(&tt_req_node->refcount, batadv_tt_req_node_release); +} + static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_req_node *node; @@ -2239,7 +2317,7 @@ static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { hlist_del_init(&node->list); - kfree(node); + batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); @@ -2276,7 +2354,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv) if (batadv_has_timed_out(node->issued_at, BATADV_TT_REQUEST_TIMEOUT)) { hlist_del_init(&node->list); - kfree(node); + batadv_tt_req_node_put(node); } } spin_unlock_bh(&bat_priv->tt.req_list_lock); @@ -2308,9 +2386,11 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv, if (!tt_req_node) goto unlock; + kref_init(&tt_req_node->refcount); ether_addr_copy(tt_req_node->addr, orig_node->orig); tt_req_node->issued_at = jiffies; + kref_get(&tt_req_node->refcount); hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list); unlock: spin_unlock_bh(&bat_priv->tt.req_list_lock); @@ -2324,17 +2404,24 @@ unlock: * * Returns 1 if the entry is a valid, 0 otherwise. */ -static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr) +static int batadv_tt_local_valid(const void *entry_ptr, + const void *data_ptr, + u8 *flags) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; if (tt_common_entry->flags & BATADV_TT_CLIENT_NEW) return 0; + + if (flags) + *flags = tt_common_entry->flags; + return 1; } static int batadv_tt_global_valid(const void *entry_ptr, - const void *data_ptr) + const void *data_ptr, + u8 *flags) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; const struct batadv_tt_global_entry *tt_global_entry; @@ -2348,7 +2435,8 @@ static int batadv_tt_global_valid(const void *entry_ptr, struct batadv_tt_global_entry, common); - return batadv_tt_global_entry_has_orig(tt_global_entry, orig_node); + return batadv_tt_global_entry_has_orig(tt_global_entry, orig_node, + flags); } /** @@ -2364,18 +2452,25 @@ static int batadv_tt_global_valid(const void *entry_ptr, static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, void *tvlv_buff, u16 tt_len, - int (*valid_cb)(const void *, const void *), + int (*valid_cb)(const void *, + const void *, + u8 *flags), void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tvlv_tt_change *tt_change; struct hlist_head *head; u16 tt_tot, tt_num_entries = 0; + u8 flags; + bool ret; u32 i; tt_tot = batadv_tt_entries(tt_len); tt_change = (struct batadv_tvlv_tt_change *)tvlv_buff; + if (!valid_cb) + return; + rcu_read_lock(); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -2385,11 +2480,12 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, if (tt_tot == tt_num_entries) break; - if ((valid_cb) && (!valid_cb(tt_common_entry, cb_data))) + ret = valid_cb(tt_common_entry, cb_data, &flags); + if (!ret) continue; ether_addr_copy(tt_change->addr, tt_common_entry->addr); - tt_change->flags = tt_common_entry->flags; + tt_change->flags = flags; tt_change->vid = htons(tt_common_entry->vid); memset(tt_change->reserved, 0, sizeof(tt_change->reserved)); @@ -2560,13 +2656,19 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, out: if (primary_if) batadv_hardif_free_ref(primary_if); + if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); - /* hlist_del_init() verifies tt_req_node still is in the list */ - hlist_del_init(&tt_req_node->list); + if (!hlist_unhashed(&tt_req_node->list)) { + hlist_del_init(&tt_req_node->list); + batadv_tt_req_node_put(tt_req_node); + } spin_unlock_bh(&bat_priv->tt.req_list_lock); - kfree(tt_req_node); } + + if (tt_req_node) + batadv_tt_req_node_put(tt_req_node); + kfree(tvlv_tt_data); return ret; } @@ -3002,7 +3104,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, if (!batadv_compare_eth(node->addr, resp_src)) continue; hlist_del_init(&node->list); - kfree(node); + batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); @@ -3227,7 +3329,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; - struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ @@ -3257,13 +3358,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_tt_local_entry, common); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); - if (vlan) { - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); - } - batadv_tt_local_entry_free_ref(tt_local); } spin_unlock_bh(list_lock); @@ -3741,8 +3835,10 @@ int batadv_tt_init(struct batadv_priv *bat_priv) return ret; ret = batadv_tt_global_init(bat_priv); - if (ret < 0) + if (ret < 0) { + batadv_tt_local_table_free(bat_priv); return ret; + } batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1, batadv_tt_tvlv_unicast_handler_v1, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index cbd347c2e4a5..8fce1241ad6d 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -77,11 +77,13 @@ enum batadv_dhcp_recipient { * @ogm_buff: buffer holding the OGM packet * @ogm_buff_len: length of the OGM packet buffer * @ogm_seqno: OGM sequence number - used to identify each OGM + * @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */ struct batadv_hard_iface_bat_iv { unsigned char *ogm_buff; int ogm_buff_len; atomic_t ogm_seqno; + struct mutex ogm_buff_mutex; }; /** @@ -103,7 +105,7 @@ struct batadv_hard_iface_bat_iv { */ struct batadv_hard_iface { struct list_head list; - s16 if_num; + unsigned int if_num; char if_status; struct net_device *net_dev; u8 num_bcasts; @@ -287,7 +289,9 @@ struct batadv_orig_node { DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); u32 last_bcast_seqno; struct hlist_head neigh_list; - /* neigh_list_lock protects: neigh_list and router */ + /* neigh_list_lock protects: neigh_list, ifinfo_list, + * last_bonding_candidate and router + */ spinlock_t neigh_list_lock; struct hlist_node hash_entry; struct batadv_priv *bat_priv; @@ -806,7 +810,7 @@ struct batadv_priv { atomic_t bcast_seqno; atomic_t bcast_queue_left; atomic_t batman_queue_left; - char num_ifaces; + unsigned int num_ifaces; struct kobject *mesh_obj; struct dentry *debug_dir; struct hlist_head forw_bat_list; @@ -884,6 +888,7 @@ struct batadv_socket_packet { * backbone gateway - no bcast traffic is formwared until the situation was * resolved * @crc: crc16 checksum over all claims + * @crc_lock: lock protecting crc * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ @@ -897,6 +902,7 @@ struct batadv_bla_backbone_gw { atomic_t wait_periods; atomic_t request_sent; u16 crc; + spinlock_t crc_lock; /* protects crc */ atomic_t refcount; struct rcu_head rcu; }; @@ -915,6 +921,7 @@ struct batadv_bla_claim { u8 addr[ETH_ALEN]; unsigned short vid; struct batadv_bla_backbone_gw *backbone_gw; + spinlock_t backbone_lock; /* protects backbone_gw */ unsigned long lasttime; struct hlist_node hash_entry; struct rcu_head rcu; @@ -947,10 +954,12 @@ struct batadv_tt_common_entry { * struct batadv_tt_local_entry - translation table local entry data * @common: general translation table data * @last_seen: timestamp used for purging stale tt local entries + * @vlan: soft-interface vlan of the entry */ struct batadv_tt_local_entry { struct batadv_tt_common_entry common; unsigned long last_seen; + struct batadv_softif_vlan *vlan; }; /** @@ -973,6 +982,7 @@ struct batadv_tt_global_entry { * struct batadv_tt_orig_list_entry - orig node announcing a non-mesh client * @orig_node: pointer to orig node announcing this non-mesh client * @ttvn: translation table version number which added the non-mesh client + * @flags: per orig entry TT sync flags * @list: list node for batadv_tt_global_entry::orig_list * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner @@ -980,6 +990,7 @@ struct batadv_tt_global_entry { struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; u8 ttvn; + u8 flags; struct hlist_node list; atomic_t refcount; struct rcu_head rcu; @@ -999,11 +1010,13 @@ struct batadv_tt_change_node { * struct batadv_tt_req_node - data to keep track of the tt requests in flight * @addr: mac address address of the originator this request was sent to * @issued_at: timestamp used for purging stale tt requests + * @refcount: number of contexts the object is used by * @list: list node for batadv_priv_tt::req_list */ struct batadv_tt_req_node { u8 addr[ETH_ALEN]; unsigned long issued_at; + struct kref refcount; struct hlist_node list; }; @@ -1168,9 +1181,9 @@ struct batadv_algo_ops { struct batadv_hard_iface *hard_iface); void (*bat_orig_free)(struct batadv_orig_node *orig_node); int (*bat_orig_add_if)(struct batadv_orig_node *orig_node, - int max_if_num); + unsigned int max_if_num); int (*bat_orig_del_if)(struct batadv_orig_node *orig_node, - int max_if_num, int del_if_num); + unsigned int max_if_num, unsigned int del_if_num); }; /** diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4cd6b8d811ff..11602902884b 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -57,6 +57,7 @@ static bool enable_6lowpan; /* We are listening incoming connections via this channel */ static struct l2cap_chan *listen_chan; +static DEFINE_MUTEX(set_lock); struct lowpan_peer { struct list_head list; @@ -1195,12 +1196,14 @@ static void do_enable_set(struct work_struct *work) enable_6lowpan = set_enable->flag; + mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); } listen_chan = bt_6lowpan_listen(); + mutex_unlock(&set_lock); kfree(set_enable); } @@ -1252,11 +1255,13 @@ static ssize_t lowpan_control_write(struct file *fp, if (ret == -EINVAL) return ret; + mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); listen_chan = NULL; } + mutex_unlock(&set_lock); if (conn) { struct lowpan_peer *peer; diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 5f123c3320a7..fcd819ffda10 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -233,6 +233,9 @@ static int a2mp_discover_rsp(struct amp_mgr *mgr, struct sk_buff *skb, struct a2mp_info_req req; found = true; + + memset(&req, 0, sizeof(req)); + req.id = cl->id; a2mp_send(mgr, A2MP_GETINFO_REQ, __next_ident(mgr), sizeof(req), &req); @@ -312,6 +315,8 @@ static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb, if (!hdev || hdev->dev_type != HCI_AMP) { struct a2mp_info_rsp rsp; + memset(&rsp, 0, sizeof(rsp)); + rsp.id = req->id; rsp.status = A2MP_STATUS_INVALID_CTRL_ID; @@ -355,6 +360,8 @@ static int a2mp_getinfo_rsp(struct amp_mgr *mgr, struct sk_buff *skb, if (!ctrl) return -ENOMEM; + memset(&req, 0, sizeof(req)); + req.id = rsp->id; a2mp_send(mgr, A2MP_GETAMPASSOC_REQ, __next_ident(mgr), sizeof(req), &req); @@ -381,6 +388,8 @@ static int a2mp_getampassoc_req(struct amp_mgr *mgr, struct sk_buff *skb, hdev = hci_dev_get(req->id); if (!hdev || hdev->amp_type == AMP_TYPE_BREDR || tmp) { struct a2mp_amp_assoc_rsp rsp; + + memset(&rsp, 0, sizeof(rsp)); rsp.id = req->id; if (tmp) { @@ -471,7 +480,6 @@ static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, struct a2mp_cmd *hdr) { struct a2mp_physlink_req *req = (void *) skb->data; - struct a2mp_physlink_rsp rsp; struct hci_dev *hdev; struct hci_conn *hcon; @@ -482,6 +490,8 @@ static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, BT_DBG("local_id %d, remote_id %d", req->local_id, req->remote_id); + memset(&rsp, 0, sizeof(rsp)); + rsp.local_id = req->remote_id; rsp.remote_id = req->local_id; @@ -509,6 +519,7 @@ static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, assoc = kmemdup(req->amp_assoc, assoc_len, GFP_KERNEL); if (!assoc) { amp_ctrl_put(ctrl); + hci_dev_put(hdev); return -ENOMEM; } @@ -560,6 +571,8 @@ static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, BT_DBG("local_id %d remote_id %d", req->local_id, req->remote_id); + memset(&rsp, 0, sizeof(rsp)); + rsp.local_id = req->remote_id; rsp.remote_id = req->local_id; rsp.status = A2MP_STATUS_SUCCESS; @@ -682,6 +695,8 @@ static int a2mp_chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) if (err) { struct a2mp_cmd_rej rej; + memset(&rej, 0, sizeof(rej)); + rej.reason = cpu_to_le16(0); hdr = (void *) skb->data; @@ -905,6 +920,8 @@ void a2mp_send_getinfo_rsp(struct hci_dev *hdev) BT_DBG("%s mgr %p", hdev->name, mgr); + memset(&rsp, 0, sizeof(rsp)); + rsp.id = hdev->id; rsp.status = A2MP_STATUS_INVALID_CTRL_ID; @@ -1002,6 +1019,8 @@ void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status) if (!mgr) return; + memset(&rsp, 0, sizeof(rsp)); + hs_hcon = hci_conn_hash_lookup_state(hdev, AMP_LINK, BT_CONNECT); if (!hs_hcon) { rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION; @@ -1034,6 +1053,8 @@ void a2mp_discover_amp(struct l2cap_chan *chan) mgr->bredr_chan = chan; + memset(&req, 0, sizeof(req)); + req.mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU); req.ext_feat = 0; a2mp_send(mgr, A2MP_DISCOVER_REQ, 1, sizeof(req), &req); diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index e32f34189007..b01b43ab6f83 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -305,6 +305,9 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev, struct hci_request req; int err = 0; + if (!mgr) + return; + cp.phy_handle = hcon->handle; cp.len_so_far = cpu_to_le16(0); cp.max_len = cpu_to_le16(hdev->amp_assoc_size); diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h index c32638dddbf9..f6b9dc4e408f 100644 --- a/net/bluetooth/cmtp/cmtp.h +++ b/net/bluetooth/cmtp/cmtp.h @@ -26,7 +26,7 @@ #include <linux/types.h> #include <net/bluetooth/bluetooth.h> -#define BTNAMSIZ 18 +#define BTNAMSIZ 21 /* CMTP ioctl defines */ #define CMTPCONNADD _IOW('C', 200, int) diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 77f73bfa840b..2133b53eb152 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -392,6 +392,11 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) if (!(session->flags & BIT(CMTP_LOOPBACK))) { err = cmtp_attach_device(session); if (err < 0) { + /* Caller will call fput in case of failure, and so + * will cmtp_session kthread. + */ + get_file(session->sock->file); + atomic_inc(&session->terminate); wake_up_interruptible(sk_sleep(session->sock->sk)); up_write(&cmtp_session_sem); @@ -495,9 +500,7 @@ static int __init cmtp_init(void) { BT_INFO("CMTP (CAPI Emulation) ver %s", VERSION); - cmtp_init_sockets(); - - return 0; + return cmtp_init_sockets(); } static void __exit cmtp_exit(void) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 211c2599ab96..6e5316031916 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1173,6 +1173,23 @@ int hci_conn_check_link_mode(struct hci_conn *conn) return 0; } + /* AES encryption is required for Level 4: + * + * BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 3, Part C + * page 1319: + * + * 128-bit equivalent strength for link and encryption keys + * required using FIPS approved algorithms (E0 not allowed, + * SAFER+ not allowed, and P-192 not allowed; encryption key + * not shortened) + */ + if (conn->sec_level == BT_SECURITY_FIPS && + !test_bit(HCI_CONN_AES_CCM, &conn->flags)) { + bt_dev_err(conn->hdev, + "Invalid security: Missing AES-CCM usage"); + return 0; + } + if (hci_conn_ssp_enabled(conn) && !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 0; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5b95477c3453..f88076f55ce9 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -371,12 +371,17 @@ static int hci_req_sync(struct hci_dev *hdev, { int ret; - if (!test_bit(HCI_UP, &hdev->flags)) - return -ENETDOWN; - /* Serialize all requests */ hci_req_lock(hdev); - ret = __hci_req_sync(hdev, req, opt, timeout); + /* check the state after obtaing the lock to protect the HCI_UP + * against any races from hci_dev_do_close when the controller + * gets removed. + */ + if (test_bit(HCI_UP, &hdev->flags)) + ret = __hci_req_sync(hdev, req, opt, timeout); + else + ret = -ENETDOWN; + hci_req_unlock(hdev); return ret; @@ -1352,6 +1357,12 @@ int hci_inquiry(void __user *arg) goto done; } + /* Restrict maximum inquiry length to 60 seconds */ + if (ir.length > 60) { + err = -EINVAL; + goto done; + } + hci_dev_lock(hdev); if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) { @@ -1372,8 +1383,10 @@ int hci_inquiry(void __user *arg) * cleared). If it is interrupted by a signal, return -EINTR. */ if (wait_on_bit(&hdev->flags, HCI_INQUIRY, - TASK_INTERRUPTIBLE)) - return -EINTR; + TASK_INTERRUPTIBLE)) { + err = -EINTR; + goto done; + } } /* for unlimited number of responses we will use buffer with @@ -1548,8 +1561,13 @@ static int hci_dev_do_open(struct hci_dev *hdev) } else { /* Init failed, cleanup */ flush_work(&hdev->tx_work); - flush_work(&hdev->cmd_work); + + /* Since hci_rx_work() is possible to awake new cmd_work + * it should be flushed first to avoid unexpected call of + * hci_cmd_work() + */ flush_work(&hdev->rx_work); + flush_work(&hdev->cmd_work); skb_queue_purge(&hdev->cmd_q); skb_queue_purge(&hdev->rx_q); @@ -1667,6 +1685,14 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_req_cancel(hdev, ENODEV); hci_req_lock(hdev); + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + test_bit(HCI_UP, &hdev->flags)) { + /* Execute vendor specific shutdown routine */ + if (hdev->shutdown) + hdev->shutdown(hdev); + } + if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); hci_req_unlock(hdev); @@ -3433,6 +3459,7 @@ int hci_register_dev(struct hci_dev *hdev) return id; err_wqueue: + debugfs_remove_recursive(hdev->debugfs); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); err: @@ -3445,14 +3472,10 @@ EXPORT_SYMBOL(hci_register_dev); /* Unregister HCI device */ void hci_unregister_dev(struct hci_dev *hdev) { - int id; - BT_DBG("%pK name %s bus %d", hdev, hdev->name, hdev->bus); hci_dev_set_flag(hdev, HCI_UNREGISTER); - id = hdev->id; - write_lock(&hci_dev_list_lock); list_del(&hdev->list); write_unlock(&hci_dev_list_lock); @@ -3481,7 +3504,14 @@ void hci_unregister_dev(struct hci_dev *hdev) } device_del(&hdev->dev); + /* Actual cleanup is deferred until hci_cleanup_dev(). */ + hci_dev_put(hdev); +} +EXPORT_SYMBOL(hci_unregister_dev); +/* Cleanup HCI device */ +void hci_cleanup_dev(struct hci_dev *hdev) +{ debugfs_remove_recursive(hdev->debugfs); destroy_workqueue(hdev->workqueue); @@ -3501,11 +3531,8 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_discovery_filter_clear(hdev); hci_dev_unlock(hdev); - hci_dev_put(hdev); - - ida_simple_remove(&hci_index_ida, id); + ida_simple_remove(&hci_index_ida, hdev->id); } -EXPORT_SYMBOL(hci_unregister_dev); /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) @@ -4459,7 +4486,14 @@ static void hci_rx_work(struct work_struct *work) hci_send_to_sock(hdev, skb); } - if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { + /* If the device has been opened in HCI_USER_CHANNEL, + * the userspace has exclusive access to device. + * When device is HCI_INIT, we still need to process + * the data packets to the driver in order + * to complete its setup(). + */ + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index cc1b7488861b..60cddf0b36e6 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -41,12 +41,27 @@ /* Handle HCI Event packets */ -static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) +static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb, + u8 *new_status) { __u8 status = *((__u8 *) skb->data); BT_DBG("%s status 0x%2.2x", hdev->name, status); + /* It is possible that we receive Inquiry Complete event right + * before we receive Inquiry Cancel Command Complete event, in + * which case the latter event should have status of Command + * Disallowed (0x0c). This should not be treated as error, since + * we actually achieve what Inquiry Cancel wants to achieve, + * which is to end the last Inquiry session. + */ + if (status == 0x0c && !test_bit(HCI_INQUIRY, &hdev->flags)) { + bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command"); + status = 0x00; + } + + *new_status = status; + if (status) return; @@ -1118,6 +1133,9 @@ static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct discovery_state *d = &hdev->discovery; + if (len > HCI_MAX_AD_LENGTH) + return; + bacpy(&d->last_adv_addr, bdaddr); d->last_adv_addr_type = bdaddr_type; d->last_adv_rssi = rssi; @@ -2094,7 +2112,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s num_rsp %d", hdev->name, num_rsp); - if (!num_rsp) + if (!num_rsp || skb->len < num_rsp * sizeof(*info) + 1) return; if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) @@ -2464,7 +2482,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) &cp); } else { clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); - hci_encrypt_cfm(conn, ev->status, 0x00); + hci_encrypt_cfm(conn, ev->status); } } @@ -2550,22 +2568,7 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, conn->enc_key_size = rp->key_size; } - if (conn->state == BT_CONFIG) { - conn->state = BT_CONNECTED; - hci_connect_cfm(conn, 0); - hci_conn_drop(conn); - } else { - u8 encrypt; - - if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) - encrypt = 0x00; - else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) - encrypt = 0x02; - else - encrypt = 0x01; - - hci_encrypt_cfm(conn, 0, encrypt); - } + hci_encrypt_cfm(conn, 0); unlock: hci_dev_unlock(hdev); @@ -2612,24 +2615,20 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); + /* Check link security requirements are met */ + if (!hci_conn_check_link_mode(conn)) + ev->status = HCI_ERROR_AUTH_FAILURE; + if (ev->status && conn->state == BT_CONNECTED) { + /* Notify upper layers so they can cleanup before + * disconnecting. + */ + hci_encrypt_cfm(conn, ev->status); hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); hci_conn_drop(conn); goto unlock; } - /* In Secure Connections Only mode, do not allow any connections - * that are not encrypted with AES-CCM using a P-256 authenticated - * combination key. - */ - if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && - (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || - conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { - hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); - hci_conn_drop(conn); - goto unlock; - } - /* Try reading the encryption key size for encrypted ACL links */ if (!ev->status && ev->encrypt && conn->type == ACL_LINK) { struct hci_cp_read_enc_key_size cp; @@ -2659,14 +2658,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) } notify: - if (conn->state == BT_CONFIG) { - if (!ev->status) - conn->state = BT_CONNECTED; - - hci_connect_cfm(conn, ev->status); - hci_conn_drop(conn); - } else - hci_encrypt_cfm(conn, ev->status, ev->encrypt); + hci_encrypt_cfm(conn, ev->status); unlock: hci_dev_unlock(hdev); @@ -2758,7 +2750,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, switch (*opcode) { case HCI_OP_INQUIRY_CANCEL: - hci_cc_inquiry_cancel(hdev, skb); + hci_cc_inquiry_cancel(hdev, skb, status); break; case HCI_OP_PERIODIC_INQ: @@ -3609,6 +3601,9 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct inquiry_info_with_rssi_and_pscan_mode *info; info = (void *) (skb->data + 1); + if (skb->len < num_rsp * sizeof(*info) + 1) + goto unlock; + for (; num_rsp; num_rsp--, info++) { u32 flags; @@ -3630,6 +3625,9 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, } else { struct inquiry_info_with_rssi *info = (void *) (skb->data + 1); + if (skb->len < num_rsp * sizeof(*info) + 1) + goto unlock; + for (; num_rsp; num_rsp--, info++) { u32 flags; @@ -3650,6 +3648,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, } } +unlock: hci_dev_unlock(hdev); } @@ -3748,6 +3747,21 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, switch (ev->status) { case 0x00: + /* The synchronous connection complete event should only be + * sent once per new connection. Receiving a successful + * complete event when the connection status is already + * BT_CONNECTED means that the device is misbehaving and sent + * multiple complete event packets for the same new connection. + * + * Registering the device more than once can corrupt kernel + * memory, hence upon detecting this invalid event, we report + * an error and ignore the packet. + */ + if (conn->state == BT_CONNECTED) { + bt_dev_err(hdev, "Ignoring connect complete event for existing connection"); + goto unlock; + } + conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; conn->type = ev->link_type; @@ -3761,6 +3775,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, case 0x11: /* Unsupported Feature or Parameter Value */ case 0x1c: /* SCO interval rejected */ case 0x1a: /* Unsupported Remote Feature */ + case 0x1e: /* Invalid LMP Parameters */ case 0x1f: /* Unspecified error */ case 0x20: /* Unsupported LMP Parameter value */ if (conn->out) { @@ -3811,7 +3826,7 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, BT_DBG("%s num_rsp %d", hdev->name, num_rsp); - if (!num_rsp) + if (!num_rsp || skb->len < num_rsp * sizeof(*info) + 1) return; if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) @@ -4336,6 +4351,11 @@ static void hci_phy_link_complete_evt(struct hci_dev *hdev, return; } + if (!hcon->amp_mgr) { + hci_dev_unlock(hdev); + return; + } + if (ev->status) { hci_conn_del(hcon); hci_dev_unlock(hdev); @@ -4380,6 +4400,7 @@ static void hci_loglink_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) return; hchan->handle = le16_to_cpu(ev->handle); + hchan->amp = true; BT_DBG("hcon %pK mgr %pK hchan %pK", hcon, hcon->amp_mgr, hchan); @@ -4412,7 +4433,7 @@ static void hci_disconn_loglink_complete_evt(struct hci_dev *hdev, hci_dev_lock(hdev); hchan = hci_chan_lookup_handle(hdev, le16_to_cpu(ev->handle)); - if (!hchan) + if (!hchan || !hchan->amp) goto unlock; amp_destroy_logical_link(hchan, ev->reason); @@ -4729,6 +4750,11 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, u32 flags; u8 *ptr, real_len; + if (len > HCI_MAX_AD_LENGTH) { + pr_err_ratelimited("legacy adv larger than 31 bytes"); + return; + } + /* Find the end of the data in case the report contains padded zero * bytes at the end causing an invalid length value. * @@ -4789,7 +4815,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, */ conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type, direct_addr); - if (conn && type == LE_ADV_IND) { + if (conn && type == LE_ADV_IND && len <= HCI_MAX_AD_LENGTH) { /* Store report for later inclusion by * mgmt_device_connected */ @@ -4914,10 +4940,20 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) struct hci_ev_le_advertising_info *ev = ptr; s8 rssi; - rssi = ev->data[ev->length]; - process_adv_report(hdev, ev->evt_type, &ev->bdaddr, - ev->bdaddr_type, NULL, 0, rssi, - ev->data, ev->length); + if (ptr > (void *)skb_tail_pointer(skb) - sizeof(*ev)) { + bt_dev_err(hdev, "Malicious advertising data."); + break; + } + + if (ev->length <= HCI_MAX_AD_LENGTH && + ev->data + ev->length <= skb_tail_pointer(skb)) { + rssi = ev->data[ev->length]; + process_adv_report(hdev, ev->evt_type, &ev->bdaddr, + ev->bdaddr_type, NULL, 0, rssi, + ev->data, ev->length); + } else { + bt_dev_err(hdev, "Dropping invalid advertising data"); + } ptr += sizeof(*ev) + ev->length + 1; } @@ -5105,20 +5141,18 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) { u8 num_reports = skb->data[0]; - void *ptr = &skb->data[1]; + struct hci_ev_le_direct_adv_info *ev = (void *)&skb->data[1]; - hci_dev_lock(hdev); + if (!num_reports || skb->len < num_reports * sizeof(*ev) + 1) + return; - while (num_reports--) { - struct hci_ev_le_direct_adv_info *ev = ptr; + hci_dev_lock(hdev); + for (; num_reports; num_reports--, ev++) process_adv_report(hdev, ev->evt_type, &ev->bdaddr, ev->bdaddr_type, &ev->direct_addr, ev->direct_addr_type, ev->rssi, NULL, 0); - ptr += sizeof(*ev); - } - hci_dev_unlock(hdev); } @@ -5222,6 +5256,11 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) u8 status = 0, event = hdr->evt, req_evt = 0; u16 opcode = HCI_OP_NOP; + if (!event) { + bt_dev_warn(hdev, "Received unexpected HCI Event 00000000"); + goto done; + } + if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->hci.req_event == event) { struct hci_command_hdr *cmd_hdr = (void *) hdev->sent_cmd->data; opcode = __le16_to_cpu(cmd_hdr->opcode); @@ -5433,6 +5472,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) req_complete_skb(hdev, status, opcode, orig_skb); } +done: kfree_skb(orig_skb); kfree_skb(skb); hdev->stat.evt_rx++; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 4f6f5d89278b..6ab9dda306b9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -53,6 +53,17 @@ struct hci_pinfo { unsigned long flags; }; +static struct hci_dev *hci_hdev_from_sock(struct sock *sk) +{ + struct hci_dev *hdev = hci_pi(sk)->hdev; + + if (!hdev) + return ERR_PTR(-EBADFD); + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) + return ERR_PTR(-EPIPE); + return hdev; +} + void hci_sock_set_flag(struct sock *sk, int nr) { set_bit(nr, &hci_pi(sk)->flags); @@ -322,7 +333,8 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) ni->type = hdev->dev_type; ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); - memcpy(ni->name, hdev->name, 8); + memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name, + strnlen(hdev->name, sizeof(ni->name)), '\0'); opcode = cpu_to_le16(HCI_MON_NEW_INDEX); break; @@ -480,19 +492,13 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event) if (event == HCI_DEV_UNREG) { struct sock *sk; - /* Detach sockets from device */ + /* Wake up sockets using this dead device */ read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { - bh_lock_sock_nested(sk); if (hci_pi(sk)->hdev == hdev) { - hci_pi(sk)->hdev = NULL; sk->sk_err = EPIPE; - sk->sk_state = BT_OPEN; sk->sk_state_change(sk); - - hci_dev_put(hdev); } - bh_unlock_sock(sk); } read_unlock(&hci_sk_list.lock); } @@ -631,10 +637,10 @@ static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg) static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { - struct hci_dev *hdev = hci_pi(sk)->hdev; + struct hci_dev *hdev = hci_hdev_from_sock(sk); - if (!hdev) - return -EBADFD; + if (IS_ERR(hdev)) + return PTR_ERR(hdev); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; @@ -766,6 +772,18 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, lock_sock(sk); + /* Allow detaching from dead device and attaching to alive device, if + * the caller wants to re-bind (instead of close) this socket in + * response to hci_sock_dev_event(HCI_DEV_UNREG) notification. + */ + hdev = hci_pi(sk)->hdev; + if (hdev && hci_dev_test_flag(hdev, HCI_UNREGISTER)) { + hci_pi(sk)->hdev = NULL; + sk->sk_state = BT_OPEN; + hci_dev_put(hdev); + } + hdev = NULL; + if (sk->sk_state == BT_BOUND) { err = -EALREADY; goto done; @@ -937,9 +955,9 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, lock_sock(sk); - hdev = hci_pi(sk)->hdev; - if (!hdev) { - err = -EBADFD; + hdev = hci_hdev_from_sock(sk); + if (IS_ERR(hdev)) { + err = PTR_ERR(hdev); goto done; } @@ -1191,9 +1209,9 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, goto done; } - hdev = hci_pi(sk)->hdev; - if (!hdev) { - err = -EBADFD; + hdev = hci_hdev_from_sock(sk); + if (IS_ERR(hdev)) { + err = PTR_ERR(hdev); goto done; } diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 4f78b28686ff..a76b1371a7fc 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -180,6 +180,9 @@ ATTRIBUTE_GROUPS(bt_host); static void bt_host_release(struct device *dev) { struct hci_dev *hdev = to_hci_dev(dev); + + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) + hci_cleanup_dev(hdev); kfree(hdev); module_put(THIS_MODULE); } diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index e614940c4e98..8e47a5392129 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -735,7 +735,7 @@ static void hidp_stop(struct hid_device *hid) hid->claimed = 0; } -static struct hid_ll_driver hidp_hid_driver = { +struct hid_ll_driver hidp_hid_driver = { .parse = hidp_parse, .start = hidp_start, .stop = hidp_stop, @@ -744,6 +744,7 @@ static struct hid_ll_driver hidp_hid_driver = { .raw_request = hidp_raw_request, .output_report = hidp_output_report, }; +EXPORT_SYMBOL_GPL(hidp_hid_driver); /* This function sets up the hid device. It does not add it to the HID system. That is done in hidp_add_connection(). */ @@ -1283,7 +1284,7 @@ static int hidp_session_thread(void *arg) /* cleanup runtime environment */ remove_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait); - remove_wait_queue(sk_sleep(session->intr_sock->sk), &ctrl_wait); + remove_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait); wake_up_interruptible(&session->report_queue); hidp_del_timer(session); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 9497acbf9154..820c78945e16 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -403,6 +403,9 @@ static void l2cap_chan_timeout(struct work_struct *work) BT_DBG("chan %pK state %s", chan, state_to_string(chan->state)); mutex_lock(&conn->chan_lock); + /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling + * this work. No need to call l2cap_chan_hold(chan) here again. + */ l2cap_chan_lock(chan); if (chan->state == BT_CONNECTED || chan->state == BT_CONFIG) @@ -415,12 +418,12 @@ static void l2cap_chan_timeout(struct work_struct *work) l2cap_chan_close(chan, reason); - l2cap_chan_unlock(chan); - chan->ops->close(chan); - mutex_unlock(&conn->chan_lock); + l2cap_chan_unlock(chan); l2cap_chan_put(chan); + + mutex_unlock(&conn->chan_lock); } struct l2cap_chan *l2cap_chan_create(void) @@ -431,6 +434,8 @@ struct l2cap_chan *l2cap_chan_create(void) if (!chan) return NULL; + skb_queue_head_init(&chan->tx_q); + skb_queue_head_init(&chan->srej_q); mutex_init(&chan->lock); /* Set default lock nesting level */ @@ -496,7 +501,9 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan) chan->flush_to = L2CAP_DEFAULT_FLUSH_TO; chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO; chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO; + chan->conf_state = 0; + set_bit(CONF_NOT_COMPLETE, &chan->conf_state); set_bit(FLAG_FORCE_ACTIVE, &chan->flags); } @@ -1714,9 +1721,9 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_chan_del(chan, err); - l2cap_chan_unlock(chan); - chan->ops->close(chan); + + l2cap_chan_unlock(chan); l2cap_chan_put(chan); } @@ -4093,7 +4100,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, return 0; } - if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2) { + if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2 && + chan->state != BT_CONNECTED) { cmd_reject_invalid_cid(conn, cmd->ident, chan->scid, chan->dcid); goto unlock; @@ -4316,6 +4324,7 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, return 0; } + l2cap_chan_hold(chan); l2cap_chan_lock(chan); rsp.dcid = cpu_to_le16(chan->scid); @@ -4324,12 +4333,11 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, chan->ops->set_shutdown(chan); - l2cap_chan_hold(chan); l2cap_chan_del(chan, ECONNRESET); - l2cap_chan_unlock(chan); - chan->ops->close(chan); + + l2cap_chan_unlock(chan); l2cap_chan_put(chan); mutex_unlock(&conn->chan_lock); @@ -4361,20 +4369,21 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, return 0; } + l2cap_chan_hold(chan); l2cap_chan_lock(chan); if (chan->state != BT_DISCONN) { l2cap_chan_unlock(chan); + l2cap_chan_put(chan); mutex_unlock(&conn->chan_lock); return 0; } - l2cap_chan_hold(chan); l2cap_chan_del(chan, 0); - l2cap_chan_unlock(chan); - chan->ops->close(chan); + + l2cap_chan_unlock(chan); l2cap_chan_put(chan); mutex_unlock(&conn->chan_lock); @@ -4898,10 +4907,8 @@ void __l2cap_physical_cfm(struct l2cap_chan *chan, int result) BT_DBG("chan %pK, result %d, local_amp_id %d, remote_amp_id %d", chan, result, local_amp_id, remote_amp_id); - if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) { - l2cap_chan_unlock(chan); + if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) return; - } if (chan->state != BT_CONNECTED) { l2cap_do_create(chan, result, local_amp_id, remote_amp_id); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index bd3c021932be..6c127f4ac3a2 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1038,7 +1038,7 @@ done: } /* Kill socket (only if zapped and orphan) - * Must be called on unlocked socket. + * Must be called on unlocked socket, with l2cap channel lock. */ static void l2cap_sock_kill(struct sock *sk) { @@ -1189,6 +1189,7 @@ static int l2cap_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err; + struct l2cap_chan *chan; BT_DBG("sock %pK, sk %pK", sock, sk); @@ -1198,9 +1199,17 @@ static int l2cap_sock_release(struct socket *sock) bt_sock_unlink(&l2cap_sk_list, sk); err = l2cap_sock_shutdown(sock, 2); + chan = l2cap_pi(sk)->chan; + + l2cap_chan_hold(chan); + l2cap_chan_lock(chan); sock_orphan(sk); l2cap_sock_kill(sk); + + l2cap_chan_unlock(chan); + l2cap_chan_put(chan); + return err; } @@ -1218,12 +1227,15 @@ static void l2cap_sock_cleanup_listen(struct sock *parent) BT_DBG("child chan %pK state %s", chan, state_to_string(chan->state)); + l2cap_chan_hold(chan); l2cap_chan_lock(chan); + __clear_chan_timer(chan); l2cap_chan_close(chan, ECONNRESET); - l2cap_chan_unlock(chan); - l2cap_sock_kill(sk); + + l2cap_chan_unlock(chan); + l2cap_chan_put(chan); } } @@ -1297,6 +1309,9 @@ static void l2cap_sock_close_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (!sk) + return; + l2cap_sock_kill(sk); } @@ -1305,6 +1320,9 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err) struct sock *sk = chan->data; struct sock *parent; + if (!sk) + return; + BT_DBG("chan %pK state %s", chan, state_to_string(chan->state)); /* This callback can be called both for server (BT_LISTEN) @@ -1318,8 +1336,6 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err) parent = bt_sk(sk)->parent; - sock_set_flag(sk, SOCK_ZAPPED); - switch (chan->state) { case BT_OPEN: case BT_BOUND: @@ -1346,8 +1362,11 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err) break; } - release_sock(sk); + + /* Only zap after cleanup to avoid use after free race */ + sock_set_flag(sk, SOCK_ZAPPED); + } static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state, @@ -1473,8 +1492,10 @@ static void l2cap_sock_destruct(struct sock *sk) { BT_DBG("sk %pK", sk); - if (l2cap_pi(sk)->chan) + if (l2cap_pi(sk)->chan) { + l2cap_pi(sk)->chan->data = NULL; l2cap_chan_put(l2cap_pi(sk)->chan); + } if (l2cap_pi(sk)->rx_busy_skb) { kfree_skb(l2cap_pi(sk)->rx_busy_skb); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1f4a4a790df6..9ffabc81057c 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -212,12 +212,15 @@ static u8 mgmt_status_table[] = { MGMT_STATUS_TIMEOUT, /* Instant Passed */ MGMT_STATUS_NOT_SUPPORTED, /* Pairing Not Supported */ MGMT_STATUS_FAILED, /* Transaction Collision */ + MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_INVALID_PARAMS, /* Unacceptable Parameter */ MGMT_STATUS_REJECTED, /* QoS Rejected */ MGMT_STATUS_NOT_SUPPORTED, /* Classification Not Supported */ MGMT_STATUS_REJECTED, /* Insufficient Security */ MGMT_STATUS_INVALID_PARAMS, /* Parameter Out Of Range */ + MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_BUSY, /* Role Switch Pending */ + MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_FAILED, /* Slot Violation */ MGMT_STATUS_FAILED, /* Role Switch Failed */ MGMT_STATUS_INVALID_PARAMS, /* EIR Too Large */ @@ -628,7 +631,8 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (lmp_ssp_capable(hdev)) { settings |= MGMT_SETTING_SSP; - settings |= MGMT_SETTING_HS; + if (IS_ENABLED(CONFIG_BT_HS)) + settings |= MGMT_SETTING_HS; } if (lmp_sc_capable(hdev)) @@ -2430,6 +2434,10 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) BT_DBG("request for %s", hdev->name); + if (!IS_ENABLED(CONFIG_BT_HS)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, + MGMT_STATUS_NOT_SUPPORTED); + status = mgmt_bredr_support(hdev); if (status) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, status); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 71f8126be12b..d3f29174caa5 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -413,10 +413,8 @@ static int __rfcomm_create_dev(struct sock *sk, void __user *arg) dlc = rfcomm_dlc_exists(&req.src, &req.dst, req.channel); if (IS_ERR(dlc)) return PTR_ERR(dlc); - else if (dlc) { - rfcomm_dlc_put(dlc); + if (dlc) return -EBUSY; - } dlc = rfcomm_dlc_alloc(GFP_KERNEL); if (!dlc) return -ENOMEM; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 46c3f086a261..4c20ceaf3089 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -83,7 +83,6 @@ static void sco_sock_timeout(unsigned long arg) sk->sk_state_change(sk); bh_unlock_sock(sk); - sco_sock_kill(sk); sock_put(sk); } @@ -175,7 +174,6 @@ static void sco_conn_del(struct hci_conn *hcon, int err) sco_sock_clear_timer(sk); sco_chan_del(sk, err); bh_unlock_sock(sk); - sco_sock_kill(sk); sock_put(sk); } @@ -271,7 +269,8 @@ done: return err; } -static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) +static int sco_send_frame(struct sock *sk, void *buf, int len, + unsigned int msg_flags) { struct sco_conn *conn = sco_pi(sk)->conn; struct sk_buff *skb; @@ -283,15 +282,11 @@ static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) BT_DBG("sk %pK len %d", sk, len); - skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); + skb = bt_skb_send_alloc(sk, len, msg_flags & MSG_DONTWAIT, &err); if (!skb) return err; - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - kfree_skb(skb); - return -EFAULT; - } - + memcpy(skb_put(skb, len), buf, len); hci_send_sco(conn->hcon, skb); return len; @@ -392,8 +387,7 @@ static void sco_sock_cleanup_listen(struct sock *parent) */ static void sco_sock_kill(struct sock *sk) { - if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket || - sock_flag(sk, SOCK_DEAD)) + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) return; BT_DBG("sk %pK state %d", sk, sk->sk_state); @@ -445,7 +439,6 @@ static void sco_sock_close(struct sock *sk) lock_sock(sk); __sco_sock_close(sk); release_sock(sk); - sco_sock_kill(sk); } static void sco_sock_init(struct sock *sk, struct sock *parent) @@ -704,6 +697,7 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; + void *buf; int err; BT_DBG("sock %pK, sk %pK", sock, sk); @@ -715,14 +709,24 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (memcpy_from_msg(buf, msg, len)) { + kfree(buf); + return -EFAULT; + } + lock_sock(sk); if (sk->sk_state == BT_CONNECTED) - err = sco_send_frame(sk, msg, len); + err = sco_send_frame(sk, buf, len, msg->msg_flags); else err = -ENOTCONN; release_sock(sk); + kfree(buf); return err; } @@ -763,6 +767,11 @@ static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting) cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; + default: + /* use CVSD settings as fallback */ + cp.max_latency = cpu_to_le16(0xffff); + cp.retrans_effort = 0xff; + break; } hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index d3114dfd8adf..1d9c517dd3d3 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2654,6 +2654,15 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*key)) return SMP_INVALID_PARAMS; + /* Check if remote and local public keys are the same and debug key is + * not in use. + */ + if (!test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags) && + !crypto_memneq(key, smp->local_pk, 64)) { + bt_dev_err(hdev, "Remote and local public keys are identical"); + return SMP_UNSPECIFIED; + } + memcpy(smp->remote_pk, key, 64); if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags)) { diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0346c215ff6a..49c29fa7fd30 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -167,6 +167,7 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, sum.rx_packets += tmp.rx_packets; } + netdev_stats_to_stats64(stats, &dev->stats); stats->tx_bytes = sum.tx_bytes; stats->tx_packets = sum.tx_packets; stats->rx_bytes = sum.rx_bytes; @@ -200,6 +201,12 @@ static int br_set_mac_address(struct net_device *dev, void *p) if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; + /* dev_set_mac_addr() can be called by a master device on bridge's + * NETDEV_UNREGISTER, but since it's being destroyed do nothing + */ + if (dev->reg_state != NETREG_REGISTERED) + return -EBUSY; + spin_lock_bh(&br->lock); if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) { /* Mac address will be changed in br_stp_change_bridge_id(). */ diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 09442e0f7f67..1aa1d3d4979f 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -266,7 +266,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) /* If old entry was unassociated with any port, then delete it. */ f = __br_fdb_get(br, br->dev->dev_addr, 0); - if (f && f->is_local && !f->dst) + if (f && f->is_local && !f->dst && !f->added_by_user) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); @@ -281,7 +281,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (!br_vlan_should_use(v)) continue; f = __br_fdb_get(br, br->dev->dev_addr, v->vid); - if (f && f->is_local && !f->dst) + if (f && f->is_local && !f->dst && !f->added_by_user) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, v->vid); } @@ -758,20 +758,25 @@ out: } /* Update (create or replace) forwarding database entry */ -static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, - __u16 state, __u16 flags, __u16 vid) +static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, + const __u8 *addr, __u16 state, __u16 flags, __u16 vid) { - struct net_bridge *br = source->br; struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; bool modified = false; /* If the port cannot learn allow only local and static entries */ - if (!(state & NUD_PERMANENT) && !(state & NUD_NOARP) && + if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && !(source->state == BR_STATE_LEARNING || source->state == BR_STATE_FORWARDING)) return -EPERM; + if (!source && !(state & NUD_PERMANENT)) { + pr_info("bridge: RTM_NEWNEIGH %s without NUD_PERMANENT\n", + br->dev->name); + return -EINVAL; + } + fdb = fdb_find(head, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -826,22 +831,28 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, return 0; } -static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p, - const unsigned char *addr, u16 nlh_flags, u16 vid) +static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, + struct net_bridge_port *p, const unsigned char *addr, + u16 nlh_flags, u16 vid) { int err = 0; if (ndm->ndm_flags & NTF_USE) { + if (!p) { + pr_info("bridge: RTM_NEWNEIGH %s with NTF_USE is not supported\n", + br->dev->name); + return -EINVAL; + } local_bh_disable(); rcu_read_lock(); - br_fdb_update(p->br, p, addr, vid, true); + br_fdb_update(br, p, addr, vid, true); rcu_read_unlock(); local_bh_enable(); } else { - spin_lock_bh(&p->br->hash_lock); - err = fdb_add_entry(p, addr, ndm->ndm_state, + spin_lock_bh(&br->hash_lock); + err = fdb_add_entry(br, p, addr, ndm->ndm_state, nlh_flags, vid); - spin_unlock_bh(&p->br->hash_lock); + spin_unlock_bh(&br->hash_lock); } return err; @@ -878,6 +889,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], dev->name); return -EINVAL; } + br = p->br; vg = nbp_vlan_group(p); } @@ -889,15 +901,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } /* VID was specified, so use it. */ - if (dev->priv_flags & IFF_EBRIDGE) - err = br_fdb_insert(br, NULL, addr, vid); - else - err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid); } else { - if (dev->priv_flags & IFF_EBRIDGE) - err = br_fdb_insert(br, NULL, addr, 0); - else - err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0); if (err || !vg || !vg->num_vlans) goto out; @@ -908,11 +914,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - if (dev->priv_flags & IFF_EBRIDGE) - err = br_fdb_insert(br, NULL, addr, v->vid); - else - err = __br_fdb_add(ndm, p, addr, nlh_flags, - v->vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid); if (err) goto out; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index e24754a0e052..920b7c0f1e2d 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -78,13 +78,10 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; - if (dev->flags & IFF_NOARP) + if ((dev->flags & IFF_NOARP) || + !pskb_may_pull(skb, arp_hdr_len(dev))) return; - if (!pskb_may_pull(skb, arp_hdr_len(dev))) { - dev->stats.tx_dropped++; - return; - } parp = arp_hdr(skb); if (parp->ar_pro != htons(ETH_P_IP) || diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 2ae0451fd634..f580dbaac5a9 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -638,6 +638,9 @@ static unsigned int br_nf_forward_arp(void *priv, nf_bridge_pull_encap_header(skb); } + if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr)))) + return NF_DROP; + if (arp_hdr(skb)->ar_pln != 4) { if (IS_VLAN_ARP(skb)) nf_bridge_push_encap_header(skb); @@ -708,9 +711,17 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); @@ -728,8 +739,6 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; - nf_bridge_update_protocol(skb); - data = this_cpu_ptr(&brnf_frag_data_storage); data->vlan_tci = skb->vlan_tci; @@ -752,8 +761,6 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; - nf_bridge_update_protocol(skb); - data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; @@ -801,8 +808,6 @@ static unsigned int br_nf_post_routing(void *priv, else return NF_ACCEPT; - /* We assume any code from br_dev_queue_push_xmit onwards doesn't care - * about the value of skb->pkt_type. */ if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index a7953962112a..1b848a45047b 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -225,8 +225,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) } masterv = br_vlan_get_master(br, v->vid); - if (!masterv) + if (!masterv) { + err = -ENOMEM; goto out_filt; + } v->brvlan = masterv; } diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c index 517e78befcb2..61a9f1be1263 100644 --- a/net/bridge/netfilter/ebt_limit.c +++ b/net/bridge/netfilter/ebt_limit.c @@ -105,6 +105,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = { .match = ebt_limit_mt, .checkentry = ebt_limit_mt_check, .matchsize = sizeof(struct ebt_limit_info), + .usersize = offsetof(struct ebt_limit_info, prev), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct ebt_compat_limit_info), #endif diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index d9471e3ef216..e7c170949b21 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1883,7 +1883,7 @@ static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz) } static int ebt_buf_add(struct ebt_entries_buf_state *state, - void *data, unsigned int sz) + const void *data, unsigned int sz) { if (state->buf_kern_start == NULL) goto count_only; @@ -1917,7 +1917,7 @@ enum compat_mwt { EBT_COMPAT_TARGET, }; -static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, +static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, enum compat_mwt compat_mwt, struct ebt_entries_buf_state *state, const unsigned char *base) @@ -1994,22 +1994,23 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, * return size of all matches, watchers or target, including necessary * alignment and padding. */ -static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, +static int ebt_size_mwt(const struct compat_ebt_entry_mwt *match32, unsigned int size_left, enum compat_mwt type, struct ebt_entries_buf_state *state, const void *base) { + const char *buf = (const char *)match32; int growth = 0; - char *buf; if (size_left == 0) return 0; - buf = (char *) match32; - - while (size_left >= sizeof(*match32)) { + do { struct ebt_entry_match *match_kern; int ret; + if (size_left < sizeof(*match32)) + return -EINVAL; + match_kern = (struct ebt_entry_match *) state->buf_kern_start; if (match_kern) { char *tmp; @@ -2046,22 +2047,18 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, if (match_kern) match_kern->match_size = ret; - /* rule should have no remaining data after target */ - if (type == EBT_COMPAT_TARGET && size_left) - return -EINVAL; - match32 = (struct compat_ebt_entry_mwt *) buf; - } + } while (size_left); return growth; } /* called for all ebt_entry structures. */ -static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, +static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *base, unsigned int *total, struct ebt_entries_buf_state *state) { - unsigned int i, j, startoff, new_offset = 0; + unsigned int i, j, startoff, next_expected_off, new_offset = 0; /* stores match/watchers/targets & offset of next struct ebt_entry: */ unsigned int offsets[4]; unsigned int *offsets_update = NULL; @@ -2149,11 +2146,13 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, return ret; } - startoff = state->buf_user_offset - startoff; + next_expected_off = state->buf_user_offset - startoff; + if (next_expected_off != entry->next_offset) + return -EINVAL; - if (WARN_ON(*total < startoff)) + if (*total < entry->next_offset) return -EINVAL; - *total -= startoff; + *total -= entry->next_offset; return 0; } @@ -2175,7 +2174,9 @@ static int compat_copy_entries(unsigned char *data, unsigned int size_user, if (ret < 0) return ret; - WARN_ON(size_remaining); + if (size_remaining) + return -EINVAL; + return state->buf_kern_offset; } diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 6e48aa69fa24..d7af67a3f19c 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -35,6 +35,12 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, ether_addr_copy(eth->h_dest, eth_hdr(oldskb)->h_source); eth->h_proto = eth_hdr(oldskb)->h_proto; skb_pull(nskb, ETH_HLEN); + + if (skb_vlan_tag_present(oldskb)) { + u16 vid = skb_vlan_tag_get(oldskb); + + __vlan_hwaccel_put_tag(nskb, oldskb->vlan_proto, vid); + } } /* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index a0443d40d677..a28ffbbf7450 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -303,7 +303,7 @@ static void dev_flowctrl(struct net_device *dev, int on) caifd_put(caifd); } -void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, +int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct cflayer *link_support, int head_room, struct cflayer **layer, int (**rcv_func)(struct sk_buff *, struct net_device *, @@ -314,11 +314,12 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, enum cfcnfg_phy_preference pref; struct cfcnfg *cfg = get_cfcnfg(dev_net(dev)); struct caif_device_entry_list *caifdevs; + int res; caifdevs = caif_device_list(dev_net(dev)); caifd = caif_device_alloc(dev); if (!caifd) - return; + return -ENOMEM; *layer = &caifd->layer; spin_lock_init(&caifd->flow_lock); @@ -340,7 +341,7 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, sizeof(caifd->layer.name) - 1); caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0; caifd->layer.transmit = transmit; - cfcnfg_add_phy_layer(cfg, + res = cfcnfg_add_phy_layer(cfg, dev, &caifd->layer, pref, @@ -350,6 +351,7 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, mutex_unlock(&caifdevs->lock); if (rcv_func) *rcv_func = receive; + return res; } EXPORT_SYMBOL(caif_enroll_dev); @@ -364,6 +366,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, struct cflayer *layer, *link_support; int head_room = 0; struct caif_device_entry_list *caifdevs; + int res; cfg = get_cfcnfg(dev_net(dev)); caifdevs = caif_device_list(dev_net(dev)); @@ -389,8 +392,10 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, break; } } - caif_enroll_dev(dev, caifdev, link_support, head_room, + res = caif_enroll_dev(dev, caifdev, link_support, head_room, &layer, NULL); + if (res) + cfserl_release(link_support); caifdev->flowctrl = dev_flowctrl; break; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index aa209b1066c9..3cfd413aa2c8 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -539,7 +539,8 @@ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, goto err; ret = -EINVAL; - if (unlikely(msg->msg_iter.iov->iov_base == NULL)) + if (unlikely(msg->msg_iter.nr_segs == 0) || + unlikely(msg->msg_iter.iov->iov_base == NULL)) goto err; noblock = msg->msg_flags & MSG_DONTWAIT; diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 5cd44f001f64..485dde566c1a 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -116,6 +116,11 @@ static struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN], return (struct cflayer *) this; } +static void cfusbl_release(struct cflayer *layer) +{ + kfree(layer); +} + static struct packet_type caif_usb_type __read_mostly = { .type = cpu_to_be16(ETH_P_802_EX1), }; @@ -128,6 +133,7 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, struct cflayer *layer, *link_support; struct usbnet *usbnet; struct usb_device *usbdev; + int res; /* Check whether we have a NCM device, and find its VID/PID. */ if (!(dev->dev.parent && dev->dev.parent->driver && @@ -170,8 +176,11 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, if (dev->num_tx_queues > 1) pr_warn("USB device uses more than one tx queue\n"); - caif_enroll_dev(dev, &common, link_support, CFUSB_MAX_HEADLEN, + res = caif_enroll_dev(dev, &common, link_support, CFUSB_MAX_HEADLEN, &layer, &caif_usb_type.func); + if (res) + goto err; + if (!pack_added) dev_add_pack(&caif_usb_type); pack_added = true; @@ -181,6 +190,9 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, layer->name[sizeof(layer->name) - 1] = 0; return 0; +err: + cfusbl_release(link_support); + return res; } static struct notifier_block caif_device_notifier = { diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index fa39fc298708..c45b531a6cd5 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -455,7 +455,7 @@ unlock: rcu_read_unlock(); } -void +int cfcnfg_add_phy_layer(struct cfcnfg *cnfg, struct net_device *dev, struct cflayer *phy_layer, enum cfcnfg_phy_preference pref, @@ -464,7 +464,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, { struct cflayer *frml; struct cfcnfg_phyinfo *phyinfo = NULL; - int i; + int i, res = 0; u8 phyid; mutex_lock(&cnfg->lock); @@ -478,12 +478,15 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, goto got_phyid; } pr_warn("Too many CAIF Link Layers (max 6)\n"); + res = -EEXIST; goto out; got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); - if (!phyinfo) + if (!phyinfo) { + res = -ENOMEM; goto out_err; + } phy_layer->id = phyid; phyinfo->pref = pref; @@ -497,8 +500,10 @@ got_phyid: frml = cffrml_create(phyid, fcs); - if (!frml) + if (!frml) { + res = -ENOMEM; goto out_err; + } phyinfo->frm_layer = frml; layer_set_up(frml, cnfg->mux); @@ -516,11 +521,12 @@ got_phyid: list_add_rcu(&phyinfo->node, &cnfg->phys); out: mutex_unlock(&cnfg->lock); - return; + return res; out_err: kfree(phyinfo); mutex_unlock(&cnfg->lock); + return res; } EXPORT_SYMBOL(cfcnfg_add_phy_layer); diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c index ce60f06d76de..af1e1e36dc90 100644 --- a/net/caif/cfserl.c +++ b/net/caif/cfserl.c @@ -31,6 +31,11 @@ static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); +void cfserl_release(struct cflayer *layer) +{ + kfree(layer); +} + struct cflayer *cfserl_create(int instance, bool use_stx) { struct cfserl *this = kzalloc(sizeof(struct cfserl), GFP_ATOMIC); diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 67a4a36febd1..40f032f62029 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -56,20 +56,6 @@ struct chnl_net { enum caif_states state; }; -static void robust_list_del(struct list_head *delete_node) -{ - struct list_head *list_node; - struct list_head *n; - ASSERT_RTNL(); - list_for_each_safe(list_node, n, &chnl_net_list) { - if (list_node == delete_node) { - list_del(list_node); - return; - } - } - WARN_ON(1); -} - static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct sk_buff *skb; @@ -371,6 +357,7 @@ static int chnl_net_init(struct net_device *dev) ASSERT_RTNL(); priv = netdev_priv(dev); strncpy(priv->name, dev->name, sizeof(priv->name)); + INIT_LIST_HEAD(&priv->list_field); return 0; } @@ -379,7 +366,7 @@ static void chnl_net_uninit(struct net_device *dev) struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); - robust_list_del(&priv->list_field); + list_del_init(&priv->list_field); } static const struct net_device_ops netdev_ops = { @@ -542,7 +529,7 @@ static void __exit chnl_exit_module(void) rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); - list_del(list_node); + list_del_init(list_node); delete_device(dev); } rtnl_unlock(); diff --git a/net/can/bcm.c b/net/can/bcm.c index 1f15622d3c65..549ee0de456f 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -121,7 +121,7 @@ struct bcm_sock { struct sock sk; int bound; int ifindex; - struct notifier_block notifier; + struct list_head notifier; struct list_head rx_ops; struct list_head tx_ops; unsigned long dropped_usr_msgs; @@ -129,6 +129,10 @@ struct bcm_sock { char procname [32]; /* inode number in decimal with \0 */ }; +static LIST_HEAD(bcm_notifier_list); +static DEFINE_SPINLOCK(bcm_notifier_lock); +static struct bcm_sock *bcm_busy_notifier; + static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; @@ -392,6 +396,7 @@ static void bcm_tx_timeout_tsklet(unsigned long data) if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ + memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = TX_EXPIRED; msg_head.flags = op->flags; msg_head.count = op->count; @@ -439,6 +444,7 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) /* this element is not throttled anymore */ data->can_dlc &= (BCM_CAN_DLC_MASK|RX_RECV); + memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; head.flags = op->flags; head.count = op->count; @@ -550,6 +556,7 @@ static void bcm_rx_timeout_tsklet(unsigned long data) struct bcm_msg_head msg_head; /* create notification to user */ + memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; msg_head.flags = op->flags; msg_head.count = op->count; @@ -730,21 +737,21 @@ static struct bcm_op *bcm_find_op(struct list_head *ops, canid_t can_id, static void bcm_remove_op(struct bcm_op *op) { if (op->tsklet.func) { - while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) || - test_bit(TASKLET_STATE_RUN, &op->tsklet.state) || - hrtimer_active(&op->timer)) { - hrtimer_cancel(&op->timer); + do { tasklet_kill(&op->tsklet); - } + hrtimer_cancel(&op->timer); + } while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) || + test_bit(TASKLET_STATE_RUN, &op->tsklet.state) || + hrtimer_active(&op->timer)); } if (op->thrtsklet.func) { - while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) || - test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) || - hrtimer_active(&op->thrtimer)) { - hrtimer_cancel(&op->thrtimer); + do { tasklet_kill(&op->thrtsklet); - } + hrtimer_cancel(&op->thrtimer); + } while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) || + test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) || + hrtimer_active(&op->thrtimer)); } if ((op->frames) && (op->frames != &op->sframe)) @@ -806,6 +813,7 @@ static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex) bcm_rx_handler, op); list_del(&op->list); + synchronize_rcu(); bcm_remove_op(op); return 1; /* done */ } @@ -1385,20 +1393,15 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) /* * notification handler for netdevice status changes */ -static int bcm_notifier(struct notifier_block *nb, unsigned long msg, - void *ptr) +static void bcm_notify(struct bcm_sock *bo, unsigned long msg, + struct net_device *dev) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier); struct sock *sk = &bo->sk; struct bcm_op *op; int notify_enodev = 0; if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - if (dev->type != ARPHRD_CAN) - return NOTIFY_DONE; + return; switch (msg) { @@ -1433,7 +1436,28 @@ static int bcm_notifier(struct notifier_block *nb, unsigned long msg, sk->sk_error_report(sk); } } +} +static int bcm_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) + return NOTIFY_DONE; + if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */ + return NOTIFY_DONE; + + spin_lock(&bcm_notifier_lock); + list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) { + spin_unlock(&bcm_notifier_lock); + bcm_notify(bcm_busy_notifier, msg, dev); + spin_lock(&bcm_notifier_lock); + } + bcm_busy_notifier = NULL; + spin_unlock(&bcm_notifier_lock); return NOTIFY_DONE; } @@ -1453,9 +1477,9 @@ static int bcm_init(struct sock *sk) INIT_LIST_HEAD(&bo->rx_ops); /* set notifier */ - bo->notifier.notifier_call = bcm_notifier; - - register_netdevice_notifier(&bo->notifier); + spin_lock(&bcm_notifier_lock); + list_add_tail(&bo->notifier, &bcm_notifier_list); + spin_unlock(&bcm_notifier_lock); return 0; } @@ -1476,7 +1500,14 @@ static int bcm_release(struct socket *sock) /* remove bcm_ops, timer, rx_unregister(), etc. */ - unregister_netdevice_notifier(&bo->notifier); + spin_lock(&bcm_notifier_lock); + while (bcm_busy_notifier == bo) { + spin_unlock(&bcm_notifier_lock); + schedule_timeout_uninterruptible(1); + spin_lock(&bcm_notifier_lock); + } + list_del(&bo->notifier); + spin_unlock(&bcm_notifier_lock); lock_sock(sk); @@ -1508,9 +1539,13 @@ static int bcm_release(struct socket *sock) REGMASK(op->can_id), bcm_rx_handler, op); - bcm_remove_op(op); } + synchronize_rcu(); + + list_for_each_entry_safe(op, next, &bo->rx_ops, list) + bcm_remove_op(op); + /* remove procfs entry */ if (proc_dir && bo->bcm_proc_read) remove_proc_entry(bo->procname, proc_dir); @@ -1662,6 +1697,10 @@ static const struct can_proto bcm_can_proto = { .prot = &bcm_proto, }; +static struct notifier_block canbcm_notifier = { + .notifier_call = bcm_notifier +}; + static int __init bcm_module_init(void) { int err; @@ -1676,6 +1715,8 @@ static int __init bcm_module_init(void) /* create /proc/net/can-bcm directory */ proc_dir = proc_mkdir("can-bcm", init_net.proc_net); + register_netdevice_notifier(&canbcm_notifier); + return 0; } @@ -1685,6 +1726,8 @@ static void __exit bcm_module_exit(void) if (proc_dir) remove_proc_entry("can-bcm", init_net.proc_net); + + unregister_netdevice_notifier(&canbcm_notifier); } module_init(bcm_module_init); diff --git a/net/can/gw.c b/net/can/gw.c index 81650affa3fa..1867000f8a65 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -497,6 +497,7 @@ static int cgw_notifier(struct notifier_block *nb, if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } @@ -941,6 +942,7 @@ static void cgw_remove_all_jobs(void) hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } @@ -1008,6 +1010,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) hlist_del(&gwj->list); cgw_unregister_filter(gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); err = 0; break; diff --git a/net/can/raw.c b/net/can/raw.c index e9403a26a1d5..1c2bf97ca168 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -84,7 +84,7 @@ struct raw_sock { struct sock sk; int bound; int ifindex; - struct notifier_block notifier; + struct list_head notifier; int loopback; int recv_own_msgs; int fd_frames; @@ -96,6 +96,10 @@ struct raw_sock { struct uniqframe __percpu *uniq; }; +static LIST_HEAD(raw_notifier_list); +static DEFINE_SPINLOCK(raw_notifier_lock); +static struct raw_sock *raw_busy_notifier; + /* * Return pointer to store the extra msg flags for raw_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' @@ -260,21 +264,16 @@ static int raw_enable_allfilters(struct net_device *dev, struct sock *sk) return err; } -static int raw_notifier(struct notifier_block *nb, - unsigned long msg, void *ptr) +static void raw_notify(struct raw_sock *ro, unsigned long msg, + struct net_device *dev) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct raw_sock *ro = container_of(nb, struct raw_sock, notifier); struct sock *sk = &ro->sk; if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - if (dev->type != ARPHRD_CAN) - return NOTIFY_DONE; + return; if (ro->ifindex != dev->ifindex) - return NOTIFY_DONE; + return; switch (msg) { @@ -303,7 +302,28 @@ static int raw_notifier(struct notifier_block *nb, sk->sk_error_report(sk); break; } +} + +static int raw_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) + return NOTIFY_DONE; + if (unlikely(raw_busy_notifier)) /* Check for reentrant bug. */ + return NOTIFY_DONE; + spin_lock(&raw_notifier_lock); + list_for_each_entry(raw_busy_notifier, &raw_notifier_list, notifier) { + spin_unlock(&raw_notifier_lock); + raw_notify(raw_busy_notifier, msg, dev); + spin_lock(&raw_notifier_lock); + } + raw_busy_notifier = NULL; + spin_unlock(&raw_notifier_lock); return NOTIFY_DONE; } @@ -332,9 +352,9 @@ static int raw_init(struct sock *sk) return -ENOMEM; /* set notifier */ - ro->notifier.notifier_call = raw_notifier; - - register_netdevice_notifier(&ro->notifier); + spin_lock(&raw_notifier_lock); + list_add_tail(&ro->notifier, &raw_notifier_list); + spin_unlock(&raw_notifier_lock); return 0; } @@ -349,7 +369,14 @@ static int raw_release(struct socket *sock) ro = raw_sk(sk); - unregister_netdevice_notifier(&ro->notifier); + spin_lock(&raw_notifier_lock); + while (raw_busy_notifier == ro) { + spin_unlock(&raw_notifier_lock); + schedule_timeout_uninterruptible(1); + spin_lock(&raw_notifier_lock); + } + list_del(&ro->notifier); + spin_unlock(&raw_notifier_lock); lock_sock(sk); @@ -514,10 +541,18 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; } + rtnl_lock(); lock_sock(sk); - if (ro->bound && ro->ifindex) + if (ro->bound && ro->ifindex) { dev = dev_get_by_index(&init_net, ro->ifindex); + if (!dev) { + if (count > 1) + kfree(filter); + err = -ENODEV; + goto out_fil; + } + } if (ro->bound) { /* (try to) register the new filters */ @@ -554,6 +589,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, dev_put(dev); release_sock(sk); + rtnl_unlock(); break; @@ -566,10 +602,16 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, err_mask &= CAN_ERR_MASK; + rtnl_lock(); lock_sock(sk); - if (ro->bound && ro->ifindex) + if (ro->bound && ro->ifindex) { dev = dev_get_by_index(&init_net, ro->ifindex); + if (!dev) { + err = -ENODEV; + goto out_err; + } + } /* remove current error mask */ if (ro->bound) { @@ -591,6 +633,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, dev_put(dev); release_sock(sk); + rtnl_unlock(); break; @@ -857,6 +900,10 @@ static const struct can_proto raw_can_proto = { .prot = &raw_proto, }; +static struct notifier_block canraw_notifier = { + .notifier_call = raw_notifier +}; + static __init int raw_module_init(void) { int err; @@ -866,6 +913,8 @@ static __init int raw_module_init(void) err = can_proto_register(&raw_can_proto); if (err < 0) printk(KERN_ERR "can: registration of raw protocol failed\n"); + else + register_netdevice_notifier(&canraw_notifier); return err; } @@ -873,6 +922,7 @@ static __init int raw_module_init(void) static __exit void raw_module_exit(void) { can_proto_unregister(&raw_can_proto); + unregister_netdevice_notifier(&canraw_notifier); } module_init(raw_module_init); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3ed2796d008b..3fbc312e43ce 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2976,6 +2976,11 @@ static void con_fault(struct ceph_connection *con) ceph_msg_put(con->in_msg); con->in_msg = NULL; } + if (con->out_msg) { + BUG_ON(con->out_msg->con != con); + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); diff --git a/net/compat.c b/net/compat.c index d67684010455..53e58bfe0dc6 100644 --- a/net/compat.c +++ b/net/compat.c @@ -159,7 +159,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, if (kcmlen > stackbuf_size) kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL); if (kcmsg == NULL) - return -ENOBUFS; + return -ENOMEM; /* Now copy them over neatly. */ memset(kcmsg, 0, kcmlen); @@ -284,6 +284,7 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) break; } /* Bump the usage count and install the file. */ + __receive_sock(fp[i]); fd_install(new_fd, get_file(fp[i])); } @@ -355,7 +356,8 @@ static int do_set_sock_timeout(struct socket *sock, int level, static int compat_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { - if (optname == SO_ATTACH_FILTER) + if (optname == SO_ATTACH_FILTER || + optname == SO_ATTACH_REUSEPORT_CBPF) return do_set_attach_filter(sock, level, optname, optval, optlen); if (!COMPAT_USE_64BIT_TIME && diff --git a/net/core/Makefile b/net/core/Makefile index 0d35bba614a9..5e3c5fe87379 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o tso.o + sock_diag.o dev_ioctl.o tso.o sock_reuseport.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o diff --git a/net/core/dev.c b/net/core/dev.c index 1aa1261a5934..79f7b6fe9ce1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -82,6 +82,7 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> @@ -94,6 +95,7 @@ #include <linux/ethtool.h> #include <linux/notifier.h> #include <linux/skbuff.h> +#include <linux/bpf.h> #include <net/net_namespace.h> #include <net/sock.h> #include <linux/rtnetlink.h> @@ -187,7 +189,7 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); -static seqcount_t devnet_rename_seq; +static DECLARE_RWSEM(devnet_rename_sem); static inline void dev_base_seq_inc(struct net *net) { @@ -864,33 +866,28 @@ EXPORT_SYMBOL(dev_get_by_index); * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. - * - * The use of raw_seqcount_begin() and cond_resched() before - * retrying is required as we want to give the writers a chance - * to complete when CONFIG_PREEMPT is not set. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; - unsigned int seq; + int ret; -retry: - seq = raw_seqcount_begin(&devnet_rename_seq); + down_read(&devnet_rename_sem); rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { - rcu_read_unlock(); - return -ENODEV; + ret = -ENODEV; + goto out; } strcpy(name, dev->name); - rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) { - cond_resched(); - goto retry; - } - return 0; + ret = 0; +out: + rcu_read_unlock(); + up_read(&devnet_rename_sem); + return ret; } /** @@ -1155,10 +1152,10 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - write_seqcount_begin(&devnet_rename_seq); + down_write(&devnet_rename_sem); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); return 0; } @@ -1166,7 +1163,7 @@ int dev_change_name(struct net_device *dev, const char *newname) err = dev_get_valid_name(net, dev, newname); if (err < 0) { - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); return err; } @@ -1181,11 +1178,11 @@ rollback: if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); dev->name_assign_type = old_assign_type; - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); return ret; } - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); netdev_adjacent_rename_links(dev, oldname); @@ -1206,7 +1203,7 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - write_seqcount_begin(&devnet_rename_seq); + down_write(&devnet_rename_sem); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); dev->name_assign_type = old_assign_type; @@ -1794,19 +1791,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_orphan_frags(skb, GFP_ATOMIC) || - unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } + int ret = ____dev_forward_skb(dev, skb); - skb_scrub_packet(skb, true); - skb->priority = 0; - skb->protocol = eth_type_trans(skb, dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + if (likely(!ret)) { + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } - return 0; + return ret; } EXPORT_SYMBOL_GPL(__dev_forward_skb); @@ -3014,7 +3006,8 @@ static void skb_update_prio(struct sk_buff *skb) struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); if (!skb->priority && skb->sk && map) { - unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + unsigned int prioidx = + sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; @@ -3027,8 +3020,6 @@ static void skb_update_prio(struct sk_buff *skb) DEFINE_PER_CPU(int, xmit_recursion); EXPORT_SYMBOL(xmit_recursion); -#define RECURSION_LIMIT 10 - /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in @@ -3220,8 +3211,8 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) int cpu = smp_processor_id(); /* ok because BHs are off */ if (txq->xmit_lock_owner != cpu) { - - if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + if (unlikely(__this_cpu_read(xmit_recursion) > + XMIT_RECURSION_LIMIT)) goto recursion_alert; skb = validate_xmit_skb(skb, dev); @@ -4328,6 +4319,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->encap_mark = 0; NAPI_GRO_CB(skb)->recursion_counter = 0; + NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ @@ -4747,11 +4739,18 @@ EXPORT_SYMBOL(__napi_schedule); * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * - * Variant of __napi_schedule() assuming hard irqs are masked + * Variant of __napi_schedule() assuming hard irqs are masked. + * + * On PREEMPT_RT enabled kernels this maps to __napi_schedule() + * because the interrupt disabled assumption might not be true + * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { - ____napi_schedule(this_cpu_ptr(&softnet_data), n); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + else + __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); @@ -4876,13 +4875,14 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, pr_err_once("netif_napi_add() called with weight %d on device %s\n", weight, dev->name); napi->weight = weight; - list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; #ifdef CONFIG_NETPOLL spin_lock_init(&napi->poll_lock); napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); + set_bit(NAPI_STATE_NPSVC, &napi->state); + list_add_rcu(&napi->dev_list, &dev->napi_list); } EXPORT_SYMBOL(netif_napi_add); @@ -5820,7 +5820,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)) + bool (*type_check)(const struct net_device *dev)) { struct net_device *lower = NULL; struct list_head *iter; @@ -6162,7 +6162,8 @@ static int __dev_set_mtu(struct net_device *dev, int new_mtu) if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); - dev->mtu = new_mtu; + /* Pairs with all the lockless reads of dev->mtu in the stack */ + WRITE_ONCE(dev->mtu, new_mtu); return 0; } @@ -6327,6 +6328,38 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) EXPORT_SYMBOL(dev_change_proto_down); /** + * dev_change_xdp_fd - set or clear a bpf program for a device rx path + * @dev: device + * @fd: new program fd or negative value to clear + * + * Set or clear a bpf program for a device + */ +int dev_change_xdp_fd(struct net_device *dev, int fd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct bpf_prog *prog = NULL; + struct netdev_xdp xdp = {}; + int err; + + if (!ops->ndo_xdp) + return -EOPNOTSUPP; + if (fd >= 0) { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + xdp.command = XDP_SETUP_PROG; + xdp.prog = prog; + err = ops->ndo_xdp(dev, &xdp); + if (err < 0 && prog) + bpf_prog_put(prog); + + return err; +} +EXPORT_SYMBOL(dev_change_xdp_fd); + +/** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * @@ -6483,11 +6516,13 @@ static void netdev_sync_lower_features(struct net_device *upper, netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); lower->wanted_features &= ~feature; - netdev_update_features(lower); + __netdev_update_features(lower); if (unlikely(lower->features & feature)) netdev_WARN(upper, "failed to disable %pNF on %s!\n", &feature, lower->name); + else + netdev_features_change(lower); } } } @@ -6876,6 +6911,13 @@ int register_netdevice(struct net_device *dev) rcu_barrier(); dev->reg_state = NETREG_UNREGISTERED; + /* We should put the kobject that hold in + * netdev_unregister_kobject(), otherwise + * the net device cannot be freed when + * driver calls free_netdev(), because the + * kobject is being hold. + */ + kobject_put(&dev->dev.kobj); } /* * Prevent userspace races by waiting until the network @@ -7808,7 +7850,7 @@ static void __net_exit default_device_exit(struct net *net) continue; /* Leave virtual devices for the generic cleanup */ - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index a2270188b864..9bcc6fdade3e 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -159,6 +159,7 @@ static void sched_send_work(unsigned long _data) static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; + struct net_dm_drop_point *point; struct nlmsghdr *nlh; struct nlattr *nla; int i; @@ -177,11 +178,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location) nlh = (struct nlmsghdr *)dskb->data; nla = genlmsg_data(nlmsg_data(nlh)); msg = nla_data(nla); + point = msg->points; for (i = 0; i < msg->entries; i++) { - if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { - msg->points[i].count++; + if (!memcmp(&location, &point->pc, sizeof(void *))) { + point->count++; goto out; } + point++; } if (msg->entries == dm_hit_limit) goto out; @@ -190,8 +193,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location) */ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); - memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); - msg->points[msg->entries].count = 1; + memcpy(point->pc, &location, sizeof(void *)); + point->count = 1; msg->entries++; if (!timer_pending(&data->send_timer)) { diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 7e4e7deb2542..b0c4440e8514 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -613,6 +613,37 @@ void netdev_rss_key_fill(void *buffer, size_t len) } EXPORT_SYMBOL(netdev_rss_key_fill); +static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) +{ + u32 dev_size, current_max = 0; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + if (ret) + goto out; + + while (dev_size--) + current_max = max(current_max, indir[dev_size]); + + *max = current_max; + +out: + kfree(indir); + return ret; +} + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -709,6 +740,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (user_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(indir); @@ -868,6 +907,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(rss_config); @@ -1210,6 +1257,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels; + u32 max_rx_in_use = 0; if (!dev->ethtool_ops->set_channels) return -EOPNOTSUPP; @@ -1217,6 +1265,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; + /* ensure the new Rx count fits within the configured Rx flow + * indirection table settings */ + if (netif_is_rxfh_configured(dev) && + !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && + (channels.combined_count + channels.rx_count) <= max_rx_in_use) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index cb744a352167..dcd40a44b93d 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -638,7 +638,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh = nlmsg_data(nlh); frh->family = ops->family; - frh->table = rule->table; + frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) @@ -765,7 +765,7 @@ static void notify_rule_change(int event, struct fib_rule *rule, { struct net *net; struct sk_buff *skb; - int err = -ENOBUFS; + int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); diff --git a/net/core/filter.c b/net/core/filter.c index 3c5f51198c41..573af321ee93 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -26,6 +26,7 @@ #include <linux/mm.h> #include <linux/fcntl.h> #include <linux/socket.h> +#include <linux/sock_diag.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> @@ -50,6 +51,7 @@ #include <net/cls_cgroup.h> #include <net/dst_metadata.h> #include <net/dst.h> +#include <net/sock_reuseport.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -77,6 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; + err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); + if (err) + return err; + err = security_sock_rcv_skb(sk, skb); if (err) return err; @@ -84,7 +90,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); + struct sock *save_sk = skb->sk; + unsigned int pkt_len; + + skb->sk = sk; + pkt_len = bpf_prog_run_save_cb(filter->prog, skb); + skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); @@ -93,14 +104,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) { - return skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff(skb); } -static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -119,9 +129,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -144,11 +153,17 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_0(__get_raw_cpu_id) { return raw_smp_processor_id(); } +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = __get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -226,9 +241,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ @@ -348,12 +362,6 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * jump offsets, 2nd pass remapping: * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); - * - * User BPF's register A is mapped to our BPF register 6, user BPF - * register X is mapped to BPF register 7; frame pointer is always - * register 10; Context 'void *ctx' is stored in register 1, that is, - * for socket filters: ctx == 'struct sk_buff *', for seccomp: - * ctx == 'struct seccomp_data *'. */ static int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len) @@ -381,9 +389,22 @@ do_pass: new_insn = new_prog; fp = prog; - if (new_insn) - *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); - new_insn++; + /* Classic BPF related prologue emission. */ + if (new_insn) { + /* Classic BPF expects A and X to be reset first. These need + * to be guaranteed to be the first two instructions. + */ + *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + + /* All programs must keep CTX in callee saved BPF_REG_CTX. + * In eBPF case it's done by the compiler, here we need to + * do this ourself. Initial CTX is present in BPF_REG_ARG1. + */ + *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + } else { + new_insn += 3; + } for (i = 0; i < len; fp++, i++) { struct bpf_insn tmp_insns[6] = { }; @@ -491,14 +512,27 @@ do_pass: break; } - /* Convert JEQ into JNE when 'jump_true' is next insn. */ - if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { - insn->code = BPF_JMP | BPF_JNE | bpf_src; + /* Convert some jumps when 'jump_true' is next insn. */ + if (fp->jt == 0) { + switch (BPF_OP(fp->code)) { + case BPF_JEQ: + insn->code = BPF_JMP | BPF_JNE | bpf_src; + break; + case BPF_JGT: + insn->code = BPF_JMP | BPF_JLE | bpf_src; + break; + case BPF_JGE: + insn->code = BPF_JMP | BPF_JLT | bpf_src; + break; + default: + goto jmp_rest; + } + target = i + fp->jf + 1; BPF_EMIT_JMP; break; } - +jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; @@ -526,12 +560,14 @@ do_pass: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - /* RET_K, RET_A are remaped into 2 insns. */ + /* RET_K is remaped into 2 insns. RET_A case doesn't need an + * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. + */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: - *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? - BPF_K : BPF_X, BPF_REG_0, - BPF_REG_A, fp->k); + if (BPF_RVAL(fp->code) == BPF_K) + *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, + 0, fp->k); *insn = BPF_EXIT_INSN(); break; @@ -996,7 +1032,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - err = bpf_prog_select_runtime(fp); + fp = bpf_prog_select_runtime(fp, &err); if (err) goto out_err_free; @@ -1153,8 +1189,7 @@ void bpf_prog_destroy(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_destroy); -static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk, - bool locked) +static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) { struct sk_filter *fp, *old_fp; @@ -1170,45 +1205,61 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk, return -ENOMEM; } - old_fp = rcu_dereference_protected(sk->sk_filter, locked); + old_fp = rcu_dereference_protected(sk->sk_filter, + lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_filter, fp); + if (old_fp) sk_filter_uncharge(sk, old_fp); return 0; } -/** - * sk_attach_filter - attach a socket filter - * @fprog: the filter program - * @sk: the socket to use - * - * Attach the user's filter code. We first run some sanity checks on - * it to make sure it does not explode on us later. If an error - * occurs or there is insufficient memory for the filter a negative - * errno code is returned. On success the return is zero. - */ -int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, - bool locked) +static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) +{ + struct bpf_prog *old_prog; + int err; + + if (bpf_prog_size(prog->len) > sysctl_optmem_max) + return -ENOMEM; + + if (sk_unhashed(sk) && sk->sk_reuseport) { + err = reuseport_alloc(sk); + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + + old_prog = reuseport_attach_prog(sk, prog); + if (old_prog) + bpf_prog_destroy(old_prog); + + return 0; +} + +static +struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) - return -EPERM; + return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) - return -EINVAL; + return ERR_PTR(-EINVAL); prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!prog) - return -ENOMEM; + return ERR_PTR(-ENOMEM); if (copy_from_user(prog->insns, fprog->filter, fsize)) { __bpf_prog_free(prog); - return -EFAULT; + return ERR_PTR(-EFAULT); } prog->len = fprog->len; @@ -1216,17 +1267,34 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, err = bpf_prog_store_orig_filter(prog, fprog); if (err) { __bpf_prog_free(prog); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - prog = bpf_prepare_filter(prog, NULL); + return bpf_prepare_filter(prog, NULL); +} + +/** + * sk_attach_filter - attach a socket filter + * @fprog: the filter program + * @sk: the socket to use + * + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. + */ +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct bpf_prog *prog = __get_filter(fprog, sk); + int err; + if (IS_ERR(prog)) return PTR_ERR(prog); - err = __sk_attach_prog(prog, sk, locked); + err = __sk_attach_prog(prog, sk); if (err < 0) { __bpf_prog_release(prog); return err; @@ -1234,31 +1302,59 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, return 0; } -EXPORT_SYMBOL_GPL(__sk_attach_filter); +EXPORT_SYMBOL_GPL(sk_attach_filter); -int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct bpf_prog *prog = __get_filter(fprog, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __reuseport_attach_prog(prog, sk); + if (err < 0) { + __bpf_prog_release(prog); + return err; + } + + return 0; +} + +static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { - return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk)); + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return ERR_PTR(-EPERM); + + return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog; + struct bpf_prog *prog = __get_bpf(ufd, sk); int err; - if (sock_flag(sk, SOCK_FILTER_LOCKED)) - return -EPERM; - - prog = bpf_prog_get(ufd); if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { + err = __sk_attach_prog(prog, sk); + if (err < 0) { bpf_prog_put(prog); - return -EINVAL; + return err; } - err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk)); + return 0; +} + +int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog = __get_bpf(ufd, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __reuseport_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); return err; @@ -1267,49 +1363,74 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } -#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) +struct bpf_scratchpad { + union { + __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; + u8 buff[MAX_BPF_STACK]; + }; +}; + +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); + +static inline int __bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return skb_ensure_writable(skb, write_len); +} + +static inline int bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err = __bpf_try_make_writable(skb, write_len); + + bpf_compute_data_end(skb); + return err; +} + +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + +static inline void bpf_push_mac_rcsum(struct sk_buff *skb) +{ + if (skb_at_tc_ingress(skb)) + skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); +} -static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) +static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) +{ + if (skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); +} + +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - int offset = (int) r2; - void *from = (void *) (long) r3; - unsigned int len = (unsigned int) r4; - char buf[16]; void *ptr; - /* bpf verifier guarantees that: - * 'from' pointer points to bpf program stack - * 'len' bytes of it were initialized - * 'len' > 0 - * 'skb' is a valid pointer to 'struct sk_buff' - * - * so check for invalid 'offset' and too large 'len' - */ - if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) + return -EINVAL; + if (unlikely(offset > INT_MAX)) return -EFAULT; - if (unlikely(skb_try_make_writable(skb, offset + len))) + if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; - ptr = skb_header_pointer(skb, offset, len, buf); - if (unlikely(!ptr)) - return -EFAULT; - - if (BPF_RECOMPUTE_CSUM(flags)) - skb_postpull_rcsum(skb, ptr, len); + ptr = skb->data + offset; + if (flags & BPF_F_RECOMPUTE_CSUM) + __skb_postpull_rcsum(skb, ptr, len, offset); memcpy(ptr, from, len); - if (ptr == buf) - /* skb_store_bits cannot return -EFAULT here */ - skb_store_bits(skb, offset, ptr, len); + if (flags & BPF_F_RECOMPUTE_CSUM) + __skb_postpush_rcsum(skb, ptr, len, offset); + if (flags & BPF_F_INVALIDATE_HASH) + skb_clear_hash(skb); - if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); return 0; } -const struct bpf_func_proto bpf_skb_store_bytes_proto = { +static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1320,26 +1441,78 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; -#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) -#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, + void *, to, u32, len) +{ + void *ptr; + + if (unlikely(offset > INT_MAX)) + goto err_clear; + + ptr = skb_header_pointer(skb, offset, len, to); + if (unlikely(!ptr)) + goto err_clear; + if (ptr != to) + memcpy(to, ptr, len); + + return 0; +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_proto = { + .func = bpf_skb_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_RAW_STACK, + .arg4_type = ARG_CONST_STACK_SIZE, +}; + +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; -static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - int offset = (int) r2; - __sum16 sum, *ptr; + __sum16 *ptr; - if (unlikely((u32) offset > 0xffff)) + if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) + return -EINVAL; + if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; - - if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; - ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); - if (unlikely(!ptr)) - return -EFAULT; + ptr = (__sum16 *)(skb->data + offset); + switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; - switch (BPF_HEADER_FIELD_SIZE(flags)) { + csum_replace_by_diff(ptr, to); + break; case 2: csum_replace2(ptr, from, to); break; @@ -1350,14 +1523,10 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; } - if (ptr == &sum) - /* skb_store_bits guaranteed to not return -EFAULT here */ - skb_store_bits(skb, offset, ptr, sizeof(sum)); - return 0; } -const struct bpf_func_proto bpf_l3_csum_replace_proto = { +static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1368,23 +1537,32 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags); - int offset = (int) r2; - __sum16 sum, *ptr; + bool is_pseudo = flags & BPF_F_PSEUDO_HDR; + bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; + __sum16 *ptr; - if (unlikely((u32) offset > 0xffff)) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | + BPF_F_HDR_FIELD_MASK))) + return -EINVAL; + if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; - if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; - ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); - if (unlikely(!ptr)) - return -EFAULT; + ptr = (__sum16 *)(skb->data + offset); + if (is_mmzero && !*ptr) + return 0; - switch (BPF_HEADER_FIELD_SIZE(flags)) { + switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1395,14 +1573,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; } - if (ptr == &sum) - /* skb_store_bits guaranteed to not return -EFAULT here */ - skb_store_bits(skb, offset, ptr, sizeof(sum)); - + if (is_mmzero && !*ptr) + *ptr = CSUM_MANGLED_0; return 0; } -const struct bpf_func_proto bpf_l4_csum_replace_proto = { +static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1413,30 +1589,182 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1) +BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, + __be32 *, to, u32, to_size, __wsum, seed) +{ + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); + u32 diff_size = from_size + to_size; + int i, j = 0; + + /* This is quite flexible, some examples: + * + * from_size == 0, to_size > 0, seed := csum --> pushing data + * from_size > 0, to_size == 0, seed := csum --> pulling data + * from_size > 0, to_size > 0, seed := 0 --> diffing data + * + * Even for diffing, from_size and to_size don't need to be equal. + */ + if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || + diff_size > sizeof(sp->diff))) + return -EINVAL; + + for (i = 0; i < from_size / sizeof(__be32); i++, j++) + sp->diff[j] = ~from[i]; + for (i = 0; i < to_size / sizeof(__be32); i++, j++) + sp->diff[j] = to[i]; -static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) + return csum_partial(sp->diff, diff_size, seed); +} + +static const struct bpf_func_proto bpf_csum_diff_proto = { + .func = bpf_csum_diff, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) +{ + return dev_forward_skb(dev, skb); +} + +static inline int __bpf_rx_skb_no_mac(struct net_device *dev, + struct sk_buff *skb) +{ + int ret = ____dev_forward_skb(dev, skb); + + if (likely(!ret)) { + skb->dev = dev; + ret = netif_rx(skb); + } + + return ret; +} + +static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) +{ + int ret; + + if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + kfree_skb(skb); + return -ENETDOWN; + } + + skb->dev = dev; + + __this_cpu_inc(xmit_recursion); + ret = dev_queue_xmit(skb); + __this_cpu_dec(xmit_recursion); + + return ret; +} + +static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + /* skb->mac_len is not set on normal egress */ + unsigned int mlen = skb->network_header - skb->mac_header; + + __skb_pull(skb, mlen); + + /* At ingress, the mac header has already been pulled once. + * At egress, skb_pospull_rcsum has to be done in case that + * the skb is originated from ingress (i.e. a forwarded skb) + * to ensure that rcsum starts at net header. + */ + if (!skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + skb_pop_mac_header(skb); + skb_reset_mac_len(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + bpf_push_mac_rcsum(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: +#ifdef ARPHRD_RAWIP + case ARPHRD_RAWIP: +#endif + return __bpf_redirect_no_mac(skb, dev, flags); + default: + return __bpf_redirect_common(skb, dev, flags); + } +} + +BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; struct net_device *dev; + struct sk_buff *clone; + int ret; + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return -EINVAL; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; - skb2 = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb2)) + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - if (BPF_IS_REDIRECT_INGRESS(flags)) - return dev_forward_skb(dev, skb2); + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } - skb2->dev = dev; - skb_sender_cpu_clear(skb2); - return dev_queue_xmit(skb2); + return __bpf_redirect(clone, dev, flags); } -const struct bpf_func_proto bpf_clone_redirect_proto = { +static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1451,12 +1779,17 @@ struct redirect_info { }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); -static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) + +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + if (unlikely(flags & ~(BPF_F_INGRESS))) + return TC_ACT_SHOT; + ri->ifindex = ifindex; ri->flags = flags; + return TC_ACT_REDIRECT; } @@ -1472,15 +1805,10 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - if (BPF_IS_REDIRECT_INGRESS(ri->flags)) - return dev_forward_skb(dev, skb); - - skb->dev = dev; - skb_sender_cpu_clear(skb); - return dev_queue_xmit(skb); + return __bpf_redirect(skb, dev, ri->flags); } -const struct bpf_func_proto bpf_redirect_proto = { +static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1488,9 +1816,9 @@ const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { - return task_get_classid((struct sk_buff *) (unsigned long) r1); + return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { @@ -1500,16 +1828,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { -#ifdef CONFIG_IP_ROUTE_CLASSID - const struct dst_entry *dst; - - dst = skb_dst((struct sk_buff *) (unsigned long) r1); - if (dst) - return dst->tclassid; -#endif - return 0; + return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { @@ -1519,16 +1840,54 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 vlan_proto = (__force __be16) r2; + /* If skb_clear_hash() was called due to mangling, we can + * trigger SW recalculation here. Later access to hash + * can then use the inline skb->hash via context directly + * instead of calling this helper again. + */ + return skb_get_hash(skb); +} + +static const struct bpf_func_proto bpf_get_hash_recalc_proto = { + .func = bpf_get_hash_recalc, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation. + */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, + u16, vlan_tci) +{ + int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); - return skb_vlan_push(skb, vlan_proto, vlan_tci); + bpf_push_mac_rcsum(skb); + ret = skb_vlan_push(skb, vlan_proto, vlan_tci); + bpf_pull_mac_rcsum(skb); + + bpf_compute_data_end(skb); + return ret; } const struct bpf_func_proto bpf_skb_vlan_push_proto = { @@ -1541,11 +1900,16 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); -static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; + int ret; + + bpf_push_mac_rcsum(skb); + ret = skb_vlan_pop(skb); + bpf_pull_mac_rcsum(skb); - return skb_vlan_pop(skb); + bpf_compute_data_end(skb); + return ret; } const struct bpf_func_proto bpf_skb_vlan_pop_proto = { @@ -1556,59 +1920,487 @@ const struct bpf_func_proto bpf_skb_vlan_pop_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); +static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) +{ + /* Caller already did skb_cow() with len as headroom, + * so no need to do it here. + */ + skb_push(skb, len); + memmove(skb->data, skb->data + len, off); + memset(skb->data + off, 0, len); + + /* No skb_postpush_rcsum(skb, skb->data + off, len) + * needed here as it does not change the skb->csum + * result for checksum complete when summing over + * zeroed blocks. + */ + return 0; +} + +static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) +{ + /* skb_ensure_writable() is not needed here, as we're + * already working on an uncloned skb. + */ + if (unlikely(!pskb_may_pull(skb, off + len))) + return -ENOMEM; + + skb_postpull_rcsum(skb, skb->data + off, len); + memmove(skb->data + len, skb->data, off); + __skb_pull(skb, len); + + return 0; +} + +static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* There's no need for __skb_push()/__skb_pull() pair to + * get to the start of the mac header as we're guaranteed + * to always start from here under eBPF. + */ + ret = bpf_skb_generic_push(skb, off, len); + if (likely(!ret)) { + skb->mac_header -= len; + skb->network_header -= len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* Same here, __skb_push()/__skb_pull() pair not needed. */ + ret = bpf_skb_generic_pop(skb, off, len); + if (likely(!ret)) { + skb->mac_header += len; + skb->network_header += len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_proto_4_to_6(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + u32 off = skb->network_header - skb->mac_header; + int ret; + + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to + * be changed into SKB_GSO_TCPV6. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + } + + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + skb->protocol = htons(ETH_P_IPV6); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_6_to_4(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + u32 off = skb->network_header - skb->mac_header; + int ret; + + ret = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_pop(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to + * be changed into SKB_GSO_TCPV4. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { + skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + } + + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + skb->protocol = htons(ETH_P_IP); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) +{ + __be16 from_proto = skb->protocol; + + if (from_proto == htons(ETH_P_IP) && + to_proto == htons(ETH_P_IPV6)) + return bpf_skb_proto_4_to_6(skb); + + if (from_proto == htons(ETH_P_IPV6) && + to_proto == htons(ETH_P_IP)) + return bpf_skb_proto_6_to_4(skb); + + return -ENOTSUPP; +} + +BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, + u64, flags) +{ + int ret; + + if (unlikely(flags)) + return -EINVAL; + + /* General idea is that this helper does the basic groundwork + * needed for changing the protocol, and eBPF program fills the + * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() + * and other helpers, rather than passing a raw buffer here. + * + * The rationale is to keep this minimal and without a need to + * deal with raw packet data. F.e. even if we would pass buffers + * here, the program still needs to call the bpf_lX_csum_replace() + * helpers anyway. Plus, this way we keep also separation of + * concerns, since f.e. bpf_skb_store_bytes() should only take + * care of stores. + * + * Currently, additional options and extension header space are + * not supported, but flags register is reserved so we can adapt + * that. For offloads, we mark packet as dodgy, so that headers + * need to be verified first. + */ + ret = bpf_skb_proto_xlat(skb, proto); + bpf_compute_data_end(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_proto_proto = { + .func = bpf_skb_change_proto, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) +{ + /* We only allow a restricted subset to be changed for now. */ + if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || + !skb_pkt_type_ok(pkt_type))) + return -EINVAL; + + skb->pkt_type = pkt_type; + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_type_proto = { + .func = bpf_skb_change_type, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +static u32 __bpf_skb_min_len(const struct sk_buff *skb) +{ + u32 min_len = skb_network_offset(skb); + + if (skb_transport_header_was_set(skb)) + min_len = skb_transport_offset(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) + min_len = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + return min_len; +} + +#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC + +static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + unsigned int old_len = skb->len; + int ret; + + ret = __skb_grow_rcsum(skb, new_len); + if (!ret) + memset(skb->data + old_len, 0, new_len - old_len); + return ret; +} + +static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + return __skb_trim_rcsum(skb, new_len); +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + u32 max_len = BPF_SKB_MAX_LEN; + u32 min_len = __bpf_skb_min_len(skb); + int ret; + + if (unlikely(flags || new_len > max_len || new_len < min_len)) + return -EINVAL; + if (skb->encapsulation) + return -ENOTSUPP; + + /* The basic idea of this helper is that it's performing the + * needed work to either grow or trim an skb, and eBPF program + * rewrites the rest via helpers like bpf_skb_store_bytes(), + * bpf_lX_csum_replace() and others rather than passing a raw + * buffer here. This one is a slow path helper and intended + * for replies with control messages. + * + * Like in bpf_skb_change_proto(), we want to keep this rather + * minimal and without protocol specifics so that we are able + * to separate concerns as in bpf_skb_store_bytes() should only + * be the one responsible for writing buffers. + * + * It's really expected to be a slow path operation here for + * control message replies, so we're implicitly linearizing, + * uncloning and drop offloads from the skb by this. + */ + ret = __bpf_try_make_writable(skb, skb->len); + if (!ret) { + if (new_len > skb->len) + ret = bpf_skb_grow_rcsum(skb, new_len); + else if (new_len < skb->len) + ret = bpf_skb_trim_rcsum(skb, new_len); + if (!ret && skb_is_gso(skb)) + skb_gso_reset(skb); + } + + bpf_compute_data_end(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_tail_proto = { + .func = bpf_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { - if (func == bpf_skb_vlan_push) - return true; - if (func == bpf_skb_vlan_pop) - return true; - if (func == bpf_skb_store_bytes) - return true; - if (func == bpf_l3_csum_replace) - return true; - if (func == bpf_l4_csum_replace) + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_tail || + func == bpf_skb_pull_data || + func == bpf_clone_redirect || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace) return true; return false; } -static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, + unsigned long off, unsigned long len) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; - struct ip_tunnel_info *info = skb_tunnel_info(skb); + void *ptr = skb_header_pointer(skb, off, len, dst_buff); - if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) - return -EINVAL; - if (ip_tunnel_info_af(info) != AF_INET) + if (unlikely(!ptr)) + return len; + if (ptr != dst_buff) + memcpy(dst_buff, ptr, len); + + return 0; +} + +BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) +{ + u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; + if (unlikely(skb_size > skb->len)) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, + bpf_skb_copy); +} + +static const struct bpf_func_proto bpf_skb_event_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + +static unsigned short bpf_tunnel_key_af(u64 flags) +{ + return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; +} + +BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, + u32, size, u64, flags) +{ + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + u8 compat[sizeof(struct bpf_tunnel_key)]; + void *to_orig = to; + int err; + + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { + err = -EINVAL; + goto err_clear; + } + if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { + err = -EPROTO; + goto err_clear; + } + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + err = -EINVAL; + switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): + goto set_compat; + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + if (ip_tunnel_info_af(info) != AF_INET) + goto err_clear; +set_compat: + to = (struct bpf_tunnel_key *)compat; + break; + default: + goto err_clear; + } + } to->tunnel_id = be64_to_cpu(info->key.tun_id); - to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + to->tunnel_tos = info->key.tos; + to->tunnel_ttl = info->key.ttl; + + if (flags & BPF_F_TUNINFO_IPV6) { + memcpy(to->remote_ipv6, &info->key.u.ipv6.src, + sizeof(to->remote_ipv6)); + to->tunnel_label = be32_to_cpu(info->key.label); + } else { + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + } + + if (unlikely(size != sizeof(struct bpf_tunnel_key))) + memcpy(to_orig, to, size); return 0; +err_clear: + memset(to_orig, 0, size); + return err; } -const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_PTR_TO_RAW_STACK, .arg3_type = ARG_CONST_STACK_SIZE, .arg4_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) +{ + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + int err; + + if (unlikely(!info || + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { + err = -ENOENT; + goto err_clear; + } + if (unlikely(size < info->options_len)) { + err = -ENOMEM; + goto err_clear; + } + + ip_tunnel_info_opts_get(to, info); + if (size > info->options_len) + memset(to + info->options_len, 0, size - info->options_len); + + return info->options_len; +err_clear: + memset(to, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_RAW_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + static struct metadata_dst __percpu *md_dst; -static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, + const struct bpf_tunnel_key *, from, u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); + u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags)) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_DONT_FRAGMENT))) + return -EINVAL; + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + memcpy(compat, from, size); + memset(compat + size, 0, sizeof(compat) - size); + from = (const struct bpf_tunnel_key *) compat; + break; + default: + return -EINVAL; + } + } + if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || + from->tunnel_ext)) return -EINVAL; skb_dst_drop(skb); @@ -1617,14 +2409,31 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY; + + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM; + if (flags & BPF_F_DONT_FRAGMENT) + info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + info->key.tun_id = cpu_to_be64(from->tunnel_id); - info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + info->key.tos = from->tunnel_tos; + info->key.ttl = from->tunnel_ttl; + + if (flags & BPF_F_TUNINFO_IPV6) { + info->mode |= IP_TUNNEL_INFO_IPV6; + memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, + sizeof(from->remote_ipv6)); + info->key.label = cpu_to_be32(from->tunnel_label) & + IPV6_FLOWLABEL_MASK; + } else { + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; + } return 0; } -const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1634,19 +2443,145 @@ const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, + const u8 *, from, u32, size) +{ + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > IP_TUNNEL_OPTS_MAX)) + return -ENOMEM; + + ip_tunnel_info_opts_set(info, from, size); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* race is not possible, since it's called from - * verifier that is holding verifier mutex + /* Race is not possible, since it's called from verifier + * that is holding verifier mutex. */ - md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + GFP_KERNEL); if (!md_dst) return NULL; } - return &bpf_skb_set_tunnel_key_proto; + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } +} + +BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, + u32, idx) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + struct sock *sk; + + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) + return -ENOENT; + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return sk_under_cgroup_hierarchy(sk, cgrp); +} + +static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { + .func = bpf_skb_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, + unsigned long off, unsigned long len) +{ + memcpy(dst_buff, src_buff + off, len); + return 0; +} + +BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) +{ + u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, + bpf_xdp_copy); +} + +static const struct bpf_func_proto bpf_xdp_event_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + +BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) +{ + return skb->sk ? sock_gen_cookie(skb->sk) : 0; +} + +static const struct bpf_func_proto bpf_get_socket_cookie_proto = { + .func = bpf_get_socket_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) +{ + struct sock *sk = sk_to_full_sk(skb->sk); + kuid_t kuid; + + if (!sk || !sk_fullsock(sk)) + return overflowuid; + kuid = sock_net_uid(sock_net(sk), sk); + return from_kuid_munged(sock_net(sk)->user_ns, kuid); } +static const struct bpf_func_proto bpf_get_socket_uid_proto = { + .func = bpf_get_socket_uid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1660,7 +2595,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: - return &bpf_get_smp_processor_id_proto; + return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: @@ -1668,6 +2603,12 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; default: return NULL; } @@ -1679,6 +2620,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -1691,14 +2640,58 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; + case BPF_FUNC_skb_change_proto: + return &bpf_skb_change_proto_proto; + case BPF_FUNC_skb_change_type: + return &bpf_skb_change_type_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: - return bpf_get_skb_set_tunnel_key_proto(); + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +xdp_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_xdp_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; default: return sk_filter_func_proto(func_id); } @@ -1706,31 +2699,32 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) static bool __is_valid_access(int off, int size, enum bpf_access_type type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct __sk_buff)) return false; - - /* disallow misaligned access */ + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - - /* all __sk_buff fields are __u32 */ - if (size != 4) + if (size != sizeof(__u32)) return false; return true; } static bool sk_filter_is_valid_access(int off, int size, - enum bpf_access_type type) + enum bpf_access_type type, + enum bpf_reg_type *reg_type) { - if (off == offsetof(struct __sk_buff, tc_classid)) + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + case offsetof(struct __sk_buff, data): + case offsetof(struct __sk_buff, data_end): return false; + } if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): break; default: return false; @@ -1740,31 +2734,140 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } -static bool tc_cls_act_is_valid_access(int off, int size, - enum bpf_access_type type) +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + break; + default: + return false; + } + } + + if (off < 0 || off + size > sizeof(struct bpf_sock)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + + return true; +} + +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) { - if (off == offsetof(struct __sk_buff, tc_classid)) - return type == BPF_WRITE ? true : false; + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_EXIT_INSN(); + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +static bool tc_cls_act_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, tc_index): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): + case offsetof(struct __sk_buff, tc_classid): break; default: return false; } } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + return __is_valid_access(off, size, type); } -static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, - struct bpf_insn *insn_buf, - struct bpf_prog *prog) +static bool __is_valid_xdp_access(int off, int size, + enum bpf_access_type type) +{ + if (off < 0 || off >= sizeof(struct xdp_md)) + return false; + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +static bool xdp_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct xdp_md, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct xdp_md, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_xdp_access(off, size, type); +} + +void bpf_warn_invalid_xdp_action(u32 act) +{ + WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); +} +EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); + +static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -1811,7 +2914,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), dst_reg, src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); @@ -1852,7 +2955,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, dst_reg, src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); prog->cb_access = 1; @@ -1869,8 +2972,24 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ctx_off -= offsetof(struct __sk_buff, tc_classid); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); - WARN_ON(type != BPF_WRITE); - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + else + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + break; + + case offsetof(struct __sk_buff, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + dst_reg, src_reg, + offsetof(struct sk_buff, data)); + break; + + case offsetof(struct __sk_buff, data_end): + ctx_off -= offsetof(struct __sk_buff, data_end); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, + ctx_off); break; case offsetof(struct __sk_buff, tc_index): @@ -1896,31 +3015,137 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, + int dst_reg, int src_reg, + int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + } + + return insn - insn_buf; +} + +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + dst_reg, src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + offsetof(struct net_device, ifindex)); + break; + default: + return sk_filter_convert_ctx_access(type, dst_reg, src_reg, + ctx_off, insn_buf, prog); + } + + return insn - insn_buf; +} + +static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct xdp_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), + dst_reg, src_reg, + offsetof(struct xdp_buff, data)); + break; + case offsetof(struct xdp_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), + dst_reg, src_reg, + offsetof(struct xdp_buff, data_end)); + break; + } + + return insn - insn_buf; +} + static const struct bpf_verifier_ops sk_filter_ops = { - .get_func_proto = sk_filter_func_proto, - .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { - .get_func_proto = tc_cls_act_func_proto, - .is_valid_access = tc_cls_act_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + +static const struct bpf_verifier_ops xdp_ops = { + .get_func_proto = xdp_func_proto, + .is_valid_access = xdp_is_valid_access, + .convert_ctx_access = xdp_convert_ctx_access, +}; + +static const struct bpf_verifier_ops cg_skb_ops = { + .get_func_proto = cg_skb_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops cg_sock_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = sock_filter_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { - .ops = &sk_filter_ops, - .type = BPF_PROG_TYPE_SOCKET_FILTER, + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static struct bpf_prog_type_list sched_cls_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_CLS, + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_CLS, }; static struct bpf_prog_type_list sched_act_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_ACT, + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_ACT, +}; + +static struct bpf_prog_type_list xdp_type __read_mostly = { + .ops = &xdp_ops, + .type = BPF_PROG_TYPE_XDP, +}; + +static struct bpf_prog_type_list cg_skb_type __read_mostly = { + .ops = &cg_skb_ops, + .type = BPF_PROG_TYPE_CGROUP_SKB, +}; + +static struct bpf_prog_type_list cg_sock_type __read_mostly = { + .ops = &cg_sock_ops, + .type = BPF_PROG_TYPE_CGROUP_SOCK }; static int __init register_sk_filter_ops(void) @@ -1928,12 +3153,15 @@ static int __init register_sk_filter_ops(void) bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); + bpf_register_prog_type(&xdp_type); + bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&cg_sock_type); return 0; } late_initcall(register_sk_filter_ops); -int __sk_detach_filter(struct sock *sk, bool locked) +int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; struct sk_filter *filter; @@ -1941,7 +3169,8 @@ int __sk_detach_filter(struct sock *sk, bool locked) if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; - filter = rcu_dereference_protected(sk->sk_filter, locked); + filter = rcu_dereference_protected(sk->sk_filter, + lockdep_sock_is_held(sk)); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); @@ -1950,12 +3179,7 @@ int __sk_detach_filter(struct sock *sk, bool locked) return ret; } -EXPORT_SYMBOL_GPL(__sk_detach_filter); - -int sk_detach_filter(struct sock *sk) -{ - return __sk_detach_filter(sk, sock_owned_by_user(sk)); -} +EXPORT_SYMBOL_GPL(sk_detach_filter); int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) @@ -1966,7 +3190,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (!filter) goto out; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b4f2c30e8313..9c31d7cde80b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -180,15 +180,16 @@ ip: ip_proto = iph->protocol; - if (!dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS)) - break; + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bfd16b515ab9..31393e30b6b4 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -18,6 +18,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> +#include <linux/kmemleak.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> @@ -325,12 +326,14 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; - if (size <= PAGE_SIZE) + if (size <= PAGE_SIZE) { buckets = kzalloc(size, GFP_ATOMIC); - else + } else { buckets = (struct neighbour __rcu **) __get_free_pages(GFP_ATOMIC | __GFP_ZERO, get_order(size)); + kmemleak_alloc(buckets, size, 1, GFP_ATOMIC); + } if (!buckets) { kfree(ret); return NULL; @@ -350,10 +353,12 @@ static void neigh_hash_free_rcu(struct rcu_head *head) size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); struct neighbour __rcu **buckets = nht->hash_buckets; - if (size <= PAGE_SIZE) + if (size <= PAGE_SIZE) { kfree(buckets); - else + } else { + kmemleak_free(buckets); free_pages((unsigned long)buckets, get_order(size)); + } kfree(nht); } @@ -592,7 +597,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, ASSERT_RTNL(); - n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; @@ -1053,7 +1058,7 @@ static void neigh_update_hhs(struct neighbour *neigh) if (update) { hh = &neigh->hh; - if (hh->hh_len) { + if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); @@ -1229,7 +1234,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, * we can reinject the packet there. */ n2 = NULL; - if (dst) { + if (dst && dst->obsolete != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; @@ -1318,7 +1323,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) struct net_device *dev = neigh->dev; unsigned int seq; - if (dev->header_ops->cache && !neigh->hh.hh_len) + if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { @@ -1832,8 +1837,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, goto nla_put_failure; { unsigned long now = jiffies; - unsigned int flush_delta = now - tbl->last_flush; - unsigned int rand_delta = now - tbl->last_rand; + long flush_delta = now - tbl->last_flush; + long rand_delta = now - tbl->last_rand; struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, @@ -2793,6 +2798,7 @@ static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } + (*pos)++; return NULL; } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 77969b71a50a..d6161fba15c3 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -208,12 +208,23 @@ static const struct file_operations softnet_seq_fops = { .release = seq_release, }; -static void *ptype_get_idx(loff_t pos) +static void *ptype_get_idx(struct seq_file *seq, loff_t pos) { + struct list_head *ptype_list = NULL; struct packet_type *pt = NULL; + struct net_device *dev; loff_t i = 0; int t; + for_each_netdev_rcu(seq_file_net(seq), dev) { + ptype_list = &dev->ptype_all; + list_for_each_entry_rcu(pt, ptype_list, list) { + if (i == pos) + return pt; + ++i; + } + } + list_for_each_entry_rcu(pt, &ptype_all, list) { if (i == pos) return pt; @@ -234,22 +245,40 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); - return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; + return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net_device *dev; struct packet_type *pt; struct list_head *nxt; int hash; ++*pos; if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); + return ptype_get_idx(seq, 0); pt = v; nxt = pt->list.next; + if (pt->dev) { + if (nxt != &pt->dev->ptype_all) + goto found; + + dev = pt->dev; + for_each_netdev_continue_rcu(seq_file_net(seq), dev) { + if (!list_empty(&dev->ptype_all)) { + nxt = dev->ptype_all.next; + goto found; + } + } + + nxt = ptype_all.next; + goto ptype_all; + } + if (pt->type == htons(ETH_P_ALL)) { +ptype_all: if (nxt != &ptype_all) goto found; hash = 0; @@ -278,7 +307,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && + (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 579d351f6ddd..c976fd132c3b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -999,7 +999,7 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, trans_timeout = queue->trans_timeout; spin_unlock_irq(&queue->_xmit_lock); - return sprintf(buf, "%lu", trans_timeout); + return sprintf(buf, fmt_ulong, trans_timeout); } #ifdef CONFIG_XPS diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 087ce1598b74..3283b97cff8d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -130,8 +130,10 @@ static void ops_exit_list(const struct pernet_operations *ops, { struct net *net; if (ops->exit) { - list_for_each_entry(net, net_exit_list, exit_list) + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); + cond_resched(); + } } if (ops->exit_batch) ops->exit_batch(net_exit_list); @@ -778,7 +780,8 @@ static int __init net_ns_init(void) mutex_unlock(&net_mutex); - register_pernet_subsys(&net_ns_ops); + if (register_pernet_subsys(&net_ns_ops)) + panic("Could not register network namespace subsystems"); rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, @@ -1018,11 +1021,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct user_namespace *netns_owner(struct ns_common *ns) +{ + return to_net_ns(ns)->user_ns; +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .owner = netns_owner, }; #endif diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index d9ee8d08a3a6..0260c84ed83c 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -61,9 +61,12 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) int err; struct socket *sock = sock_from_file(file, &err); - if (sock) - sock->sk->sk_classid = (u32)(unsigned long)v; - + if (sock) { + spin_lock(&cgroup_sk_update_lock); + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, + (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } @@ -100,6 +103,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, { struct cgroup_cls_state *cs = css_cls_state(css); + cgroup_sk_alloc_disable(); + cs->classid = (u32)value; update_classid(css, (void *)(unsigned long)cs->classid); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 8f445f6e8328..8d612cefb5f3 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/if_vlan.h> +#include <net/dsa.h> #include <net/tcp.h> #include <net/udp.h> #include <net/addrconf.h> @@ -178,7 +179,7 @@ static void poll_napi(struct net_device *dev) { struct napi_struct *napi; - list_for_each_entry(napi, &dev->napi_list, dev_list) { + list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { poll_one_napi(napi); @@ -661,15 +662,15 @@ EXPORT_SYMBOL_GPL(__netpoll_setup); int netpoll_setup(struct netpoll *np) { - struct net_device *ndev = NULL; + struct net_device *ndev = NULL, *dev = NULL; + struct net *net = current->nsproxy->net_ns; struct in_device *in_dev; int err; rtnl_lock(); - if (np->dev_name[0]) { - struct net *net = current->nsproxy->net_ns; + if (np->dev_name[0]) ndev = __dev_get_by_name(net, np->dev_name); - } + if (!ndev) { np_err(np, "%s doesn't exist, aborting\n", np->dev_name); err = -ENODEV; @@ -677,6 +678,19 @@ int netpoll_setup(struct netpoll *np) } dev_hold(ndev); + /* bring up DSA management network devices up first */ + for_each_netdev(net, dev) { + if (!netdev_uses_dsa(dev)) + continue; + + err = dev_change_flags(dev, dev->flags | IFF_UP); + if (err < 0) { + np_err(np, "%s failed to open %s\n", + np->dev_name, dev->name); + goto put; + } + } + if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); err = -EBUSY; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 40fd09fe06ae..3c1d9a1a5f7d 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -27,6 +27,12 @@ #include <linux/fdtable.h> +/* + * netprio allocates per-net_device priomap array which is indexed by + * css->id. Limiting css ID to 16bits doesn't lose anything. + */ +#define NETPRIO_ID_MAX USHRT_MAX + #define PRIOMAP_MIN_SZ 128 /* @@ -144,6 +150,9 @@ static int cgrp_css_online(struct cgroup_subsys_state *css) struct net_device *dev; int ret = 0; + if (css->id > NETPRIO_ID_MAX) + return -ENOSPC; + if (!parent_css) return 0; @@ -200,6 +209,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; + cgroup_sk_alloc_disable(); + rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -213,8 +224,12 @@ static int update_netprio(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); - if (sock) - sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; + if (sock) { + spin_lock(&cgroup_sk_update_lock); + sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, + (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } @@ -223,6 +238,8 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct task_struct *p; struct cgroup_subsys_state *css; + cgroup_sk_alloc_disable(); + cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->cgroup->id; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 4ea957c1e7ee..5d0759e2102e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3519,7 +3519,7 @@ static int pktgen_thread_worker(void *arg) struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - BUG_ON(smp_processor_id() != cpu); + WARN_ON(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); complete(&t->start_done); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a9da58204afa..7d6fe9ba9a24 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -253,6 +253,7 @@ int rtnl_unregister(int protocol, int msgtype) rtnl_msg_handlers[protocol][msgindex].doit = NULL; rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; + rtnl_msg_handlers[protocol][msgindex].calcit = NULL; return 0; } @@ -2104,7 +2105,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, 0U); + __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags)); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; __dev_notify_flags(dev, old_flags, ~0U); @@ -3239,6 +3240,10 @@ static int rtnl_bridge_notify(struct net_device *dev) if (err < 0) goto errout; + /* Notification info is only filled for bridge ports, not the bridge + * device itself. Therefore, a zero notification length is valid and + * should not result in an error. + */ if (!skb->len) goto errout; diff --git a/net/core/scm.c b/net/core/scm.c index dce0acb929f1..2696aefdc148 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -295,8 +295,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) /* Bump the usage count and install the file. */ sock = sock_from_file(fp[i], &err); if (sock) { - sock_update_netprioidx(sock->sk); - sock_update_classid(sock->sk); + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); } fd_install(new_fd, get_file(fp[i])); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 47520c651632..15a7a2667e1e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -429,7 +429,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (IS_ENABLED(CONFIG_FORCE_ALLOC_FROM_DMA_ZONE)) gfp_mask |= GFP_DMA; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) @@ -506,13 +510,17 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct page_frag_cache *nc; struct sk_buff *skb; void *data; len += NET_SKB_PAD + NET_IP_ALIGN; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) @@ -520,6 +528,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, goto skb_success; } + nc = this_cpu_ptr(&napi_alloc_cache); len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -1542,6 +1551,12 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) skb->csum = csum_block_sub(skb->csum, skb_checksum(skb, len, delta, 0), len); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; + int offset = skb_checksum_start_offset(skb) + skb->csum_offset; + + if (offset + sizeof(__sum16) > hdlen) + return -EINVAL; } return __pskb_trim(skb, len); } @@ -2254,8 +2269,11 @@ skb_zerocopy_headlen(const struct sk_buff *from) if (!from->head_frag || skb_headlen(from) < L1_CACHE_BYTES || - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { hlen = skb_headlen(from); + if (!hlen) + hlen = from->len; + } if (skb_has_frag_list(from)) hlen = from->len; @@ -2639,7 +2657,19 @@ EXPORT_SYMBOL(skb_split); */ static int skb_prepare_for_shift(struct sk_buff *skb) { - return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + int ret = 0; + + if (skb_cloned(skb)) { + /* Save and restore truesize: pskb_expand_head() may reallocate + * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we + * cannot change truesize at this point. + */ + unsigned int save_truesize = skb->truesize; + + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + skb->truesize = save_truesize; + } + return ret; } /** @@ -3075,6 +3105,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int pos; int dummy; + if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && + (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { + /* gso_size is untrusted, and we have a frag_list with a linear + * non head_frag head. + * + * (we assume checking the first list_skb member suffices; + * i.e if either of the list_skb members have non head_frag + * head, then the first one has too). + * + * If head_skb's headlen does not fit requested gso_size, it + * means that the frag_list members do NOT terminate on exact + * gso_size boundaries. Hence we cannot perform skb_frag_t page + * sharing. Therefore we must fallback to copying the frag_list + * skbs; we do so by disabling SG. + */ + if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) + features &= ~NETIF_F_SG; + } + __skb_push(head_skb, doffset); proto = skb_network_protocol(head_skb, &dummy); if (unlikely(!proto)) @@ -3092,9 +3141,13 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int hsize; int size; - len = head_skb->len - offset; - if (len > mss) - len = mss; + if (unlikely(mss == GSO_BY_FRAGS)) { + len = list_skb->len; + } else { + len = head_skb->len - offset; + if (len > mss) + len = mss; + } hsize = skb_headlen(head_skb) - offset; if (hsize < 0) @@ -4396,8 +4449,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb) skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto err_free; - - if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) + /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) goto err_free; vhdr = (struct vlan_hdr *)skb->data; @@ -4477,9 +4530,8 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (unlikely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4487,9 +4539,8 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (likely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; vlan_proto = skb->protocol; diff --git a/net/core/sock.c b/net/core/sock.c index 14dcc631f922..0ac6e6d306f7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -134,6 +134,7 @@ #include <linux/sock_diag.h> #include <linux/filter.h> +#include <net/sock_reuseport.h> #include <trace/events/sock.h> @@ -484,11 +485,12 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_rcv_skb); -int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) +int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, + const int nested, unsigned int trim_cap) { int rc = NET_RX_SUCCESS; - if (sk_filter(sk, skb)) + if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; @@ -524,7 +526,7 @@ discard_and_relse: kfree_skb(skb); goto out; } -EXPORT_SYMBOL(sk_receive_skb); +EXPORT_SYMBOL(__sk_receive_skb); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { @@ -933,6 +935,32 @@ set_rcvbuf: } break; + case SO_ATTACH_REUSEPORT_CBPF: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_reuseport_attach_filter(&fprog, sk); + } + break; + + case SO_ATTACH_REUSEPORT_EBPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_reuseport_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -1013,7 +1041,6 @@ set_rcvbuf: } EXPORT_SYMBOL(sock_setsockopt); - static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) { @@ -1034,6 +1061,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, union { int val; + u64 val64; struct linger ling; struct timeval tm; } v; @@ -1173,7 +1201,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); + + spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + spin_unlock(&sk->sk_peer_lock); + if (copy_to_user(optval, &peercred, len)) return -EFAULT; goto lenout; @@ -1264,6 +1296,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_incoming_cpu; break; + + case SO_COOKIE: + lv = sizeof(u64); + if (len < lv) + return -EINVAL; + v.val64 = sock_gen_cookie(sk); + break; default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -1385,6 +1424,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) owner = prot->owner; slab = prot->slab; + cgroup_sk_free(&sk->sk_cgrp_data); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); @@ -1393,17 +1433,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) -void sock_update_netprioidx(struct sock *sk) -{ - if (in_interrupt()) - return; - - sk->sk_cgrp_prioidx = task_netprioidx(current); -} -EXPORT_SYMBOL_GPL(sock_update_netprioidx); -#endif - /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace @@ -1432,8 +1461,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk); - sock_update_netprioidx(sk); + cgroup_sk_alloc(&sk->sk_cgrp_data); + sock_update_classid(&sk->sk_cgrp_data); + sock_update_netprioidx(&sk->sk_cgrp_data); + sk_tx_queue_clear(sk); } return sk; @@ -1457,6 +1488,8 @@ static void __sk_destruct(struct rcu_head *head) sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); @@ -1469,9 +1502,10 @@ static void __sk_destruct(struct rcu_head *head) sk->sk_frag.page = NULL; } - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ + put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); + if (likely(sk->sk_net_refcnt)) put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); @@ -1561,6 +1595,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; sock_reset_flag(newsk, SOCK_DONE); + cgroup_sk_clone(&newsk->sk_cgrp_data); + skb_queue_head_init(&newsk->sk_error_queue); filter = rcu_dereference_protected(newsk->sk_filter, 1); @@ -1586,12 +1622,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk = NULL; goto out; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + + cgroup_sk_alloc(&newsk->sk_cgrp_data); + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -1612,6 +1652,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); + sk_tx_queue_clear(newsk); newsk->sk_wq = NULL; sk_update_clone(sk, newsk); @@ -2134,7 +2175,7 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) } if (sk_has_memory_pressure(sk)) { - int alloc; + u64 alloc; if (!sk_under_memory_pressure(sk)) return 1; @@ -2284,6 +2325,27 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct * } EXPORT_SYMBOL(sock_no_mmap); +/* + * When a file is received (via SCM_RIGHTS, etc), we must bump the + * various sock-based usage counts. + */ +void __receive_sock(struct file *file) +{ + struct socket *sock; + int error; + + /* + * The resulting value of "error" is ignored here since we only + * need to take action when the file is a socket and testing + * "sock" for NULL is sufficient. + */ + sock = sock_from_file(file, &error); + if (sock) { + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); + } +} + ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { ssize_t res; @@ -2433,6 +2495,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; + spin_lock_init(&sk->sk_peer_lock); + sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 9653798da293..e25c72918b96 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -19,7 +19,7 @@ static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); static struct workqueue_struct *broadcast_wq; -static u64 sock_gen_cookie(struct sock *sk) +u64 sock_gen_cookie(struct sock *sk) { while (1) { u64 res = atomic64_read(&sk->sk_cookie); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c new file mode 100644 index 000000000000..2ba3ae7720a8 --- /dev/null +++ b/net/core/sock_reuseport.c @@ -0,0 +1,269 @@ +/* + * To speed up listener socket lookup, create an array to store all sockets + * listening on the same port. This allows a decision to be made after finding + * the first socket. An optional BPF program can also be configured for + * selecting the socket index from the array of available sockets. + */ + +#include <net/sock_reuseport.h> +#include <linux/bpf.h> +#include <linux/rcupdate.h> + +#define INIT_SOCKS 128 + +static DEFINE_SPINLOCK(reuseport_lock); + +static struct sock_reuseport *__reuseport_alloc(u16 max_socks) +{ + size_t size = sizeof(struct sock_reuseport) + + sizeof(struct sock *) * max_socks; + struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); + + if (!reuse) + return NULL; + + reuse->max_socks = max_socks; + + RCU_INIT_POINTER(reuse->prog, NULL); + return reuse; +} + +int reuseport_alloc(struct sock *sk) +{ + struct sock_reuseport *reuse; + + /* bh lock used since this function call may precede hlist lock in + * soft irq of receive path or setsockopt from process context + */ + spin_lock_bh(&reuseport_lock); + + /* Allocation attempts can occur concurrently via the setsockopt path + * and the bind/hash path. Nothing to do when we lose the race. + */ + if (rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock))) + goto out; + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + + reuse->socks[0] = sk; + reuse->num_socks = 1; + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + +out: + spin_unlock_bh(&reuseport_lock); + + return 0; +} +EXPORT_SYMBOL(reuseport_alloc); + +static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) +{ + struct sock_reuseport *more_reuse; + u32 more_socks_size, i; + + more_socks_size = reuse->max_socks * 2U; + if (more_socks_size > U16_MAX) + return NULL; + + more_reuse = __reuseport_alloc(more_socks_size); + if (!more_reuse) + return NULL; + + more_reuse->max_socks = more_socks_size; + more_reuse->num_socks = reuse->num_socks; + more_reuse->prog = reuse->prog; + + memcpy(more_reuse->socks, reuse->socks, + reuse->num_socks * sizeof(struct sock *)); + + for (i = 0; i < reuse->num_socks; ++i) + rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, + more_reuse); + + /* Note: we use kfree_rcu here instead of reuseport_free_rcu so + * that reuse and more_reuse can temporarily share a reference + * to prog. + */ + kfree_rcu(reuse, rcu); + return more_reuse; +} + +static void reuseport_free_rcu(struct rcu_head *head) +{ + struct sock_reuseport *reuse; + + reuse = container_of(head, struct sock_reuseport, rcu); + if (reuse->prog) + bpf_prog_destroy(reuse->prog); + kfree(reuse); +} + +/** + * reuseport_add_sock - Add a socket to the reuseport group of another. + * @sk: New socket to add to the group. + * @sk2: Socket belonging to the existing reuseport group. + * May return ENOMEM and not add socket to group under memory pressure. + */ +int reuseport_add_sock(struct sock *sk, struct sock *sk2) +{ + struct sock_reuseport *old_reuse, *reuse; + + if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { + int err = reuseport_alloc(sk2); + + if (err) + return err; + } + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_socks != 1) { + spin_unlock_bh(&reuseport_lock); + return -EBUSY; + } + + if (reuse->num_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + } + + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_select_sock() */ + smp_wmb(); + reuse->num_socks++; + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + spin_unlock_bh(&reuseport_lock); + + if (old_reuse) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + return 0; +} +EXPORT_SYMBOL(reuseport_add_sock); + +void reuseport_detach_sock(struct sock *sk) +{ + struct sock_reuseport *reuse; + int i; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); + + for (i = 0; i < reuse->num_socks; i++) { + if (reuse->socks[i] == sk) { + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + if (reuse->num_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); + break; + } + } + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_detach_sock); + +static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) +{ + struct sk_buff *nskb = NULL; + u32 index; + + if (skb_shared(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return NULL; + skb = nskb; + } + + /* temporarily advance data past protocol header */ + if (!pskb_pull(skb, hdr_len)) { + kfree_skb(nskb); + return NULL; + } + index = bpf_prog_run_save_cb(prog, skb); + __skb_push(skb, hdr_len); + + consume_skb(nskb); + + if (index >= socks) + return NULL; + + return reuse->socks[index]; +} + +/** + * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. + * @sk: First socket in the group. + * @hash: When no BPF filter is available, use this hash to select. + * @skb: skb to run through BPF filter. + * @hdr_len: BPF filter expects skb data pointer at payload data. If + * the skb does not yet point at the payload, this parameter represents + * how far the pointer needs to advance to reach the payload. + * Returns a socket that should receive the packet (or NULL on error). + */ +struct sock *reuseport_select_sock(struct sock *sk, + u32 hash, + struct sk_buff *skb, + int hdr_len) +{ + struct sock_reuseport *reuse; + struct bpf_prog *prog; + struct sock *sk2 = NULL; + u16 socks; + + rcu_read_lock(); + reuse = rcu_dereference(sk->sk_reuseport_cb); + + /* if memory allocation failed or add call is not yet complete */ + if (!reuse) + goto out; + + prog = rcu_dereference(reuse->prog); + socks = READ_ONCE(reuse->num_socks); + if (likely(socks)) { + /* paired with smp_wmb() in reuseport_add_sock() */ + smp_rmb(); + + if (prog && skb) + sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + else + sk2 = reuse->socks[reciprocal_scale(hash, socks)]; + } + +out: + rcu_read_unlock(); + return sk2; +} +EXPORT_SYMBOL(reuseport_select_sock); + +struct bpf_prog * +reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + old_prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(reuse->prog, prog); + spin_unlock_bh(&reuseport_lock); + + return old_prog; +} +EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/core/stream.c b/net/core/stream.c index 3089b014bb53..2c50c71cb806 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -194,9 +194,6 @@ void sk_stream_kill_queues(struct sock *sk) /* First the read buffer. */ __skb_queue_purge(&sk->sk_receive_queue); - /* Next, the error queue. */ - __skb_queue_purge(&sk->sk_error_queue); - /* Next, the write queue. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 32898247d8bf..f62e177267c3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,9 +24,12 @@ static int zero = 0; static int one = 1; +static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; +static long long_one __maybe_unused = 1; +static long long_max __maybe_unused = LONG_MAX; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -231,6 +234,52 @@ static int proc_do_rss_key(struct ctl_table *table, int write, return proc_dostring(&fake_table, write, buffer, lenp, ppos); } +#ifdef CONFIG_BPF_JIT +static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, jit_enable = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &jit_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + *(int *)table->data = jit_enable; + if (jit_enable == 2) + pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); + } + return ret; +} + +# ifdef CONFIG_HAVE_EBPF_JIT +static int +proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +# endif /* CONFIG_HAVE_EBPF_JIT */ + +static int +proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -292,13 +341,34 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - .proc_handler = proc_dointvec -#else - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_bpf_enable, +# ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = &one, .extra2 = &one, -#endif +# else + .extra1 = &zero, + .extra2 = &two, +# endif + }, +# ifdef CONFIG_HAVE_EBPF_JIT + { + .procname = "bpf_jit_harden", + .data = &bpf_jit_harden, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &two, + }, +# endif + { + .procname = "bpf_jit_limit", + .data = &bpf_jit_limit, + .maxlen = sizeof(long), + .mode = 0600, + .proc_handler = proc_dolongvec_minmax_bpf_restricted, + .extra1 = &long_one, + .extra2 = &bpf_jit_limit_max, }, #endif { diff --git a/net/core/utils.c b/net/core/utils.c index 3d17ca8b4744..13eb3552de07 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -316,6 +316,23 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(inet_proto_csum_replace4); +/** + * inet_proto_csum_replace16 - update layer 4 header checksum field + * @sum: Layer 4 header checksum field + * @skb: sk_buff for the packet + * @from: old IPv6 address + * @to: new IPv6 address + * @pseudohdr: True if layer 4 header checksum includes pseudoheader + * + * Update layer 4 header as per the update in IPv6 src/dst address. + * + * There is no need to update skb->csum in this function, because update in two + * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other + * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to + * update skb->csum, because update in 3 fields a.) IPv4 src/dst address, + * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as + * L4 Header checksum for skb->csum calculation. + */ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr) @@ -327,9 +344,6 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_PARTIAL) { *sum = csum_fold(csum_partial(diff, sizeof(diff), ~csum_unfold(*sum))); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); } else if (pseudohdr) *sum = ~csum_fold(csum_partial(diff, sizeof(diff), csum_unfold(*sum))); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 6fe2b615518c..426c30f9fdb0 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1725,6 +1725,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh) fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; + if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b0e28d24e1a7..e50fc19690c8 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -44,9 +44,9 @@ extern bool dccp_debug; #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else -#define dccp_pr_debug(format, a...) -#define dccp_pr_debug_cat(format, a...) -#define dccp_debug(format, a...) +#define dccp_pr_debug(format, a...) do {} while (0) +#define dccp_pr_debug_cat(format, a...) do {} while (0) +#define dccp_debug(format, a...) do {} while (0) #endif extern struct inet_hashinfo dccp_hashinfo; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index f227f002c73d..db87d9f58019 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -738,7 +738,12 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) return -ENOMEM; - return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); + if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) { + kfree(fval.sp.vec); + return -ENOMEM; + } + + return 0; } /** diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ef4c44d46293..11d79958a767 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -465,7 +465,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, .fl4_dport = dccp_hdr(skb)->dccph_sport, }; - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); @@ -804,7 +804,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) } lookup: - sk = __inet_lookup_skb(&dccp_hashinfo, skb, + sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " @@ -868,7 +868,7 @@ lookup: goto discard_and_relse; nf_reset(skb); - return sk_receive_skb(sk, skb, 1); + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d2caa4d69159..64f0e88fe0e8 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -202,14 +202,14 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req fl6.flowi6_oif = ireq->ir_iif; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = htons(ireq->ir_num); - security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; @@ -273,10 +273,10 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) fl6.flowi6_oif = inet6_iif(rxskb); fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; - security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6)); /* sk = NULL, but it is safe for now. RST socket required. */ - dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); + dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); ip6_xmit(ctl_sk, skb, &fl6, NULL, 0); @@ -313,6 +313,11 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) return 0; /* discard, don't send a reset here */ + if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { + IP6_INC_STATS_BH(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); + return 0; + } + if (dccp_bad_service_code(sk, service)) { dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; goto drop; @@ -679,7 +684,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); lookup: - sk = __inet6_lookup_skb(&dccp_hashinfo, skb, + sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, inet6_iif(skb)); if (!sk) { @@ -741,7 +746,7 @@ lookup: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - return sk_receive_skb(sk, skb, 1) ? -1 : 0; + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4) ? -1 : 0; no_dccp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -874,12 +879,12 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); final_p = fl6_update_dst(&fl6, opt, &final); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; @@ -1005,7 +1010,7 @@ static struct proto dccp_v6_prot = { .sendmsg = dccp_sendmsg, .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v6_do_rcv, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .accept = inet_csk_accept, .get_port = inet_csk_get_port, diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 68eed344b471..1f03a590288d 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -92,6 +92,8 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; newdp->dccps_service_list = NULL; + newdp->dccps_hc_rx_ccid = NULL; + newdp->dccps_hc_tx_ccid = NULL; newdp->dccps_service = dreq->dreq_service; newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 9d8fcdefefc0..ee297964fcd2 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -823,7 +823,7 @@ static int dn_auto_bind(struct socket *sock) static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) { struct dn_scp *scp = DN_SK(sk); - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); int err; if (scp->state != DN_CR) @@ -833,11 +833,11 @@ static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) scp->segsize_loc = dst_metric_advmss(__sk_dst_get(sk)); dn_send_conn_conf(sk, allocation); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); for(;;) { release_sock(sk); if (scp->state == DN_CC) - *timeo = schedule_timeout(*timeo); + *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); lock_sock(sk); err = 0; if (scp->state == DN_RUN) @@ -851,9 +851,8 @@ static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) err = -EAGAIN; if (!*timeo) break; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); if (err == 0) { sk->sk_socket->state = SS_CONNECTED; } else if (scp->state != DN_CC) { @@ -865,7 +864,7 @@ static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) static int dn_wait_run(struct sock *sk, long *timeo) { struct dn_scp *scp = DN_SK(sk); - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); int err = 0; if (scp->state == DN_RUN) @@ -874,11 +873,11 @@ static int dn_wait_run(struct sock *sk, long *timeo) if (!*timeo) return -EALREADY; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); for(;;) { release_sock(sk); if (scp->state == DN_CI || scp->state == DN_CC) - *timeo = schedule_timeout(*timeo); + *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); lock_sock(sk); err = 0; if (scp->state == DN_RUN) @@ -892,9 +891,8 @@ static int dn_wait_run(struct sock *sk, long *timeo) err = -ETIMEDOUT; if (!*timeo) break; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); out: if (err == 0) { sk->sk_socket->state = SS_CONNECTED; @@ -1039,16 +1037,16 @@ static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt) static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo) { - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sk_buff *skb = NULL; int err = 0; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); for(;;) { release_sock(sk); skb = skb_dequeue(&sk->sk_receive_queue); if (skb == NULL) { - *timeo = schedule_timeout(*timeo); + *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); skb = skb_dequeue(&sk->sk_receive_queue); } lock_sock(sk); @@ -1063,9 +1061,8 @@ static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo) err = -EAGAIN; if (!*timeo) break; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); return skb == NULL ? ERR_PTR(err) : skb; } diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index b2c26b081134..80554e7e9a0f 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -55,7 +55,7 @@ #include <net/dn_neigh.h> #include <net/dn_fib.h> -#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn)) +#define DN_IFREQ_SIZE (offsetof(struct ifreq, ifr_ifru) + sizeof(struct sockaddr_dn)) static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00}; static char dn_rt_all_rt_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x03,0x00,0x00}; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4256ac95a141..061c3939f93b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1031,7 +1031,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, p->phy_interface = mode; phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); - if (of_phy_is_fixed_link(port_dn)) { + if (!phy_dn && of_phy_is_fixed_link(port_dn)) { /* In the case of a fixed PHY, the DT node associated * to the fixed PHY is the Port DT node */ @@ -1041,7 +1041,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, return ret; } phy_is_fixed = true; - phy_dn = port_dn; + phy_dn = of_node_get(port_dn); } if (ds->drv->get_phy_flags) @@ -1060,6 +1060,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, ret = dsa_slave_phy_connect(p, slave_dev, phy_id); if (ret) { netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); + of_node_put(phy_dn); return ret; } } else { @@ -1068,6 +1069,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_flags, p->phy_interface); } + + of_node_put(phy_dn); } if (p->phy && phy_is_fixed) diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e2aadb73111d..657f7b1af315 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -84,6 +84,8 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK; + skb->offload_fwd_mark = 1; + return skb; out_free: diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 52dcd414c2af..3f51b4e590b1 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -235,7 +235,12 @@ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); - hh->hh_len = ETH_HLEN; + + /* Pairs with READ_ONCE() in neigh_resolve_output(), + * neigh_hh_output() and neigh_update_hhs(). + */ + smp_store_release(&hh->hh_len, ETH_HLEN); + return 0; } EXPORT_SYMBOL(eth_header_cache); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 943378d6e4c3..8dd239214a14 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -289,6 +289,8 @@ static void send_hsr_supervision_frame(struct hsr_port *master, u8 type) skb->dev->dev_addr, skb->len) <= 0) goto out; skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(*hsr_stag)); diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 46415839e67e..afcde16a94e2 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -297,7 +297,8 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, node_dst = find_node_by_AddrA(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst) { - WARN_ONCE(1, "%s: Unknown node\n", __func__); + if (net_ratelimit()) + netdev_err(skb->dev, "%s: Unknown node\n", __func__); return; } if (port->type != node_dst->AddrB_port) @@ -455,13 +456,9 @@ int hsr_get_node_data(struct hsr_priv *hsr, struct hsr_port *port; unsigned long tdiff; - - rcu_read_lock(); node = find_node_by_AddrA(&hsr->node_db, addr); - if (!node) { - rcu_read_unlock(); - return -ENOENT; /* No such entry */ - } + if (!node) + return -ENOENT; ether_addr_copy(addr_b, node->MacAddressB); @@ -496,7 +493,5 @@ int hsr_get_node_data(struct hsr_priv *hsr, *addr_b_ifindex = -1; } - rcu_read_unlock(); - return 0; } diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index a2c7e4c0ac1e..0a9a178f221a 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -132,6 +132,7 @@ static struct genl_family hsr_genl_family = { .name = "HSR", .version = 1, .maxattr = HSR_A_MAX, + .netnsok = true, }; static const struct genl_multicast_group hsr_mcgrps[] = { @@ -259,17 +260,16 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) if (!na) goto invalid; - hsr_dev = __dev_get_by_index(genl_info_net(info), - nla_get_u32(info->attrs[HSR_A_IFINDEX])); + rcu_read_lock(); + hsr_dev = dev_get_by_index_rcu(genl_info_net(info), + nla_get_u32(info->attrs[HSR_A_IFINDEX])); if (!hsr_dev) - goto invalid; + goto rcu_unlock; if (!is_hsr_master(hsr_dev)) - goto invalid; - + goto rcu_unlock; /* Send reply */ - - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -321,12 +321,10 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq); if (res < 0) goto nla_put_failure; - rcu_read_lock(); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (port) res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX, port->dev->ifindex); - rcu_read_unlock(); if (res < 0) goto nla_put_failure; @@ -336,20 +334,22 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq); if (res < 0) goto nla_put_failure; - rcu_read_lock(); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (port) res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX, port->dev->ifindex); - rcu_read_unlock(); if (res < 0) goto nla_put_failure; + rcu_read_unlock(); + genlmsg_end(skb_out, msg_head); genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid); return 0; +rcu_unlock: + rcu_read_unlock(); invalid: netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL); return 0; @@ -359,6 +359,7 @@ nla_put_failure: /* Fall through */ fail: + rcu_read_unlock(); return res; } @@ -366,16 +367,14 @@ fail: */ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) { - /* For receiving */ - struct nlattr *na; + unsigned char addr[ETH_ALEN]; struct net_device *hsr_dev; - - /* For sending */ struct sk_buff *skb_out; - void *msg_head; struct hsr_priv *hsr; - void *pos; - unsigned char addr[ETH_ALEN]; + bool restart = false; + struct nlattr *na; + void *pos = NULL; + void *msg_head; int res; if (!info) @@ -385,17 +384,17 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) if (!na) goto invalid; - hsr_dev = __dev_get_by_index(genl_info_net(info), - nla_get_u32(info->attrs[HSR_A_IFINDEX])); + rcu_read_lock(); + hsr_dev = dev_get_by_index_rcu(genl_info_net(info), + nla_get_u32(info->attrs[HSR_A_IFINDEX])); if (!hsr_dev) - goto invalid; + goto rcu_unlock; if (!is_hsr_master(hsr_dev)) - goto invalid; - + goto rcu_unlock; +restart: /* Send reply */ - - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb_out = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -409,18 +408,26 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) goto nla_put_failure; } - res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex); - if (res < 0) - goto nla_put_failure; + if (!restart) { + res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex); + if (res < 0) + goto nla_put_failure; + } hsr = netdev_priv(hsr_dev); - rcu_read_lock(); - pos = hsr_get_next_node(hsr, NULL, addr); + if (!pos) + pos = hsr_get_next_node(hsr, NULL, addr); while (pos) { res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr); if (res < 0) { - rcu_read_unlock(); + if (res == -EMSGSIZE) { + genlmsg_end(skb_out, msg_head); + genlmsg_unicast(genl_info_net(info), skb_out, + info->snd_portid); + restart = true; + goto restart; + } goto nla_put_failure; } pos = hsr_get_next_node(hsr, pos, addr); @@ -432,15 +439,18 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) return 0; +rcu_unlock: + rcu_read_unlock(); invalid: netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL); return 0; nla_put_failure: - kfree_skb(skb_out); + nlmsg_free(skb_out); /* Fall through */ fail: + rcu_read_unlock(); return res; } diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 7d37366cc695..db14b452adfa 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -30,6 +30,8 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) rcu_read_lock(); /* hsr->node_db, hsr->ports */ port = hsr_port_get_rcu(skb->dev); + if (!port) + goto finish_pass; if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { /* Directly kill frames sent by ourselves */ @@ -147,16 +149,16 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, if (port == NULL) return -ENOMEM; + port->hsr = hsr; + port->dev = dev; + port->type = type; + if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(dev, port); if (res) goto fail_dev_setup; } - port->hsr = hsr; - port->dev = dev; - port->type = type; - list_add_tail_rcu(&port->port_list, &hsr->ports); synchronize_rcu(); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 6183730d38db..e728dae467c3 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -634,7 +634,7 @@ err_sysctl: void lowpan_net_frag_exit(void) { - inet_frags_fini(&lowpan_frags); lowpan_frags_sysctl_unregister(); unregister_pernet_subsys(&lowpan_frags_ops); + inet_frags_fini(&lowpan_frags); } diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 3503c38954f9..fe31df8dc804 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -557,9 +557,7 @@ ieee802154_llsec_parse_key_id(struct genl_info *info, desc->mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]); if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) { - if (!info->attrs[IEEE802154_ATTR_PAN_ID] && - !(info->attrs[IEEE802154_ATTR_SHORT_ADDR] || - info->attrs[IEEE802154_ATTR_HW_ADDR])) + if (!info->attrs[IEEE802154_ATTR_PAN_ID]) return -EINVAL; desc->device_addr.pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]); @@ -568,6 +566,9 @@ ieee802154_llsec_parse_key_id(struct genl_info *info, desc->device_addr.mode = IEEE802154_ADDR_SHORT; desc->device_addr.short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]); } else { + if (!info->attrs[IEEE802154_ATTR_HW_ADDR]) + return -EINVAL; + desc->device_addr.mode = IEEE802154_ADDR_LONG; desc->device_addr.extended_addr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); } @@ -684,8 +685,10 @@ int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info) nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVEL, params.out_level) || nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, be32_to_cpu(params.frame_counter)) || - ieee802154_llsec_fill_key_id(msg, ¶ms.out_key)) + ieee802154_llsec_fill_key_id(msg, ¶ms.out_key)) { + rc = -ENOBUFS; goto out_free; + } dev_put(dev); diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 77d73014bde3..11f53dc0c1c0 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -249,8 +249,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) } if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || - nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name)) + nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name)) { + rc = -EMSGSIZE; goto nla_put_failure; + } dev_put(dev); wpan_phy_put(phy); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 16ef0d9f566e..b2ff2f7329c3 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -843,8 +843,13 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, goto nla_put_failure; #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) + goto out; + if (nl802154_get_llsec_params(msg, rdev, wpan_dev) < 0) goto nla_put_failure; + +out: #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ genlmsg_end(msg, hdr); @@ -1367,6 +1372,9 @@ static int nl802154_set_llsec_params(struct sk_buff *skb, u32 changed = 0; int ret; + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) + return -EOPNOTSUPP; + if (info->attrs[NL802154_ATTR_SEC_ENABLED]) { u8 enabled; @@ -1473,6 +1481,11 @@ nl802154_dump_llsec_key(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { + err = skb->len; + goto out_err; + } + if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; @@ -1527,7 +1540,8 @@ static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info) struct ieee802154_llsec_key_id id = { }; u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { }; - if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, + if (!info->attrs[NL802154_ATTR_SEC_KEY] || + nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy)) return -EINVAL; @@ -1577,7 +1591,8 @@ static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info) struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; struct ieee802154_llsec_key_id id; - if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, + if (!info->attrs[NL802154_ATTR_SEC_KEY] || + nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy)) return -EINVAL; @@ -1643,6 +1658,11 @@ nl802154_dump_llsec_dev(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { + err = skb->len; + goto out_err; + } + if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; @@ -1730,6 +1750,9 @@ static int nl802154_add_llsec_dev(struct sk_buff *skb, struct genl_info *info) struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_device dev_desc; + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) + return -EOPNOTSUPP; + if (ieee802154_llsec_parse_device(info->attrs[NL802154_ATTR_SEC_DEVICE], &dev_desc) < 0) return -EINVAL; @@ -1745,7 +1768,8 @@ static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info) struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; __le64 extended_addr; - if (nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, + if (!info->attrs[NL802154_ATTR_SEC_DEVICE] || + nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVICE], nl802154_dev_policy)) return -EINVAL; @@ -1815,6 +1839,11 @@ nl802154_dump_llsec_devkey(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { + err = skb->len; + goto out_err; + } + if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; @@ -1872,6 +1901,9 @@ static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info struct ieee802154_llsec_device_key key; __le64 extended_addr; + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) + return -EOPNOTSUPP; + if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], @@ -1905,7 +1937,8 @@ static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info struct ieee802154_llsec_device_key key; __le64 extended_addr; - if (nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, + if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || + nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy)) return -EINVAL; @@ -1980,6 +2013,11 @@ nl802154_dump_llsec_seclevel(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { + err = skb->len; + goto out_err; + } + if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; @@ -2065,6 +2103,9 @@ static int nl802154_add_llsec_seclevel(struct sk_buff *skb, struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_seclevel sl; + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) + return -EOPNOTSUPP; + if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], &sl) < 0) return -EINVAL; @@ -2080,6 +2121,9 @@ static int nl802154_del_llsec_seclevel(struct sk_buff *skb, struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_seclevel sl; + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) + return -EOPNOTSUPP; + if (!info->attrs[NL802154_ATTR_SEC_LEVEL] || llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], &sl) < 0) diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c index 35c432668454..040983fc15da 100644 --- a/net/ieee802154/nl_policy.c +++ b/net/ieee802154/nl_policy.c @@ -30,7 +30,13 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { [IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, }, [IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, }, [IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, }, + [IEEE802154_ATTR_BCN_ORD] = { .type = NLA_U8, }, + [IEEE802154_ATTR_SF_ORD] = { .type = NLA_U8, }, + [IEEE802154_ATTR_PAN_COORD] = { .type = NLA_U8, }, + [IEEE802154_ATTR_BAT_EXT] = { .type = NLA_U8, }, + [IEEE802154_ATTR_COORD_REALIGN] = { .type = NLA_U8, }, [IEEE802154_ATTR_PAGE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_DEV_TYPE] = { .type = NLA_U8, }, [IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, }, [IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, }, [IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, }, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index cb6c0772ea36..6383627b783e 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -182,12 +182,14 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, static HLIST_HEAD(raw_head); static DEFINE_RWLOCK(raw_lock); -static void raw_hash(struct sock *sk) +static int raw_hash(struct sock *sk) { write_lock_bh(&raw_lock); sk_add_node(sk, &raw_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&raw_lock); + + return 0; } static void raw_unhash(struct sock *sk) @@ -462,12 +464,14 @@ static inline struct dgram_sock *dgram_sk(const struct sock *sk) return container_of(sk, struct dgram_sock, sk); } -static void dgram_hash(struct sock *sk) +static int dgram_hash(struct sock *sk) { write_lock_bh(&dgram_lock); sk_add_node(sk, &dgram_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&dgram_lock); + + return 0; } static void dgram_unhash(struct sock *sk) @@ -983,6 +987,11 @@ static const struct proto_ops ieee802154_dgram_ops = { #endif }; +static void ieee802154_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + /* Create a socket. Initialise the socket, blank the addresses * set the state. */ @@ -1023,14 +1032,19 @@ static int ieee802154_create(struct net *net, struct socket *sock, sock->ops = ops; sock_init_data(sock, sk); - /* FIXME: sk->sk_destruct */ + sk->sk_destruct = ieee802154_sock_destruct; sk->sk_family = PF_IEEE802154; /* Checksums on by default */ sock_set_flag(sk, SOCK_ZAPPED); - if (sk->sk_prot->hash) - sk->sk_prot->hash(sk); + if (sk->sk_prot->hash) { + rc = sk->sk_prot->hash(sk); + if (rc) { + sk_common_release(sk); + goto out; + } + } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 395d82754626..daad05c6f25b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -298,6 +298,7 @@ config SYN_COOKIES config NET_IPVTI tristate "Virtual (secure) IP: tunneling" + depends on IPV6 || IPV6=n select INET_TUNNEL select NET_IP_TUNNEL depends on INET_XFRM_MODE_TUNNEL diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 55eff963d1fe..a35c252235ae 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -89,7 +89,6 @@ #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/slab.h> -#include <linux/netfilter/xt_qtaguid.h> #include <asm/uaccess.h> @@ -388,13 +387,27 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) + if (err) { sk_common_release(sk); + goto out; + } + } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { + sk_common_release(sk); + goto out; + } } out: return err; @@ -416,9 +429,6 @@ int inet_release(struct socket *sock) if (sk) { long timeout; -#ifdef CONFIG_NETFILTER_XT_MATCH_QTAGUID - qtaguid_untag(sock, true); -#endif /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -1163,8 +1173,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ - __sk_prot_rehash(sk); - return 0; + return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) @@ -1802,6 +1811,10 @@ static int __init inet_init(void) tcp_v4_init(); + /* Initialise per-cpu ipv4 mibs */ + if (init_ipv4_mibs()) + panic("%s: Cannot init ipv4 mibs\n", __func__); + /* Setup TCP slab cache for open requests. */ tcp_init(); @@ -1830,12 +1843,6 @@ static int __init inet_init(void) if (init_inet_pernet_ops()) pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); - /* - * Initialise per-cpu ipv4 mibs - */ - - if (init_ipv4_mibs()) - pr_crit("%s: Cannot init ipv4 mibs\n", __func__); ipv4_proc_init(); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index bfa79831873f..f1e30ff3cfd7 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -743,6 +743,14 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) goto out; /* + * For some 802.11 wireless deployments (and possibly other networks), + * there will be an ARP proxy and gratuitous ARP frames are attacks + * and thus should not be accepted. + */ + if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) + goto out; + +/* * Special case: We must set Frame Relay source Q.922 address */ if (dev_type == ARPHRD_DLCI) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 7fe643062013..e798e27b3c7d 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -557,6 +557,7 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); + kfree(doi_def->map.std); break; } kfree(doi_def); @@ -1343,7 +1344,8 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, return ret_val; } - secattr->flags |= NETLBL_SECATTR_MLS_CAT; + if (secattr->attr.mls.cat) + secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; @@ -1524,7 +1526,8 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, return ret_val; } - secattr->flags |= NETLBL_SECATTR_MLS_CAT; + if (secattr->attr.mls.cat) + secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; @@ -1809,6 +1812,7 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; + int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; @@ -1820,7 +1824,11 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) memset(opt, 0, sizeof(struct ip_options)); opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); - if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL)) + rcu_read_lock(); + res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); + rcu_read_unlock(); + + if (res) return; if (gateway) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e9df88f8579c..7accf1ebe947 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -262,6 +262,7 @@ static struct in_device *inetdev_init(struct net_device *dev) err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; + neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; @@ -560,12 +561,15 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, return NULL; } -static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) +static int ip_mc_autojoin_config(struct net *net, bool join, + const struct in_ifaddr *ifa) { +#if defined(CONFIG_IP_MULTICAST) struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ifa->ifa_address, .imr_ifindex = ifa->ifa_dev->dev->ifindex, }; + struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; ASSERT_RTNL(); @@ -578,6 +582,9 @@ static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) release_sock(sk); return ret; +#else + return -EOPNOTSUPP; +#endif } static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -617,7 +624,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) continue; if (ipv4_is_multicast(ifa->ifa_address)) - ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa); + ip_mc_autojoin_config(net, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); return 0; } @@ -873,8 +880,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { - int ret = ip_mc_config(net->ipv4.mc_autojoin_sk, - true, ifa); + int ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { inet_free_ifa(ifa); @@ -1364,11 +1370,6 @@ skip: } } -static bool inetdev_valid_mtu(unsigned int mtu) -{ - return mtu >= IPV4_MIN_MTU; -} - static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) @@ -1814,7 +1815,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC); + skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; @@ -1826,7 +1827,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: if (err < 0) @@ -1883,7 +1884,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC); + skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_KERNEL); if (!skb) goto errout; @@ -2007,16 +2008,16 @@ static void inet_forward_change(struct net *net) for_each_netdev(net, dev) { struct in_device *in_dev; + if (on) dev_disable_lro(dev); - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); + + in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } - rcu_read_unlock(); } } @@ -2197,6 +2198,8 @@ static struct devinet_sysctl_table { "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), + DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, + "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -2204,6 +2207,8 @@ static struct devinet_sysctl_table { "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), + DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, + "drop_unicast_in_l2_multicast"), DEVINET_SYSCTL_RW_ENTRY(NF_IPV4_DEFRAG_SKIP, "nf_ipv4_defrag_skip"), }, @@ -2238,7 +2243,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, free: kfree(t); out: - return -ENOBUFS; + return -ENOMEM; } static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8dc9073d4a76..c01149331f46 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -299,7 +299,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; @@ -509,6 +509,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; + cfg->fc_table = l3mdev_fib_table(dev); if (colon) { struct in_ifaddr *ifa; struct in_device *in_dev = __in_dev_get_rtnl(dev); @@ -1035,7 +1036,7 @@ no_promotions: * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ - if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) + if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3109b9bb95d2..3c1e42d49520 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -776,7 +776,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - if (cfg->fc_table) + if (cfg->fc_table && cfg->fc_table != RT_TABLE_MAIN) tbl = fib_get_table(net, cfg->fc_table); if (tbl) @@ -1069,6 +1069,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; + fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; change_nexthops(fi) { @@ -1352,18 +1353,21 @@ nla_put_failure: * referring to it. * - device went down -> we must shutdown all nexthops going via it. */ -int fib_sync_down_addr(struct net *net, __be32 local) +int fib_sync_down_addr(struct net_device *dev, __be32 local) { int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; + struct net *net = dev_net(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) return 0; hlist_for_each_entry(fi, head, fib_lhash) { - if (!net_eq(fi->fib_net, net)) + if (!net_eq(fi->fib_net, net) || + fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9b14f8958dcc..e3fdd7ac20bf 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1696,7 +1696,7 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) while ((l = leaf_walk_rcu(&tp, key)) != NULL) { struct key_vector *local_l = NULL, *local_tp = NULL; - hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + hlist_for_each_entry(fa, &l->leaf, fa_list) { struct fib_alias *new_fa; if (local_tb->tb_id != fa->tb_id) @@ -1714,8 +1714,10 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) local_l = fib_find_node(lt, &local_tp, l->key); if (fib_insert_alias(lt, local_tp, local_l, new_fa, - NULL, l->key)) + NULL, l->key)) { + kmem_cache_free(fn_alias_kmem, new_fa); goto out; + } } /* stop loop if key wrapped back to 0 */ @@ -2230,6 +2232,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) " %Zd bytes, size of tnode: %Zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); + rcu_read_lock(); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; @@ -2249,7 +2252,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) trie_show_usage(seq, t->stats); #endif } + cond_resched_rcu(); } + rcu_read_unlock(); return 0; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index b5a137338e50..7ac370505e44 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -205,6 +205,9 @@ static struct sk_buff **fou_gro_receive(struct sk_buff **head, */ NAPI_GRO_CB(skb)->encap_mark = 0; + /* Flag this frame as already having an outer encap header */ + NAPI_GRO_CB(skb)->is_fou = 1; + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); @@ -372,6 +375,9 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, */ NAPI_GRO_CB(skb)->encap_mark = 0; + /* Flag this frame as already having an outer encap header */ + NAPI_GRO_CB(skb)->is_fou = 1; + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[guehdr->proto_ctype]); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 79ae0d7becbf..d9268af2ea44 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -151,6 +151,14 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) goto out; + /* We can only support GRE_CSUM if we can track the location of + * the GRE header. In the case of FOU/GUE we cannot because the + * outer UDP header displaces the GRE header leaving us in a state + * of limbo. + */ + if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou) + goto out; + type = greh->protocol; rcu_read_lock(); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index a51f0dd6a49e..2c14d607a683 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -246,7 +246,7 @@ static struct { /** * icmp_global_allow - Are we allowed to send one more ICMP message ? * - * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. + * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. * Note: called with BH disabled */ @@ -256,10 +256,11 @@ bool icmp_global_allow(void) bool rc = false; /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. + * without taking the spinlock. The READ_ONCE() are paired + * with the following WRITE_ONCE() in this same function. */ - if (!icmp_global.credit) { - delta = min_t(u32, now - icmp_global.stamp, HZ); + if (!READ_ONCE(icmp_global.credit)) { + delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); if (delta < HZ / 50) return false; } @@ -269,14 +270,17 @@ bool icmp_global_allow(void) if (delta >= HZ / 50) { incr = sysctl_icmp_msgs_per_sec * delta / HZ ; if (incr) - icmp_global.stamp = now; + WRITE_ONCE(icmp_global.stamp, now); } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { - credit--; + /* We want to use a credit of one in average, but need to randomize + * it for security reasons. + */ + credit = max_t(int, credit - prandom_u32_max(3), 0); rc = true; } - icmp_global.credit = credit; + WRITE_ONCE(icmp_global.credit, credit); spin_unlock(&icmp_global.lock); return rc; } @@ -429,7 +433,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; @@ -457,6 +461,23 @@ static int icmp_multipath_hash_skb(const struct sk_buff *skb) #endif +/* + * The device used for looking up which routing table to use for sending an ICMP + * error is preferably the source whenever it is set, which should ensure the + * icmp error can be sent to the source host, else lookup using the routing + * table of the destination device, else use the main routing table (index 0). + */ +static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) +{ + struct net_device *route_lookup_dev = NULL; + + if (skb->dev) + route_lookup_dev = skb->dev; + else if (skb_dst(skb)) + route_lookup_dev = skb_dst(skb)->dev; + return route_lookup_dev; +} + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -465,6 +486,7 @@ static struct rtable *icmp_route_lookup(struct net *net, int type, int code, struct icmp_bxm *param) { + struct net_device *route_lookup_dev; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -479,9 +501,10 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev); + route_lookup_dev = icmp_get_route_lookup_dev(skb_in); + fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); - security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); + security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4)); rt = __ip_route_output_key_hash(net, fl4, icmp_multipath_hash_skb(skb_in)); if (IS_ERR(rt)) @@ -504,7 +527,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_in->dev, + if (inet_addr_type_dev_table(net, route_lookup_dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c67efa3e79dd..7b0bbda676b3 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2631,6 +2631,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u rv = 1; } else if (im) { if (src_addr) { + spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; @@ -2641,6 +2642,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; + spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0b6a72428bc2..fa9df2e6d330 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -24,6 +24,7 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> +#include <net/sock_reuseport.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || - (sk2->sk_state != TCP_TIME_WAIT && + rcu_access_pointer(sk->sk_reuseport_cb) || + (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || @@ -87,6 +89,31 @@ int inet_csk_bind_conflict(const struct sock *sk, } EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); +void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, + struct sock *sk) +{ + kuid_t uid = sock_i_uid(sk); + + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + if (sk->sk_reuseport) { + tb->fastreuseport = 1; + tb->fastuid = uid; + } else + tb->fastreuseport = 0; + } else { + if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; + if (tb->fastreuseport && + (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) + tb->fastreuseport = 0; + } +} + /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */ @@ -132,6 +159,7 @@ again: sk->sk_state != TCP_LISTEN) || (tb->fastreuseport > 0 && sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(tb->fastuid, uid))) && (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; @@ -200,15 +228,18 @@ tb_found: if (((tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN) || (tb->fastreuseport > 0 && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size == -1) { + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && smallest_size == -1) { goto success; } else { ret = 1; if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || (tb->fastreuseport > 0 && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && smallest_size != -1 && --attempts >= 0) { spin_unlock(&head->lock); goto again; @@ -223,24 +254,9 @@ tb_not_found: if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, net, head, snum)) == NULL) goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - if (sk->sk_reuseport) { - tb->fastreuseport = 1; - tb->fastuid = uid; - } else - tb->fastreuseport = 0; - } else { - if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - if (tb->fastreuseport && - (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) - tb->fastreuseport = 0; - } + + inet_csk_update_fastreuse(tb, sk); + success: if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, snum); @@ -430,7 +446,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; @@ -466,7 +482,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; @@ -745,6 +761,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); + int err = -EADDRINUSE; reqsk_queue_alloc(&icsk->icsk_accept_queue); @@ -762,13 +779,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog) inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); - return 0; + if (likely(!err)) + return 0; } sk->sk_state = TCP_CLOSE; - return -EADDRINUSE; + return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index fcb83b2a61f0..1582fe5b04c3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -370,18 +370,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net, struct sock *sk; if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else - sk = inet6_lookup(net, hashinfo, + sk = inet6_lookup(net, hashinfo, NULL, 0, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ccc5980797fc..b947e4c9be18 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,10 +20,12 @@ #include <linux/wait.h> #include <linux/vmalloc.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> static u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -160,6 +162,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) return -ENOMEM; } } + inet_csk_update_fastreuse(tb, child); } inet_bind_hash(child, tb, port); spin_unlock(&head->lock); @@ -205,6 +208,7 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif) @@ -214,6 +218,7 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; rcu_read_lock(); @@ -229,6 +234,15 @@ begin: if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -246,11 +260,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -449,32 +465,73 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); -void __inet_hash(struct sock *sk, struct sock *osk) +static int inet_reuseport_add_sock(struct sock *sk, + struct inet_listen_hashbucket *ilb, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) +{ + struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; + struct sock *sk2; + struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); + + sk_nulls_for_each_rcu(sk2, node, &ilb->head) { + if (sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if && + inet_csk(sk2)->icsk_bind_hash == tb && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + saddr_same(sk, sk2, false)) + return reuseport_add_sock(sk, sk2); + } + + return reuseport_alloc(sk); +} + +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; + int err = 0; if (sk->sk_state != TCP_LISTEN) { inet_ehash_nolisten(sk, osk); - return; + return 0; } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; spin_lock(&ilb->lock); + if (sk->sk_reuseport) { + err = inet_reuseport_add_sock(sk, ilb, saddr_same); + if (err) + goto unlock; + } __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +unlock: spin_unlock(&ilb->lock); + + return err; } EXPORT_SYMBOL(__inet_hash); -void inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { + int err = 0; + if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk, NULL); + err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); local_bh_enable(); } + + return err; } EXPORT_SYMBOL_GPL(inet_hash); @@ -493,6 +550,8 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 3e4184088082..63f7bacf628a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -399,7 +399,10 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) iph->saddr, iph->daddr, tpi->key); if (tunnel) { - skb_pop_mac_header(skb); + if (tunnel->dev->type != ARPHRD_NONE) + skb_pop_mac_header(skb); + else + skb_reset_mac_header(skb); if (tunnel->collect_md) { __be16 flags; __be64 tun_id; @@ -499,6 +502,10 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool csum) { + unsigned char *skb_checksum_start = skb->head + skb->csum_start; + + if (csum && skb_checksum_start < skb->data) + return ERR_PTR(-EINVAL); return iptunnel_handle_offloads(skb, csum, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } @@ -520,7 +527,8 @@ static struct rtable *gre_get_rt(struct sk_buff *skb, return ip_route_output_key(net, fl); } -static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; @@ -563,7 +571,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) } flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); - build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), + build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key(tun_info->key.tun_id), 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; @@ -605,7 +613,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, const struct iphdr *tnl_params; if (tunnel->collect_md) { - gre_fb_xmit(skb, dev); + gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; } @@ -649,7 +657,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); if (tunnel->collect_md) { - gre_fb_xmit(skb, dev); + gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; } @@ -851,9 +859,16 @@ static void __gre_tunnel_init(struct net_device *dev) dev->hw_features |= GRE_FEATURES; if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported. */ - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(tunnel->parms.o_flags & TUNNEL_CSUM) || + (tunnel->encap.type == TUNNEL_ENCAP_NONE)) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + /* Can use a lockless transmit, unless we generate * output sequences */ @@ -875,7 +890,7 @@ static int ipgre_tunnel_init(struct net_device *dev) netif_keep_dst(dev); dev->addr_len = 4; - if (iph->daddr) { + if (iph->daddr && !tunnel->collect_md) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) @@ -884,8 +899,9 @@ static int ipgre_tunnel_init(struct net_device *dev) dev->header_ops = &ipgre_header_ops; } #endif - } else + } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; + } return ip_tunnel_init(dev); } @@ -928,6 +944,11 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) if (flags & (GRE_VERSION|GRE_ROUTING)) return -EINVAL; + if (data[IFLA_GRE_COLLECT_METADATA] && + data[IFLA_GRE_ENCAP_TYPE] && + nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) + return -EINVAL; + return 0; } @@ -1001,6 +1022,8 @@ static void ipgre_netlink_parms(struct net_device *dev, struct ip_tunnel *t = netdev_priv(dev); t->collect_md = true; + if (dev->type == ARPHRD_IPGRE) + dev->type = ARPHRD_NONE; } } @@ -1230,6 +1253,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; + LIST_HEAD(list_kill); struct ip_tunnel *t; int err; @@ -1245,8 +1269,10 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, t->collect_md = true; err = ipgre_newlink(net, dev, tb, NULL); - if (err < 0) - goto out; + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } /* openvswitch users expect packet sizes to be unrestricted, * so set the largest MTU we can. @@ -1255,9 +1281,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, if (err) goto out; + err = rtnl_configure_link(dev, NULL); + if (err < 0) + goto out; + return dev; out: - free_netdev(dev); + ip_tunnel_dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(gretap_fb_dev_create); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index eb1834f2682f..317c77eae4a3 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -359,8 +359,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); - } else if (rt->rt_type == RTN_BROADCAST) + } else if (rt->rt_type == RTN_BROADCAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); + } else if (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + + /* RFC 1122 3.3.6: + * + * When a host sends a datagram to a link-layer broadcast + * address, the IP destination address MUST be a legal IP + * broadcast or IP multicast address. + * + * A host SHOULD silently discard a datagram that is received + * via a link-layer broadcast (see Section 2.4) but does not + * specify an IP multicast or broadcast destination address. + * + * This doesn't explicitly say L2 *broadcast*, but broadcast is + * in a way a form of multicast and the most common use case for + * this is 802.11 protecting against cross-station spoofing (the + * so-called "hole-196" attack) so do it for both. + */ + if (in_dev && + IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) + goto drop; + } return dst_input(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c2380bb1fdab..a0739ef47227 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -73,6 +73,8 @@ #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> +#include <net/inet_ecn.h> +#include <linux/bpf-cgroup.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> @@ -154,12 +156,19 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - if (ip_dont_fragment(sk, &rt->dst)) { + /* Do not bother generating IPID for small packets (eg SYNACK) */ + if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) { iph->frag_off = htons(IP_DF); iph->id = 0; } else { iph->frag_off = 0; - __ip_select_ident(net, iph, 1); + /* TCP packets here are SYNACK with fat IPv4/TCP options. + * Avoid using the hashed IP ident generator. + */ + if (sk->sk_protocol == IPPROTO_TCP) + iph->id = (__force __be16)prandom_u32(); + else + __ip_select_ident(net, iph, 1); } if (opt && opt->opt.optlen) { @@ -270,6 +279,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -282,12 +298,26 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); - if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) + if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); return ip_finish_output2(net, sk, skb); } +static int ip_mc_finish_output(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + + return dev_loopback_xmit(net, sk, skb); +} + int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); @@ -325,7 +355,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -341,7 +371,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, @@ -375,8 +405,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); - memcpy(&iph->saddr, &fl4->saddr, - sizeof(fl4->saddr) + sizeof(fl4->daddr)); + + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; } /* Note: skb->sk can be different from sk, in case of tunnels */ @@ -1145,13 +1176,17 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, rt = *rtp; if (unlikely(!rt)) return -EFAULT; - /* - * We steal reference to this route, caller should not release it - */ - *rtp = NULL; + cork->fragsize = ip_sk_use_pmtu(sk) ? - dst_mtu(&rt->dst) : rt->dst.dev->mtu; + dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); + + if (!inetdev_valid_mtu(cork->fragsize)) + return -ENETUNREACH; + cork->dst = &rt->dst; + /* We stole this route, caller should not release it. */ + *rtp = NULL; + cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; @@ -1589,12 +1624,12 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest, arg->uid); - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return; - inet_sk(sk)->tos = arg->tos; + inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 1ea36bf778e6..9a7b60d6c670 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -279,9 +279,12 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, ipc->ttl = val; break; case IP_TOS: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + if (cmsg->cmsg_len == CMSG_LEN(sizeof(int))) + val = *(int *)CMSG_DATA(cmsg); + else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8))) + val = *(u8 *)CMSG_DATA(cmsg); + else return -EINVAL; - val = *(int *)CMSG_DATA(cmsg); if (val < 0 || val > 255) return -EINVAL; ipc->tos = val; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 91ae061d46ac..dc92780f9e8c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -98,9 +98,10 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, __be32 remote, __be32 local, __be32 key) { - unsigned int hash; struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; + struct net_device *ndev; + unsigned int hash; hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; @@ -155,11 +156,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, cand = t; } - if (flags & TUNNEL_NO_KEY) - goto skip_key_lookup; - hlist_for_each_entry_rcu(t, head, hash_node) { - if (t->parms.i_key != key || + if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) || t->parms.iph.saddr != 0 || t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) @@ -171,7 +169,6 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, cand = t; } -skip_key_lookup: if (cand) return cand; @@ -179,8 +176,9 @@ skip_key_lookup: if (t) return t; - if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) - return netdev_priv(itn->fb_tunnel_dev); + ndev = READ_ONCE(itn->fb_tunnel_dev); + if (ndev && ndev->flags & IFF_UP) + return netdev_priv(ndev); return NULL; } @@ -710,7 +708,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) { + df = tnl_params->frag_off; + if (skb->protocol == htons(ETH_P_IP)) + df |= (inner_iph->frag_off & htons(IP_DF)); + + if (tnl_update_pmtu(dev, skb, rt, df, inner_iph)) { ip_rt_put(rt); goto tx_error; } @@ -738,10 +740,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ttl = ip4_dst_hoplimit(&rt->dst); } - df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP)) - df |= (inner_iph->frag_off&htons(IP_DF)); - max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (max_headroom > dev->needed_headroom) @@ -1151,10 +1149,8 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; - if (tunnel->collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; + if (tunnel->collect_md) netif_keep_dst(dev); - } return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); @@ -1166,9 +1162,9 @@ void ip_tunnel_uninit(struct net_device *dev) struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); - /* fb_tunnel_dev will be unregisted in net-exit call. */ - if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(itn, netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); + if (itn->fb_tunnel_dev == dev) + WRITE_ONCE(itn->fb_tunnel_dev, NULL); dst_cache_reset(&tunnel->dst_cache); } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 4916d1857b75..4d64aa76d285 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -407,6 +407,12 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { void __init ip_tunnel_core_init(void) { + /* If you land here, make sure whether increasing ip_tunnel_info's + * options_len is a reasonable choice with its usage in front ends + * (f.e., it's part of flow keys, etc). + */ + BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255); + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index bbcbbc1cc2cc..abcf431376a0 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -51,7 +51,7 @@ static int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) + int encap_type, bool update_skb_dev) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); @@ -66,6 +66,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + if (update_skb_dev) + skb->dev = tunnel->dev; + return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -75,25 +78,43 @@ drop: return 0; } -static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) +static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { - struct ip_tunnel *tunnel; + return vti_input(skb, nexthdr, spi, encap_type, false); +} + +static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev); +} + +static int vti_rcv_proto(struct sk_buff *skb) +{ + return vti_rcv(skb, 0, false); +} + +static int vti_rcv_tunnel(struct sk_buff *skb) +{ + struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id); const struct iphdr *iph = ip_hdr(skb); - struct net *net = dev_net(skb->dev); - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + struct ip_tunnel *tunnel; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel) { + struct tnl_ptk_info tpi = { + .proto = htons(ETH_P_IP), + }; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; - - skb->dev = tunnel->dev; - - return xfrm_input(skb, nexthdr, spi, encap_type); + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, false); } return -EINVAL; @@ -102,22 +123,6 @@ drop: return 0; } -static int vti_rcv(struct sk_buff *skb) -{ - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - - return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); -} - -static int vti_rcv_ipip(struct sk_buff *skb) -{ - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - - return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0); -} - static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -195,8 +200,39 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, int err; if (!dst) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; + switch (skb->protocol) { + case htons(ETH_P_IP): { + struct rtable *rt; + + fl->u.ip4.flowi4_oif = dev->ifindex; + fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + dst = &rt->dst; + skb_dst_set(skb, dst); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + fl->u.ip6.flowi6_oif = dev->ifindex; + fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); + if (dst->error) { + dst_release(dst); + dst = NULL; + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + skb_dst_set(skb, dst); + break; +#endif + default: + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } } dst_hold(dst); @@ -421,31 +457,31 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev) } static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm_tunnel ipip_handler __read_mostly = { - .handler = vti_rcv_ipip, + .handler = vti_rcv_tunnel, .err_handler = vti4_err, .priority = 0, }; @@ -635,10 +671,8 @@ static int __init vti_init(void) msg = "ipip tunnel"; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); - if (err < 0) { - pr_info("%s: cant't register tunnel\n",__func__); + if (err < 0) goto xfrm_tunnel_failed; - } msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 60f564db25a3..173777aa5add 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -890,7 +890,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d /* - * Copy BOOTP-supplied string if not already set. + * Copy BOOTP-supplied string */ static int __init ic_bootp_string(char *dest, char *src, int len, int max) { @@ -941,12 +941,15 @@ static void __init ic_do_bootp_ext(u8 *ext) } break; case 12: /* Host name */ - ic_bootp_string(utsname()->nodename, ext+1, *ext, - __NEW_UTS_LEN); - ic_host_name_set = 1; + if (!ic_host_name_set) { + ic_bootp_string(utsname()->nodename, ext+1, *ext, + __NEW_UTS_LEN); + ic_host_name_set = 1; + } break; case 15: /* Domain name (DNS) */ - ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); + if (!ic_domain[0]) + ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); break; case 17: /* Root path */ if (!root_server_path[0]) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index cbe630aab44a..ea164fd61a7c 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -488,11 +488,12 @@ next: return 1; } -static inline int check_target(struct arpt_entry *e, const char *name) +static int check_target(struct arpt_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = arpt_get_target(e); int ret; struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ -510,8 +511,9 @@ static inline int check_target(struct arpt_entry *e, const char *name) return 0; } -static inline int -find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, +static int +find_check_entry(struct arpt_entry *e, struct net *net, const char *name, + unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; @@ -531,7 +533,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; return 0; @@ -632,7 +634,9 @@ static inline void cleanup_entry(struct arpt_entry *e) /* Checks and translates the user-supplied table segment (held in * newinfo). */ -static int translate_table(struct xt_table_info *newinfo, void *entry0, +static int translate_table(struct net *net, + struct xt_table_info *newinfo, + void *entry0, const struct arpt_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; @@ -709,7 +713,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { - ret = find_check_entry(iter, repl->name, repl->size, + ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; @@ -1114,7 +1118,7 @@ static int do_replace(struct net *net, const void __user *user, goto free_newinfo; } - ret = translate_table(newinfo, loc_cpu_entry, &tmp); + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -1301,7 +1305,8 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, } } -static int translate_compat_table(struct xt_table_info **pinfo, +static int translate_compat_table(struct net *net, + struct xt_table_info **pinfo, void **pentry0, const struct compat_arpt_replace *compatr) { @@ -1344,6 +1349,8 @@ static int translate_compat_table(struct xt_table_info **pinfo, if (!newinfo) goto out_unlock; + memset(newinfo->entries, 0, size); + newinfo->number = compatr->num_entries; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; @@ -1371,7 +1378,7 @@ static int translate_compat_table(struct xt_table_info **pinfo, repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; - ret = translate_table(newinfo, entry1, &repl); + ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; @@ -1426,7 +1433,7 @@ static int compat_do_replace(struct net *net, void __user *user, goto free_newinfo; } - ret = translate_compat_table(&newinfo, &loc_cpu_entry, &tmp); + ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -1696,7 +1703,7 @@ struct xt_table *arpt_register_table(struct net *net, loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(newinfo, loc_cpu_entry, repl); + ret = translate_table(net, newinfo, loc_cpu_entry, repl); duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) goto out_free; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 53d664a7774c..73b1d8e64658 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -964,10 +964,6 @@ copy_entries_to_user(unsigned int total_size, return PTR_ERR(counters); loc_cpu_entry = private->entries; - if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { - ret = -EFAULT; - goto free_counters; - } /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ @@ -977,6 +973,10 @@ copy_entries_to_user(unsigned int total_size, const struct xt_entry_target *t; e = (struct ipt_entry *)(loc_cpu_entry + off); + if (copy_to_user(userptr + off, e, sizeof(*e))) { + ret = -EFAULT; + goto free_counters; + } if (copy_to_user(userptr + off + offsetof(struct ipt_entry, counters), &counters[num], @@ -990,23 +990,14 @@ copy_entries_to_user(unsigned int total_size, i += m->u.match_size) { m = (void *)e + i; - if (copy_to_user(userptr + off + i - + offsetof(struct xt_entry_match, - u.user.name), - m->u.kernel.match->name, - strlen(m->u.kernel.match->name)+1) - != 0) { + if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ipt_get_target_c(e); - if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct xt_entry_target, - u.user.name), - t->u.kernel.target->name, - strlen(t->u.kernel.target->name)+1) != 0) { + if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } @@ -1610,6 +1601,8 @@ translate_compat_table(struct net *net, if (!newinfo) goto out_unlock; + memset(newinfo->entries, 0, size); + newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 16599bae11dd..28bcde0a2749 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -478,6 +478,7 @@ static struct xt_target clusterip_tg_reg __read_mostly = { .checkentry = clusterip_tg_check, .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), + .usersize = offsetof(struct ipt_clusterip_tgt_info, config), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), #endif /* CONFIG_COMPAT */ diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 78cc64eddfc1..32a363465e0a 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -92,7 +92,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_oif = 0; flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = RT_TOS(iph->tos); + flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert; diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index b3ca21b2ba9b..ddbf93e70069 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -156,8 +156,7 @@ pptp_outbound_pkt(struct sk_buff *skb, break; default: pr_debug("unknown outbound packet 0x%04x:%s\n", msg, - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : - pptp_msg_name[0]); + pptp_msg_name(msg)); /* fall through */ case PPTP_SET_LINK_INFO: /* only need to NAT in case PAC is behind NAT box */ @@ -250,9 +249,7 @@ pptp_inbound_pkt(struct sk_buff *skb, pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID); break; default: - pr_debug("unknown inbound packet %s\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : - pptp_msg_name[0]); + pr_debug("unknown inbound packet %s\n", pptp_msg_name(msg)); /* fall through */ case PPTP_START_SESSION_REQUEST: case PPTP_START_SESSION_REPLY: diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index bf855e64fc45..0c01a270bf9f 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -28,7 +28,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, struct in_addr gw = { .s_addr = (__force __be32)regs->data[priv->sreg_addr], }; - int oif = regs->data[priv->sreg_dev]; + int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); } @@ -59,7 +59,9 @@ static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); - if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr)) + goto nla_put_failure; + if (priv->sreg_dev && nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c0b633ee6c1e..cc04672d4d8c 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -145,10 +145,12 @@ fail: } EXPORT_SYMBOL_GPL(ping_get_port); -void ping_hash(struct sock *sk) +int ping_hash(struct sock *sk) { pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." */ + + return 0; } void ping_unhash(struct sock *sk) @@ -803,7 +805,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + fl4.fl4_icmp_type = user_icmph.type; + fl4.fl4_icmp_code = user_icmph.code; + + security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -976,6 +981,7 @@ bool ping_rcv(struct sk_buff *skb) struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); + bool rc = false; /* We assume the packet has already been checked by icmp_rcv */ @@ -990,14 +996,15 @@ bool ping_rcv(struct sk_buff *skb) struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); pr_debug("rcv on socket %p\n", sk); - if (skb2) - ping_queue_rcv_skb(sk, skb2); + if (skb2 && !ping_queue_rcv_skb(sk, skb2)) + rc = true; sock_put(sk); - return true; } - pr_debug("no socket, dropping\n"); - return false; + if (!rc) + pr_debug("no socket, dropping\n"); + + return rc; } EXPORT_SYMBOL_GPL(ping_rcv); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 01f8d24f3af7..fc2fb170171b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; -void raw_hash_sk(struct sock *sk) +int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *head; @@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk) sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&h->lock); + + return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); @@ -507,9 +509,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; /* hdrincl should be READ_ONCE(inet->hdrincl) - * but READ_ONCE() doesn't work with bit fields + * but READ_ONCE() doesn't work with bit fields. + * Doing this indirectly yields the same result. */ hdrincl = inet->hdrincl; + hdrincl = READ_ONCE(hdrincl); /* * Check the flags. */ @@ -626,7 +630,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto done; } - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -707,6 +711,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int ret = -EINVAL; int chk_addr_ret; + lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); @@ -719,7 +724,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; -out: return ret; +out: + release_sock(sk); + return ret; } /* diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6e17149b0983..f79f9a6dd046 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -70,6 +70,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -131,8 +132,6 @@ static int ip_rt_min_advmss __read_mostly = 256; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; -static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; - /* * Interface to generic destination cache. */ @@ -273,6 +272,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } + (*pos)++; return NULL; } @@ -464,8 +464,10 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, return neigh_create(&arp_tbl, pkey, dev); } -#define IP_IDENTS_SZ 2048u - +/* Hash tables of size 2048..262144 depending on RAM size. + * Each bucket uses 8 bytes. + */ +static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; @@ -475,15 +477,23 @@ static u32 *ip_tstamps __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; - atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(*p_tstamp); - u32 now = (u32)jiffies; + u32 bucket, old, now = (u32)jiffies; + atomic_t *p_id; + u32 *p_tstamp; u32 delta = 0; + bucket = hash & ip_idents_mask; + p_tstamp = ip_tstamps + bucket; + p_id = ip_idents + bucket; + old = ACCESS_ONCE(*p_tstamp); + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); + /* If UBSAN reports an error there, please make sure your compiler + * supports -fno-strict-overflow before reporting it that was a bug + * in UBSAN, and it has been fixed in GCC-8. + */ return atomic_add_return(segs + delta, p_id) - segs; } EXPORT_SYMBOL(ip_idents_reserve); @@ -590,18 +600,25 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) } } -static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) +static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { - struct fib_nh_exception *fnhe, *oldest; + struct fib_nh_exception __rcu **fnhe_p, **oldest_p; + struct fib_nh_exception *fnhe, *oldest = NULL; - oldest = rcu_dereference(hash->chain); - for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; - fnhe = rcu_dereference(fnhe->fnhe_next)) { - if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { + fnhe = rcu_dereference_protected(*fnhe_p, + lockdep_is_held(&fnhe_lock)); + if (!fnhe) + break; + if (!oldest || + time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; + oldest_p = fnhe_p; + } } fnhe_flush_routes(oldest); - return oldest; + *oldest_p = oldest->fnhe_next; + kfree_rcu(oldest, rcu); } static inline u32 fnhe_hashfun(__be32 daddr) @@ -678,16 +695,21 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, if (rt) fill_route_from_fnhe(rt, fnhe); } else { - if (depth > FNHE_RECLAIM_DEPTH) - fnhe = fnhe_oldest(hash); - else { - fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); - if (!fnhe) - goto out_unlock; - - fnhe->fnhe_next = hash->chain; - rcu_assign_pointer(hash->chain, fnhe); + /* Randomize max depth to avoid some side channels attacks. */ + int max_depth = FNHE_RECLAIM_DEPTH + + prandom_u32_max(FNHE_RECLAIM_DEPTH); + + while (depth > max_depth) { + fnhe_remove_oldest(hash); + depth--; } + + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; @@ -695,6 +717,8 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = expires; + rcu_assign_pointer(hash->chain, fnhe); + /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. @@ -898,7 +922,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (peer->rate_tokens == 0 || + if (peer->n_redirects == 0 || time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->n_redirects)))) { @@ -989,21 +1013,22 @@ out: kfree_skb(skb); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; + u32 old_mtu = ipv4_mtu(dst); struct fib_result res; bool lock = false; if (ip_mtu_locked(dst)) return; - if (ipv4_mtu(dst) < mtu) + if (old_mtu < mtu) return; if (mtu < ip_rt_min_pmtu) { lock = true; - mtu = ip_rt_min_pmtu; + mtu = min(old_mtu, ip_rt_min_pmtu); } - if (rt->rt_pmtu == mtu && + if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) return; @@ -1501,9 +1526,9 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #endif } -static struct rtable *rt_dst_alloc(struct net_device *dev, - unsigned int flags, u16 type, - bool nopolicy, bool noxfrm, bool will_cache) +struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, + bool nopolicy, bool noxfrm, bool will_cache) { struct rtable *rt; @@ -1532,6 +1557,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, return rt; } +EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2721,6 +2747,7 @@ void ip_rt_multicast_event(struct in_device *in_dev) static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; +static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, void __user *buffer, @@ -2947,18 +2974,27 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { + void *idents_hash; int rc = 0; int cpu; - ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); - if (!ip_idents) - panic("IP: failed to allocate ip_idents\n"); + /* For modern hosts, this will use 2 MB of memory */ + idents_hash = alloc_large_system_hash("IP idents", + sizeof(*ip_idents) + sizeof(*ip_tstamps), + 0, + 16, /* one bucket per 64 KB */ + 0, + NULL, + &ip_idents_mask, + 2048, + 256*1024); + + ip_idents = idents_hash; - prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); - ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); - if (!ip_tstamps) - panic("IP: failed to allocate ip_tstamps\n"); + ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); + memset(ip_tstamps, 0, (ip_idents_mask + 1) * sizeof(*ip_tstamps)); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4cebe913a0b3..6acc73d3c83d 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -307,7 +307,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct rtable *rt; __u8 rcv_wscale; struct flowi4 fl4; @@ -382,7 +382,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { reqsk_free(req); @@ -391,8 +391,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index abae27d60f67..f5cfc467c920 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2268,8 +2268,12 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); + tp->total_retrans = 0; inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 * issue in __tcp_select_window() @@ -2281,6 +2285,8 @@ int tcp_disconnect(struct sock *sk, int flags) dst_release(sk->sk_rx_dst); sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); + tp->segs_in = 0; + tp->segs_out = 0; tp->bytes_acked = 0; tp->bytes_received = 0; @@ -2599,10 +2605,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: - if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) - err = tp->af_specific->md5_parse(sk, optval, optlen); - else - err = -EINVAL; + err = tp->af_specific->md5_parse(sk, optval, optlen); break; #endif case TCP_USER_TIMEOUT: @@ -3099,9 +3102,12 @@ EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { + u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; - sg_init_one(&sg, key->key, key->keylen); + sg_init_one(&sg, key->key, keylen); + + /* tcp_md5_do_add() might change key->key under us */ return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); } EXPORT_SYMBOL(tcp_md5_hash_key); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index aafe68134763..f6b64efc22e8 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -201,7 +201,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; - if (sk->sk_state != TCP_CLOSE) + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 448c2615fece..e0b3b194b604 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -342,8 +342,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; if (tcp_in_slow_start(tp)) { - if (hystart && after(ack, ca->end_seq)) - bictcp_hystart_reset(sk); acked = tcp_slow_start(tp, acked); if (!acked) return; @@ -394,6 +392,9 @@ static void hystart_update(struct sock *sk, u32 delay) if (ca->found & hystart_detect) return; + if (after(tp->snd_una, ca->end_seq)) + bictcp_hystart_reset(sk); + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock(); @@ -414,6 +415,8 @@ static void hystart_update(struct sock *sk, u32 delay) if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ + if (ca->curr_rtt > delay) + ca->curr_rtt = delay; if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { if (ca->curr_rtt == 0 || ca->curr_rtt > delay) ca->curr_rtt = delay; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2f4896377fbc..14ed25f50ed5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -896,9 +896,10 @@ static void tcp_update_reordering(struct sock *sk, const int metric, /* This must be called before lost_out is incremented */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { - if (!tp->retransmit_skb_hint || - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || + (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq))) tp->retransmit_skb_hint = skb; if (!tp->lost_out || @@ -1686,8 +1687,11 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } /* Ignore very old stuff early */ - if (!after(sp[used_sacks].end_seq, prior_snd_una)) + if (!after(sp[used_sacks].end_seq, prior_snd_una)) { + if (i == 0) + first_sack_index = -1; continue; + } used_sacks++; } @@ -2923,7 +2927,10 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; + struct rtt_meas rttm = { + .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), + .ts = now, + }; u32 elapsed; /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ @@ -3511,10 +3518,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) } } -/* This routine deals with acks during a TLP episode. - * We mark the end of a TLP episode on receiving TLP dupack or when - * ack is after tlp_high_seq. - * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. +/* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { @@ -3523,7 +3528,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) if (before(ack, tp->tlp_high_seq)) return; - if (flag & FLAG_DSACKING_ACK) { + if (!tp->tlp_retrans) { + /* TLP of new data has been acknowledged */ + tp->tlp_high_seq = 0; + } else if (flag & FLAG_DSACKING_ACK) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { @@ -4459,7 +4467,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; @@ -4539,7 +4551,11 @@ add_sack: tcp_sack_new_ofo_skb(sk, seq, end_seq); end: if (skb) { - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb); skb_set_owner_r(skb, sk); } } @@ -5517,6 +5533,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; + } else { + tcp_update_wl(tp, TCP_SKB_CB(skb)->seq); } __tcp_ack_snd_check(sk, 0); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1763e02103a3..4a9d8c117794 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -277,7 +277,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; - mtu = tcp_sk(sk)->mtu_info; + mtu = READ_ONCE(tcp_sk(sk)->mtu_info); dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -444,7 +444,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (sk->sk_state == TCP_LISTEN) goto out; - tp->mtu_info = info; + WRITE_ONCE(tp->mtu_info, info); if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); } else { @@ -643,8 +643,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(net, - &tcp_hashinfo, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, + ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ @@ -935,9 +935,18 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key = tcp_md5_do_lookup(sk, addr, family); if (key) { - /* Pre-existing entry - just update that one. */ + /* Pre-existing entry - just update that one. + * Note that the key might be used concurrently. + */ memcpy(key->key, newkey, newkeylen); - key->keylen = newkeylen; + + /* Pairs with READ_ONCE() in tcp_md5_hash_key(). + * Also note that a reader could catch new key->keylen value + * but old key->key[], this is the reason we use __GFP_ZERO + * at sock_kmalloc() time below these lines. + */ + WRITE_ONCE(key->keylen, newkeylen); + return 0; } @@ -954,7 +963,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, rcu_assign_pointer(tp->md5sig_info, md5sig); } - key = sock_kmalloc(sk, sizeof(*key), gfp); + key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); if (!key) return -ENOMEM; if (!tcp_alloc_md5sig_pool()) { @@ -1622,7 +1631,8 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->sacked = 0; lookup: - sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, + th->dest); if (!sk) goto no_tcp_socket; @@ -1745,7 +1755,8 @@ do_time_wait: switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), - &tcp_hashinfo, + &tcp_hashinfo, skb, + __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -2035,6 +2046,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) static void *tcp_seek_last_pos(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; + int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; @@ -2045,7 +2057,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) break; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_next(seq, NULL); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; @@ -2056,7 +2068,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e65c211d3f4b..17388adedc7d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -710,8 +710,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); - size += TCPOLEN_SACK_BASE_ALIGNED + - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + if (likely(opts->num_sack_blocks)) + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } return size; @@ -1352,6 +1353,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } +EXPORT_SYMBOL(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) @@ -1500,7 +1502,8 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * window, and remember whether we were cwnd-limited then. */ if (!before(tp->snd_una, tp->max_packets_seq) || - tp->packets_out > tp->max_packets_out) { + tp->packets_out > tp->max_packets_out || + is_cwnd_limited) { tp->max_packets_out = tp->packets_out; tp->max_packets_seq = tp->snd_nxt; tp->is_cwnd_limited = is_cwnd_limited; @@ -2147,6 +2150,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } + /* Argh, we hit an empty skb(), presumably a thread + * is sleeping in sendmsg()/sk_stream_wait_memory(). + * We do not want to send a pure-ack packet and have + * a strange looking rtx queue with empty packet(s). + */ + if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) + break; + if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2163,6 +2174,10 @@ repair: break; } + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); + if (likely(sent_pkts || is_cwnd_limited)) + tcp_cwnd_validate(sk, is_cwnd_limited); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2170,8 +2185,6 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); - is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); - tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return !tp->packets_out && tcp_send_head(sk); @@ -2262,6 +2275,11 @@ void tcp_send_loss_probe(struct sock *sk) int pcount; int mss = tcp_current_mss(sk); + /* At most one outstanding TLP */ + if (tp->tlp_high_seq) + goto rearm_timer; + + tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb) { if (tcp_snd_wnd_test(tp, skb, mss)) { @@ -2284,10 +2302,6 @@ void tcp_send_loss_probe(struct sock *sk) return; } - /* At most one outstanding TLP retransmission. */ - if (tp->tlp_high_seq) - goto rearm_timer; - if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; @@ -2308,10 +2322,12 @@ void tcp_send_loss_probe(struct sock *sk) if (__tcp_retransmit_skb(sk, skb)) goto rearm_timer; + tp->tlp_retrans = 1; + +probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; -probe_sent: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 28fed4ade750..f336d87fe188 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -370,7 +370,7 @@ static void tcp_probe_timer(struct sock *sk) return; } - if (icsk->icsk_probes_out > max_probes) { + if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ce90a12b1e6e..1cf00a9d23f5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -113,6 +113,7 @@ #include <trace/events/skb.h> #include <net/busy_poll.h> #include "udp_impl.h" +#include <net/sock_reuseport.h> struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -137,14 +138,14 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, unsigned long *bitmap, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + const struct sock *sk2, + bool match_wildcard), unsigned int log) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each(sk2, node, &hslot->head) { + sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && @@ -152,8 +153,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || !uid_eq(uid, sock_i_uid(sk2))) && - saddr_comp(sk, sk2)) { + saddr_comp(sk, sk2, true)) { if (!bitmap) return 1; __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); @@ -170,15 +172,15 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + const struct sock *sk2, + bool match_wildcard)) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); - udp_portaddr_for_each_entry(sk2, node, &hslot2->head) { + udp_portaddr_for_each_entry(sk2, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && @@ -186,8 +188,9 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || !uid_eq(uid, sock_i_uid(sk2))) && - saddr_comp(sk, sk2)) { + saddr_comp(sk, sk2, true)) { res = 1; break; } @@ -196,6 +199,31 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, return res; } +static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) +{ + struct net *net = sock_net(sk); + kuid_t uid = sock_i_uid(sk); + struct sock *sk2; + + sk_for_each(sk2, &hslot->head) { + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && + (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + (*saddr_same)(sk, sk2, false)) { + return reuseport_add_sock(sk, sk2); + } + } + + return reuseport_alloc(sk); +} + /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * @@ -207,7 +235,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, */ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + const struct sock *sk2, + bool match_wildcard), unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; @@ -295,17 +324,31 @@ found: udp_sk(sk)->udp_port_hash = snum; udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { - sk_nulls_add_node_rcu(sk, &hslot->head); + if (sk->sk_reuseport && + udp_reuseport_add_sock(sk, hslot, saddr_comp)) { + inet_sk(sk)->inet_num = 0; + udp_sk(sk)->udp_port_hash = 0; + udp_sk(sk)->udp_portaddr_hash ^= snum; + goto fail_unlock; + } + + sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &hslot2->head); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + else + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } + sock_set_flag(sk, SOCK_RCU_FREE); error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); @@ -314,13 +357,22 @@ fail: } EXPORT_SYMBOL(udp_lib_get_port); -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses + * match_wildcard == false: addresses must be exactly the same, i.e. + * 0.0.0.0 only equals to 0.0.0.0 + */ +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, + bool match_wildcard) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); - return (!ipv6_only_sock(sk2) && - (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr || - inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); + if (!ipv6_only_sock(sk2)) { + if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr) + return 1; + if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr) + return match_wildcard; + } + return 0; } static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, @@ -446,28 +498,31 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, unsigned int slot2) + struct udp_hslot *hslot2, unsigned int slot2, + struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; u32 hash = 0; -begin: result = NULL; badness = 0; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score2(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); + result = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (result) + return result; matches = 1; } + badness = score; + result = sk; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -475,22 +530,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot2) - goto begin; - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score2(result, net, saddr, sport, - daddr, hnum, dif) < badness)) { - sock_put(result); - goto begin; - } - } return result; } @@ -499,17 +538,15 @@ begin: */ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, - int dif, struct udp_table *udptable) + int dif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; u32 hash = 0; - rcu_read_lock(); if (hslot->count > 10) { hash2 = udp4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -519,7 +556,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); if (!result) { hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; @@ -529,26 +566,29 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); } - rcu_read_unlock(); return result; } begin: result = NULL; badness = 0; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); + result = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -556,24 +596,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score(result, net, saddr, hnum, sport, - daddr, dport, dif) < badness)) { - sock_put(result); - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); @@ -586,15 +608,27 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), - udptable); + udptable, skb); } +/* Must be called under rcu_read_lock(). + * Does increment socket refcount. + */ +#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); + struct sock *sk; + + sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, + dif, &udp_table, NULL); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + return sk; } EXPORT_SYMBOL_GPL(udp4_lib_lookup); +#endif static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, @@ -640,7 +674,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, udptable); + iph->saddr, uh->source, skb->dev->ifindex, udptable, + NULL); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; /* No socket for error */ @@ -695,7 +730,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk->sk_err = err; sk->sk_error_report(sk); out: - sock_put(sk); + return; } void udp_err(struct sk_buff *skb, u32 info) @@ -1041,7 +1076,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; } - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -1411,13 +1446,15 @@ void udp_lib_unhash(struct sock *sk) hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); - if (sk_nulls_del_node_init_rcu(sk)) { + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); } @@ -1438,22 +1475,28 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); udp_sk(sk)->udp_portaddr_hash = newhash; - if (hslot2 != nhslot2) { + + if (hslot2 != nhslot2 || + rcu_access_pointer(sk->sk_reuseport_cb)) { hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); /* we must lock primary chain too */ spin_lock_bh(&hslot->lock); - - spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); - hslot2->count--; - spin_unlock(&hslot2->lock); - - spin_lock(&nhslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &nhslot2->head); - nhslot2->count++; - spin_unlock(&nhslot2->lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &nhslot2->head); + nhslot2->count++; + spin_unlock(&nhslot2->lock); + } spin_unlock_bh(&hslot->lock); } @@ -1564,7 +1607,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets */ - if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { /* * MIB statistics other than incrementing the error count are @@ -1595,8 +1638,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } - if (rcu_access_pointer(sk->sk_filter) && - udp_lib_checksum_complete(skb)) + if (udp_lib_checksum_complete(skb)) goto csum_error; if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { @@ -1628,35 +1670,6 @@ drop: return -1; } -static void flush_stack(struct sock **stack, unsigned int count, - struct sk_buff *skb, unsigned int final) -{ - unsigned int i; - struct sk_buff *skb1 = NULL; - struct sock *sk; - - for (i = 0; i < count; i++) { - sk = stack[i]; - if (likely(!skb1)) - skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); - - if (!skb1) { - atomic_inc(&sk->sk_drops); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - } - - if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) - skb1 = NULL; - - sock_put(sk); - } - if (unlikely(skb1)) - kfree_skb(skb1); -} - /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ @@ -1680,41 +1693,46 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udp_table *udptable, int proto) { - struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct hlist_nulls_node *node; + struct sock *sk, *first = NULL; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - int dif = skb->dev->ifindex; - unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - bool inner_flushed = false; + unsigned int offset = offsetof(typeof(*sk), sk_node); + int dif = skb->dev->ifindex; + struct hlist_node *node; + struct sk_buff *nskb; if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & - udp_table.mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask; + udptable->mask; + hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udp_table.hash2[hash2]; + hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } - spin_lock(&hslot->lock); - sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { - if (__udp_is_mcast_sock(net, sk, - uh->dest, daddr, - uh->source, saddr, - dif, hnum)) { - if (unlikely(count == ARRAY_SIZE(stack))) { - flush_stack(stack, count, skb, ~0); - inner_flushed = true; - count = 0; - } - stack[count++] = sk; - sock_hold(sk); + sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { + if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, + uh->source, saddr, dif, hnum)) + continue; + + if (!first) { + first = sk; + continue; } - } + nskb = skb_clone(skb, GFP_ATOMIC); - spin_unlock(&hslot->lock); + if (unlikely(!nskb)) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + continue; + } + if (udp_queue_rcv_skb(sk, nskb) > 0) + consume_skb(nskb); + } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { @@ -1722,16 +1740,13 @@ start_lookup: goto start_lookup; } - /* - * do the slow work with no lock held - */ - if (count) { - flush_stack(stack, count, skb, count - 1); + if (first) { + if (udp_queue_rcv_skb(first, skb) > 0) + consume_skb(skb); } else { - if (!inner_flushed) - UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); - consume_skb(skb); + kfree_skb(skb); + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -1760,8 +1775,11 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, } } - return skb_checksum_init_zero_check(skb, proto, uh->check, - inet_compute_pseudo); + /* Note, we are only interested in != 0 or == 0, thus the + * force to int. + */ + return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + inet_compute_pseudo); } /* @@ -1833,7 +1851,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); - sock_put(sk); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 @@ -1894,49 +1911,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, int dif) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); - unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); + unsigned int slot = udp_hashfn(net, hnum, udp_table.mask); struct udp_hslot *hslot = &udp_table.hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL; - rcu_read_lock(); -begin: - count = 0; result = NULL; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { - if (__udp_is_mcast_sock(net, sk, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) { + sk_for_each_rcu(sk, &hslot->head) { + if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, + rmt_port, rmt_addr, dif, hnum)) { + if (result) + return NULL; result = sk; - ++count; - } - } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { - if (count != 1 || - unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!__udp_is_mcast_sock(net, result, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum))) { - sock_put(result); - result = NULL; } } - rcu_read_unlock(); + return result; } @@ -1949,37 +1941,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 rmt_port, __be32 rmt_addr, int dif) { - struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + struct sock *sk; - rcu_read_lock(); - result = NULL; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { - if (INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, ports, dif)) - result = sk; + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { + if (INET_MATCH(sk, net, acookie, rmt_addr, + loc_addr, ports, dif)) + return sk; /* Only check first socket in chain */ break; } - - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, - ports, dif))) { - sock_put(result); - result = NULL; - } - } - rcu_read_unlock(); - return result; + return NULL; } void udp_v4_early_demux(struct sk_buff *skb) @@ -1987,7 +1964,7 @@ void udp_v4_early_demux(struct sk_buff *skb) struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct udphdr *uh; - struct sock *sk; + struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; int ours; @@ -2019,11 +1996,9 @@ void udp_v4_early_demux(struct sk_buff *skb) } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); - } else { - return; } - if (!sk) + if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2)) return; skb->sk = sk; @@ -2316,7 +2291,6 @@ struct proto udp_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, @@ -2338,14 +2312,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) for (state->bucket = start; state->bucket <= state->udp_table->mask; ++state->bucket) { - struct hlist_nulls_node *node; struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == state->family) @@ -2364,7 +2337,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) struct net *net = seq_file_net(seq); do { - sk = sk_nulls_next(sk); + sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { @@ -2488,7 +2461,7 @@ int udp4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_puts(seq, " sl local_address rem_address st tx_queue " + seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { @@ -2579,12 +2552,12 @@ void __init udp_table_init(struct udp_table *table, const char *name) table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); + INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); + INIT_HLIST_HEAD(&table->hash2[i].head); table->hash2[i].count = 0; spin_lock_init(&table->hash2[i].lock); } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 092aa60e8b92..9a89c10a55f0 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -36,15 +36,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { int err = -EINVAL; - struct sock *sk; + struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); + rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_if, tbl); + req->id.idiag_if, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, @@ -52,11 +53,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, - req->id.idiag_if, tbl); + req->id.idiag_if, tbl, NULL); #endif - else - goto out_nosk; - + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; @@ -97,25 +98,24 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { - int num, s_num, slot, s_slot; - struct net *net = sock_net(skb->sk); bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { - struct sock *sk; - struct hlist_nulls_node *node; struct udp_hslot *hslot = &table->hash[slot]; + struct sock *sk; num = 0; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) @@ -182,7 +182,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, - req->id.idiag_if, tbl); + req->id.idiag_if, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && @@ -190,7 +190,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, - req->id.idiag_if, tbl); + req->id.idiag_if, tbl, NULL); else sk = __udp6_lib_lookup(net, @@ -198,7 +198,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, - req->id.idiag_if, tbl); + req->id.idiag_if, tbl, NULL); } #endif else { diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6dfc3daf7c21..2a37f367dc04 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -300,7 +300,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, int flush = 1; if (NAPI_GRO_CB(skb)->encap_mark || - (skb->ip_summed != CHECKSUM_PARTIAL && + (uh->check && skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid)) goto out; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 78766b32b78b..705d9fbf0bcf 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -55,7 +55,6 @@ struct proto udplite_prot = { .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, .obj_size = sizeof(struct udp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 7ee6518afa86..73705a2368d9 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -75,9 +75,7 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); -#ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; -#endif return xfrm_output(sk, skb); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e1ee6515cd4e..55c1cbfdf1ff 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -569,7 +569,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_ATOMIC); + skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; @@ -581,7 +581,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err); @@ -800,7 +800,14 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) } if (p == &net->ipv6.devconf_all->forwarding) { + int old_dflt = net->ipv6.devconf_dflt->forwarding; + net->ipv6.devconf_dflt->forwarding = newf; + if ((!newf) ^ (!old_dflt)) + inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + addrconf_forward_change(net, newf); if ((!newf) ^ (!old)) inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, @@ -2295,6 +2302,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_dst_len = 8, .fc_flags = RTF_UP, .fc_nlinfo.nl_net = dev_net(dev), + .fc_protocol = RTPROT_KERNEL, }; ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); @@ -2389,6 +2397,12 @@ static void manage_tempaddrs(struct inet6_dev *idev, } } +static bool is_addr_mode_generate_stable(struct inet6_dev *idev) +{ + return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY || + idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; +} + void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; @@ -2505,8 +2519,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) in6_dev->token.s6_addr + 8, 8); read_unlock_bh(&in6_dev->lock); tokenized = true; - } else if (in6_dev->addr_gen_mode == - IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + } else if (is_addr_mode_generate_stable(in6_dev) && !ipv6_generate_stable_address(&addr, 0, in6_dev)) { addr_flags |= IFA_F_STABLE_PRIVACY; @@ -3106,6 +3119,17 @@ retry: return 0; } +static void ipv6_gen_mode_random_init(struct inet6_dev *idev) +{ + struct ipv6_stable_secret *s = &idev->cnf.stable_secret; + + if (s->initialized) + return; + s = &idev->cnf.stable_secret; + get_random_bytes(&s->secret, sizeof(s->secret)); + s->initialized = true; +} + static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) { struct in6_addr addr; @@ -3116,13 +3140,18 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { + switch (idev->addr_gen_mode) { + case IN6_ADDR_GEN_MODE_RANDOM: + ipv6_gen_mode_random_init(idev); + /* fallthrough */ + case IN6_ADDR_GEN_MODE_STABLE_PRIVACY: if (!ipv6_generate_stable_address(&addr, 0, idev)) addrconf_add_linklocal(idev, &addr, IFA_F_STABLE_PRIVACY); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); - } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { + break; + case IN6_ADDR_GEN_MODE_EUI64: /* addrconf_add_linklocal also adds a prefix_route and we * only need to care about prefix routes if ipv6_generate_eui64 * couldn't generate one. @@ -3131,6 +3160,11 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + break; + case IN6_ADDR_GEN_MODE_NONE: + default: + /* will not add any link local address */ + break; } } @@ -3148,6 +3182,7 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_IEEE1394) && (dev->type != ARPHRD_TUNNEL6) && (dev->type != ARPHRD_6LOWPAN) && + (dev->type != ARPHRD_NONE) && (dev->type != ARPHRD_RAWIP) && (dev->type != ARPHRD_INFINIBAND)) { /* Alas, we support only Ethernet autoconfiguration. */ @@ -3158,6 +3193,11 @@ static void addrconf_dev_config(struct net_device *dev) if (IS_ERR(idev)) return; + /* this device type has no EUI support */ + if (dev->type == ARPHRD_NONE && + idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) + idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; + addrconf_addr_gen(idev, false); } @@ -3215,6 +3255,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_changeupper_info *info; struct inet6_dev *idev = __in6_dev_get(dev); struct net *net = dev_net(dev); int run_pending = 0; @@ -3376,6 +3417,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, case NETDEV_POST_TYPE_CHANGE: addrconf_type_change(dev, event); break; + + case NETDEV_CHANGEUPPER: + info = ptr; + + /* flush all routes if dev is linked to or unlinked from + * an L3 master device (e.g., VRF) + */ + if (info->upper_dev && netif_is_l3_master(info->upper_dev)) + addrconf_ifdown(dev, 0); } return NOTIFY_OK; @@ -4772,6 +4822,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; /* we omit DEVCONF_STABLE_SECRET for now */ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; + array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; + array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; } static inline size_t inet6_ifla6_size(void) @@ -5022,7 +5074,8 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (mode != IN6_ADDR_GEN_MODE_EUI64 && mode != IN6_ADDR_GEN_MODE_NONE && - mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY) + mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + mode != IN6_ADDR_GEN_MODE_RANDOM) return -EINVAL; if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && @@ -5859,6 +5912,20 @@ static struct addrconf_sysctl_table .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, }, { + .procname = "drop_unicast_in_l2_multicast", + .data = &ipv6_devconf.drop_unicast_in_l2_multicast, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_unsolicited_na", + .data = &ipv6_devconf.drop_unsolicited_na, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { /* sentinel */ } }, diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index bfa941fc1165..129324b36fb6 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -107,15 +107,16 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v) } EXPORT_SYMBOL(inet6addr_notifier_call_chain); -static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1, - struct dst_entry **u2, - struct flowi6 *u3) +static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + const struct in6_addr *final_dst) { - return -EAFNOSUPPORT; + return ERR_PTR(-EAFNOSUPPORT); } const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { - .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, + .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 28e03fce9e89..091a93e30e2b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -250,7 +250,11 @@ lookup_protocol: * creation time automatically shares. */ inet->inet_sport = htons(inet->inet_num); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); @@ -259,6 +263,14 @@ lookup_protocol: goto out; } } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { + sk_common_release(sk); + goto out; + } + } out: return err; out_rcu_unlock: @@ -277,6 +289,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); __be32 v4addr = 0; unsigned short snum; + bool saved_ipv6only; int addr_type = 0; int err = 0; @@ -396,19 +409,21 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; + saved_ipv6only = sk->sk_ipv6only; + if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED) + sk->sk_ipv6only = 1; + /* Make sure we are allowed to bind here. */ if ((snum || !inet->bind_address_no_port) && sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); err = -EADDRINUSE; goto out; } - if (addr_type != IPV6_ADDR_ANY) { + if (addr_type != IPV6_ADDR_ANY) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; - if (addr_type != IPV6_ADDR_MAPPED) - sk->sk_ipv6only = 1; - } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); @@ -693,14 +708,14 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { sk->sk_route_caps = 0; sk->sk_err_soft = -PTR_ERR(dst); @@ -858,7 +873,7 @@ static struct pernet_operations inet6_net_ops = { static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_join = ipv6_sock_mc_join, .ipv6_sock_mc_drop = ipv6_sock_mc_drop, - .ipv6_dst_lookup = ip6_dst_lookup, + .ipv6_dst_lookup_flow = ip6_dst_lookup_flow, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, @@ -1046,11 +1061,11 @@ netfilter_fail: igmp_fail: ndisc_cleanup(); ndisc_fail: - ip6_mr_cleanup(); + icmpv6_cleanup(); icmp_fail: - unregister_pernet_subsys(&inet6_net_ops); + ip6_mr_cleanup(); ipmr_fail: - icmpv6_cleanup(); + unregister_pernet_subsys(&inet6_net_ops); register_pernet_fail: sock_unregister(PF_INET6); rtnl_unregister_all(PF_INET6); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index e742c4deb13d..98d253d7bed3 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -595,7 +595,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); - if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN)) + err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN); + if (err) goto out_free; ip6h->priority = 0; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 514ac259f543..b831e9b2e906 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -170,7 +170,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) return 0; } -void ipv6_sock_ac_close(struct sock *sk) +void __ipv6_sock_ac_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev = NULL; @@ -178,10 +178,7 @@ void ipv6_sock_ac_close(struct sock *sk) struct net *net = sock_net(sk); int prev_index; - if (!np->ipv6_ac_list) - return; - - rtnl_lock(); + ASSERT_RTNL(); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; @@ -198,6 +195,16 @@ void ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } +} + +void ipv6_sock_ac_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + + if (!np->ipv6_ac_list) + return; + rtnl_lock(); + __ipv6_sock_ac_close(sk); rtnl_unlock(); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 674fc3861e41..7e966fe1ffd6 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -40,7 +40,8 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } -static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); @@ -173,14 +174,14 @@ ipv4_connected: if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) fl6.flowi6_oif = np->mcast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); err = 0; if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -219,6 +220,7 @@ out: fl6_sock_release(flowlabel); return err; } +EXPORT_SYMBOL_GPL(__ip6_datagram_connect); int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 44a2010e2076..37268a37312c 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -426,8 +426,10 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) sg_init_table(sg, nfrags); ret = skb_to_sgvec(skb, sg, 0, skb->len); - if (unlikely(ret < 0)) + if (unlikely(ret < 0)) { + kfree(tmp); goto out; + } aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); aead_request_set_ad(req, assoclen); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 3ae2fbe07b25..4e6755e99318 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -446,6 +446,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; + else { + dst = skb_dst(skb); + iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev); + } /* * Must not send error if the source does not uniquely @@ -480,7 +484,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); sk = icmpv6_xmit_lock(net); if (!sk) @@ -501,9 +505,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - if (!fl6.flowi6_oif) - fl6.flowi6_oif = l3mdev_master_ifindex(skb->dev); - dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out; @@ -588,7 +589,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); sk = icmpv6_xmit_lock(net); if (!sk) @@ -834,7 +835,7 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, fl6->fl6_icmp_type = type; fl6->fl6_icmp_code = 0; fl6->flowi6_oif = oif; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } static int __net_init icmpv6_sk_init(struct net *net) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index dc79ebc14189..ceeb3d221db5 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -26,6 +26,7 @@ #include <net/ip6_route.h> #include <net/sock.h> #include <net/inet6_connection_sock.h> +#include <net/sock_reuseport.h> int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax) @@ -48,15 +49,16 @@ int inet6_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid((struct sock *)sk2))))) { - if (ipv6_rcv_saddr_equal(sk, sk2)) + if (ipv6_rcv_saddr_equal(sk, sk2, true)) break; } if (!relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN && - ipv6_rcv_saddr_equal(sk, sk2)) + ipv6_rcv_saddr_equal(sk, sk2, true)) break; } } @@ -87,9 +89,9 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); fl6->flowi6_uid = sk->sk_uid; - security_req_classify_flow(req, flowi6_to_flowi(fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) return NULL; @@ -136,7 +138,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; fl6->flowi6_uid = sk->sk_uid; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); @@ -144,7 +146,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) ip6_dst_store(sk, dst, NULL, NULL); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 21ace5a2bf7c..c27207563d10 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -17,11 +17,13 @@ #include <linux/module.h> #include <linux/random.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -121,7 +123,9 @@ static inline int compute_score(struct sock *sk, struct net *net, } struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { @@ -129,6 +133,7 @@ struct sock *inet6_lookup_listener(struct net *net, const struct hlist_nulls_node *node; struct sock *result; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -146,6 +151,15 @@ begin: if (reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -163,11 +177,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -177,6 +193,7 @@ begin: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) @@ -184,7 +201,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sock *sk; local_bh_disable(); - sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif); + sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + ntohs(dport), dif); local_bh_enable(); return sk; @@ -274,3 +292,61 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); + +int inet6_hash(struct sock *sk) +{ + int err = 0; + + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal); + local_bh_enable(); + } + + return err; +} +EXPORT_SYMBOL_GPL(inet6_hash); + +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + return 1; + if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + return match_wildcard; + } + return 0; + } + + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && match_wildcard && + !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal); diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 391a8fedb27e..1132624edee9 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -84,9 +84,12 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) * we accept a checksum of zero here. When we find the socket * for the UDP packet we'll check if that socket allows zero checksum * for IPv6 (set by socket option). + * + * Note, we are only interested in != 0 or == 0, thus the + * force to int. */ - return skb_checksum_init_zero_check(skb, proto, uh->check, - ip6_compute_pseudo); + return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + ip6_compute_pseudo); } EXPORT_SYMBOL(udp6_csum_init); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bf3824b59597..24e91d438f21 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -780,8 +780,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, found++; break; } - if (rt_can_ecmp) - fallback_ins = fallback_ins ?: ins; + fallback_ins = fallback_ins ?: ins; goto next_iter; } @@ -821,7 +820,9 @@ next_iter: } if (fallback_ins && !found) { - /* No ECMP-able route found, replace first non-ECMP one */ + /* No matching route with same ecmp-able-ness found, replace + * first matching route + */ ins = fallback_ins; iter = *ins; found++; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 6e496c3dd8ef..17eee5083548 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -125,6 +125,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, int dev_type = (gre_proto == htons(ETH_P_TEB)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4; + struct net_device *ndev; for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || @@ -227,9 +228,9 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, if (cand) return cand; - dev = ign->fb_tunnel_dev; - if (dev->flags & IFF_UP) - return netdev_priv(dev); + ndev = READ_ONCE(ign->fb_tunnel_dev); + if (ndev && ndev->flags & IFF_UP) + return netdev_priv(ndev); return NULL; } @@ -349,7 +350,6 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, if (!(nt->parms.o_flags & GRE_SEQ)) dev->features |= NETIF_F_LLTX; - dev_hold(dev); ip6gre_tunnel_link(ign, nt); return nt; @@ -364,6 +364,8 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6gre_tunnel_unlink(ign, t); + if (ign->fb_tunnel_dev == dev) + WRITE_ONCE(ign->fb_tunnel_dev, NULL); dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -1311,8 +1313,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) strcpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; - - dev_hold(dev); } @@ -1356,15 +1356,16 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) static int __net_init ip6gre_init_net(struct net *net) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + struct net_device *ndev; int err; - ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", - NET_NAME_UNKNOWN, - ip6gre_tunnel_setup); - if (!ign->fb_tunnel_dev) { + ndev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", + NET_NAME_UNKNOWN, ip6gre_tunnel_setup); + if (!ndev) { err = -ENOMEM; goto err_alloc_dev; } + ign->fb_tunnel_dev = ndev; dev_net_set(ign->fb_tunnel_dev, net); /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. @@ -1384,7 +1385,7 @@ static int __net_init ip6gre_init_net(struct net *net) return 0; err_reg_dev: - ip6gre_dev_free(ign->fb_tunnel_dev); + ip6gre_dev_free(ndev); err_alloc_dev: return err; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index c83c0faf5ae9..31ac3c56da4b 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1) goto err; + /* If enabled, drop unicast packets that were encapsulated in link-layer + * multicast or broadcast to protected against the so-called "hole-196" + * attack in 802.11 wireless. + */ + if (!ipv6_addr_is_multicast(&hdr->daddr) && + (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) && + idev->cnf.drop_unicast_in_l2_multicast) + goto err; + /* RFC4291 2.7 * Nodes must not originate a packet to a multicast address whose scope * field contains the reserved value 0; if such a packet is received, it @@ -151,16 +161,6 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (ipv6_addr_is_multicast(&hdr->saddr)) goto err; - /* While RFC4291 is not explicit about v4mapped addresses - * in IPv6 headers, it seems clear linux dual-stack - * model can not deal properly with these. - * Security models could be fooled by ::ffff:127.0.0.1 for example. - * - * https://tools.ietf.org/html/draft-itojun-v6ops-v4mapped-harmful-02 - */ - if (ipv6_addr_v4mapped(&hdr->saddr)) - goto err; - skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e39dc94486b2..b824985c714c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -39,6 +39,7 @@ #include <linux/module.h> #include <linux/slab.h> +#include <linux/bpf-cgroup.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -65,9 +66,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * struct in6_addr *nexthop; int ret; - skb->protocol = htons(ETH_P_IPV6); - skb->dev = dev; - if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); @@ -123,6 +121,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) @@ -136,6 +142,9 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct net_device *dev = skb_dst(skb)->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + if (unlikely(idev->cnf.disable_ipv6)) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); @@ -851,7 +860,6 @@ fail_toobig: if (skb->sk && dst_allfrag(skb_dst(skb))) sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); - skb->dev = skb_dst(skb)->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); err = -EMSGSIZE; @@ -1057,13 +1065,13 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); * It returns a valid dst pointer on success, or a pointer encoded * error code. */ -struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, +struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { struct dst_entry *dst = NULL; int err; - err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); + err = ip6_dst_lookup_tail(net, sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) @@ -1071,7 +1079,7 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, if (!fl6->flowi6_oif) fl6->flowi6_oif = l3mdev_fib_oif(dst->dev); - return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); @@ -1096,7 +1104,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, dst = ip6_sk_dst_check(sk, dst, fl6); if (!dst) - dst = ip6_dst_lookup_flow(sk, fl6, final_dst); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); return dst; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 40cf35c9a2f3..842bdb2d3d8c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -261,7 +261,6 @@ static int ip6_tnl_create2(struct net_device *dev) strcpy(t->parms.name, dev->name); - dev_hold(dev); ip6_tnl_link(ip6n, t); return 0; @@ -918,12 +917,12 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0))) - pr_warn("%s xmit: Local address not yet configured!\n", - p->name); + pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", + p->name); else if (!ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr(net, raddr, NULL, 0))) - pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", - p->name); + pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", + p->name); else ret = 1; rcu_read_unlock(); @@ -972,26 +971,28 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { - struct in6_addr *addr6; - struct neighbour *neigh; - int addr_type; + if (skb->protocol == htons(ETH_P_IPV6)) { + struct in6_addr *addr6; + struct neighbour *neigh; + int addr_type; - if (!skb_dst(skb)) - goto tx_err_link_failure; + if (!skb_dst(skb)) + goto tx_err_link_failure; - neigh = dst_neigh_lookup(skb_dst(skb), - &ipv6_hdr(skb)->daddr); - if (!neigh) - goto tx_err_link_failure; + neigh = dst_neigh_lookup(skb_dst(skb), + &ipv6_hdr(skb)->daddr); + if (!neigh) + goto tx_err_link_failure; - addr6 = (struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); + addr6 = (struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); - if (addr_type == IPV6_ADDR_ANY) - addr6 = &ipv6_hdr(skb)->daddr; + if (addr_type == IPV6_ADDR_ANY) + addr6 = &ipv6_hdr(skb)->daddr; - memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); - neigh_release(neigh); + memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); + neigh_release(neigh); + } } else if (!fl6->flowi6_mark) dst = dst_cache_get(&t->dst_cache); @@ -1582,6 +1583,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) return ret; } + dev_hold(dev); return 0; } @@ -1615,7 +1617,6 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->parms.proto = IPPROTO_IPV6; - dev_hold(dev); rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 30b03d8e321a..7ec737d85ccf 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -74,8 +74,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be16 src_port, - __be16 dst_port, bool nocheck) + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -99,7 +99,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); - ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5dd544c5cfe2..9df1947e79eb 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -196,7 +196,6 @@ static int vti6_tnl_create2(struct net_device *dev) strcpy(t->parms.name, dev->name); - dev_hold(dev); vti6_tnl_link(ip6n, t); return 0; @@ -315,7 +314,7 @@ static int vti6_rcv(struct sk_buff *skb) if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); - return 0; + goto discard; } if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { @@ -324,11 +323,9 @@ static int vti6_rcv(struct sk_buff *skb) goto discard; } - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - rcu_read_unlock(); - return xfrm6_rcv(skb); + return xfrm6_rcv_tnl(skb, t); } rcu_read_unlock(); return -EINVAL; @@ -441,8 +438,35 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) int err = -1; int mtu; - if (!dst) - goto tx_err_link_failure; + if (!dst) { + switch (skb->protocol) { + case htons(ETH_P_IP): { + struct rtable *rt; + + fl->u.ip4.flowi4_oif = dev->ifindex; + fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); + if (IS_ERR(rt)) + goto tx_err_link_failure; + dst = &rt->dst; + skb_dst_set(skb, dst); + break; + } + case htons(ETH_P_IPV6): + fl->u.ip6.flowi6_oif = dev->ifindex; + fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); + if (dst->error) { + dst_release(dst); + dst = NULL; + goto tx_err_link_failure; + } + skb_dst_set(skb, dst); + break; + default: + goto tx_err_link_failure; + } + } dst_hold(dst); dst = xfrm_lookup(t->net, dst, fl, NULL, 0); @@ -736,6 +760,8 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); + memset(&p1, 0, sizeof(p1)); + switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { @@ -876,6 +902,7 @@ static inline int vti6_dev_init_gen(struct net_device *dev) dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; + dev_hold(dev); return 0; } @@ -907,7 +934,6 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) struct vti6_net *ip6n = net_generic(net, vti6_net_id); t->parms.proto = IPPROTO_IPV6; - dev_hold(dev); rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 91f16e679f63..20812e8b24dd 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1594,14 +1594,15 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) if (likely(mrt->mroute6_sk == NULL)) { mrt->mroute6_sk = sk; net->ipv6.devconf_all->mc_forwarding++; - inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, - NETCONFA_IFINDEX_ALL, - net->ipv6.devconf_all); - } - else + } else { err = -EADDRINUSE; + } write_unlock_bh(&mrt_lock); + if (!err) + inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); rtnl_unlock(); return err; @@ -1619,11 +1620,11 @@ int ip6mr_sk_done(struct sock *sk) write_lock_bh(&mrt_lock); mrt->mroute6_sk = NULL; net->ipv6.devconf_all->mc_forwarding--; + write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); - write_unlock_bh(&mrt_lock); mroute_clean_tables(mrt, false); err = 0; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 71263754b19b..4a75013a2ede 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -185,8 +185,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EBUSY; break; } - } else if (sk->sk_protocol != IPPROTO_TCP) + } else if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_prot != &tcpv6_prot) { + retv = -EBUSY; + break; + } + } else { break; + } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; @@ -201,6 +207,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, fl6_free_socklist(sk); __ipv6_sock_mc_close(sk); + __ipv6_sock_ac_close(sk); /* * Sock is moving from IPv6 to IPv4 (sk_prot), so diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 976c8133a281..636425999aac 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1573,10 +1573,7 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) IPV6_TLV_PADN, 0 }; /* we assume size > sizeof(ra) here */ - /* limit our allocations to order-0 page */ - size = min_t(int, size, SKB_MAX_ORDER(0, 0)); skb = sock_alloc_send_skb(sk, size, 1, &err); - if (!skb) return NULL; @@ -2580,6 +2577,7 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) write_unlock_bh(&idev->lock); igmp6_group_dropped(i); + ip6_mc_clear_src(i); ma_put(i); write_lock_bh(&idev->lock); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index e16a05ca4879..963ac8e2a448 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -884,6 +884,7 @@ static void ndisc_recv_na(struct sk_buff *skb) offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; + struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; @@ -903,6 +904,14 @@ static void ndisc_recv_na(struct sk_buff *skb) return; } + /* For some 802.11 wireless deployments (and possibly other networks), + * there will be a NA proxy and unsolicitd packets are attacks + * and thus should not be accepted. + */ + if (!msg->icmph.icmp6_solicited && idev && + idev->cnf.drop_unsolicited_na) + return; + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 2393e1e09d69..f57b72771e17 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -343,6 +343,7 @@ ip6t_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ + acpar.fragoff = 0; acpar.hotdrop = false; acpar.net = state->net; acpar.in = state->in; @@ -979,10 +980,6 @@ copy_entries_to_user(unsigned int total_size, return PTR_ERR(counters); loc_cpu_entry = private->entries; - if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { - ret = -EFAULT; - goto free_counters; - } /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ @@ -992,6 +989,10 @@ copy_entries_to_user(unsigned int total_size, const struct xt_entry_target *t; e = (struct ip6t_entry *)(loc_cpu_entry + off); + if (copy_to_user(userptr + off, e, sizeof(*e))) { + ret = -EFAULT; + goto free_counters; + } if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], @@ -1005,23 +1006,14 @@ copy_entries_to_user(unsigned int total_size, i += m->u.match_size) { m = (void *)e + i; - if (copy_to_user(userptr + off + i - + offsetof(struct xt_entry_match, - u.user.name), - m->u.kernel.match->name, - strlen(m->u.kernel.match->name)+1) - != 0) { + if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ip6t_get_target_c(e); - if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct xt_entry_target, - u.user.name), - t->u.kernel.target->name, - strlen(t->u.kernel.target->name)+1) != 0) { + if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } @@ -1621,6 +1613,8 @@ translate_compat_table(struct net *net, if (!newinfo) goto out_unlock; + memset(newinfo->entries, 0, size); + newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c index 590f767db5d4..a379d2f79b19 100644 --- a/net/ipv6/netfilter/ip6t_NPT.c +++ b/net/ipv6/netfilter/ip6t_NPT.c @@ -112,6 +112,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = { .table = "mangle", .target = ip6t_snpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), + .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | @@ -123,6 +124,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = { .table = "mangle", .target = ip6t_dnpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), + .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_PRE_ROUTING) | diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 7117e5bef412..96e91bbc9329 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -158,7 +158,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); - security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst == NULL || dst->error) { dst_release(dst); diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 8bfd470cbe72..831f86e1ec08 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -26,7 +26,7 @@ static void nft_dup_ipv6_eval(const struct nft_expr *expr, { struct nft_dup_ipv6 *priv = nft_expr_priv(expr); struct in6_addr *gw = (struct in6_addr *)®s->data[priv->sreg_addr]; - int oif = regs->data[priv->sreg_dev]; + int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; nf_dup_ipv6(pkt->net, pkt->skb, pkt->hook, gw, oif); } @@ -57,7 +57,9 @@ static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_dup_ipv6 *priv = nft_expr_priv(expr); - if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr)) + goto nla_put_failure; + if (priv->sreg_dev && nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 6b896cc9604e..e2de4b0479f6 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -14,29 +14,11 @@ static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { - const struct { - struct in6_addr dst; - struct in6_addr src; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .dst = *dst, - .src = *src, - }; - u32 hash, id; - - /* Note the following code is not safe, but this is okay. */ - if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) - get_random_bytes(&net->ipv4.ip_id_key, - sizeof(net->ipv4.ip_id_key)); - - hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key); - - /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, - * set the hight order instead thus minimizing possible future - * collisions. - */ - id = ip_idents_reserve(hash, 1); - if (unlikely(!id)) - id = 1 << 31; + u32 id; + + do { + id = prandom_u32(); + } while (!id); return id; } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 40b835720722..9fced723d7e6 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -144,7 +144,7 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_uid = sk->sk_uid; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); if (IS_ERR(dst)) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 30db0167bd19..ebd64b99dc98 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -757,6 +757,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int hlimit = -1; int tclass = -1; int dontfrag = -1; + int hdrincl; u16 proto; int err; @@ -770,6 +771,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + /* hdrincl should be READ_ONCE(inet->hdrincl) + * but READ_ONCE() doesn't work with bit fields. + * Doing this indirectly yields the same result. + */ + hdrincl = inet->hdrincl; + hdrincl = READ_ONCE(hdrincl); + /* * Get and verify the address. */ @@ -877,12 +885,12 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_oif = np->mcast_oif; else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - if (inet->hdrincl) + if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; @@ -900,7 +908,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto do_confirm; back_from_confirm: - if (inet->hdrincl) + if (hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags); else { lock_sock(sk); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index f1514e3d89db..7b88fac606e1 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -770,8 +770,8 @@ err_protocol: void ipv6_frag_exit(void) { - inet_frags_fini(&ip6_frags); ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + inet_frags_fini(&ip6_frags); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 11f67eaae1c1..decb94219a70 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -336,9 +336,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, return rt; } -static struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) +struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags) { struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); @@ -362,6 +362,7 @@ static struct rt6_info *ip6_dst_alloc(struct net *net, return rt; } +EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { @@ -1763,6 +1764,37 @@ static int ip6_convert_metrics(struct mx6_config *mxc, return -EINVAL; } +static struct rt6_info *ip6_nh_lookup_table(struct net *net, + struct fib6_config *cfg, + const struct in6_addr *gw_addr) +{ + struct flowi6 fl6 = { + .flowi6_oif = cfg->fc_ifindex, + .daddr = *gw_addr, + .saddr = cfg->fc_prefsrc, + }; + struct fib6_table *table; + struct rt6_info *rt; + int flags = 0; + + table = fib6_get_table(net, cfg->fc_table); + if (!table) + return NULL; + + if (!ipv6_addr_any(&cfg->fc_prefsrc)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); + + /* if table lookup failed, fall back to full lookup */ + if (rt == net->ipv6.ip6_null_entry) { + ip6_rt_put(rt); + rt = NULL; + } + + return rt; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) { struct net *net = cfg->fc_nlinfo.nl_net; @@ -1938,7 +1970,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) rt->rt6i_gateway = *gw_addr; if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - struct rt6_info *grt; + struct rt6_info *grt = NULL; /* IPv6 strictly inhibits using not link-local addresses as nexthop address. @@ -1950,7 +1982,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + if (cfg->fc_table) + grt = ip6_nh_lookup_table(net, cfg, gw_addr); + + if (!grt) + grt = rt6_lookup(net, gw_addr, NULL, + cfg->fc_ifindex, 1); err = -EHOSTUNREACH; if (!grt) @@ -2273,12 +2310,12 @@ static struct rt6_info *rt6_get_route_info(struct net_device *dev, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr) { + u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO); struct fib6_node *fn; struct rt6_info *rt = NULL; struct fib6_table *table; - table = fib6_get_table(dev_net(dev), - addrconf_rt_table(dev, RT6_TABLE_INFO)); + table = fib6_get_table(dev_net(dev), tb_id); if (!table) return NULL; @@ -2317,7 +2354,7 @@ static struct rt6_info *rt6_add_route_info(struct net_device *dev, .fc_nlinfo.nl_net = dev_net(dev), }; - cfg.fc_table = l3mdev_fib_table_by_index(dev_net(dev), dev->ifindex) ? : addrconf_rt_table(dev, RT6_TABLE_INFO); + cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO); cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; @@ -2333,11 +2370,11 @@ static struct rt6_info *rt6_add_route_info(struct net_device *dev, struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) { + u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT); struct rt6_info *rt; struct fib6_table *table; - table = fib6_get_table(dev_net(dev), - addrconf_rt_table(dev, RT6_TABLE_MAIN)); + table = fib6_get_table(dev_net(dev), tb_id); if (!table) return NULL; @@ -2940,8 +2977,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) * nexthops have been replaced by first new, the rest should * be added to it. */ - cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | - NLM_F_REPLACE); + if (cfg->fc_nlinfo.nlh) { + cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | + NLM_F_REPLACE); + cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; + } nhn++; } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 5039486c4f86..6a5442fdbd51 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -209,8 +209,6 @@ static int ipip6_tunnel_create(struct net_device *dev) dev->rtnl_link_ops = &sit_link_ops; - dev_hold(dev); - ipip6_tunnel_link(sitn, t); return 0; @@ -1079,7 +1077,6 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) if (tdev && !netif_is_l3_master(tdev)) { int t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); dev->mtu = tdev->mtu - t_hlen; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1371,7 +1368,6 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->flags = IFF_NOARP; netif_keep_dst(dev); @@ -1401,7 +1397,7 @@ static int ipip6_tunnel_init(struct net_device *dev) dev->tstats = NULL; return err; } - + dev_hold(dev); return 0; } @@ -1417,7 +1413,6 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; iph->ttl = 64; - dev_hold(dev); rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); } @@ -1586,8 +1581,11 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, } #ifdef CONFIG_IPV6_SIT_6RD - if (ipip6_netlink_6rd_parms(data, &ip6rd)) + if (ipip6_netlink_6rd_parms(data, &ip6rd)) { err = ipip6_tunnel_update_6rd(nt, &ip6rd); + if (err < 0) + unregister_netdevice_queue(dev, NULL); + } #endif return err; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7f3667635431..bc4f37ef9e1d 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -144,7 +144,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct dst_entry *dst; __u8 rcv_wscale; @@ -230,15 +230,21 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; fl6.flowi6_uid = sk->sk_uid; - security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) goto out_free; } req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + + tcp_select_initial_window(full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 27d5bc29188a..9de573f89ea7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -244,9 +244,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); final_p = fl6_update_dst(&fl6, opt, &final); - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; @@ -309,11 +309,20 @@ failure: static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; + u32 mtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; - dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info); + mtu = READ_ONCE(tcp_sk(sk)->mtu_info); + + /* Drop requests trying to increase our current mss. + * Check done in __ip6_rt_update_pmtu() is too late. + */ + if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache) + return; + + dst = inet6_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -392,6 +401,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (type == ICMPV6_PKT_TOOBIG) { + u32 mtu = ntohl(info); + /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs send out by Linux are always <576bytes so * they should go through unfragmented). @@ -402,7 +413,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!ip6_sk_accept_pmtu(sk)) goto out; - tp->mtu_info = ntohl(info); + if (mtu < IPV6_MIN_MTU) + goto out; + + WRITE_ONCE(tp->mtu_info, mtu); + if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, @@ -816,19 +831,24 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); - else + else { + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) + oif = skb->skb_iif; + fl6.flowi6_oif = oif; + } + fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network * namespace */ - dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); + dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); @@ -875,7 +895,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), - &tcp_hashinfo, &ipv6h->saddr, + &tcp_hashinfo, NULL, 0, + &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), tcp_v6_iif(skb)); if (!sk1) @@ -975,6 +996,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) goto drop; + if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { + IP6_INC_STATS_BH(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); + return 0; + } + return tcp_conn_request(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, skb); @@ -1227,9 +1253,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); - if (tcp_filter(sk, skb)) - goto discard; - /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. @@ -1398,8 +1421,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, - inet6_iif(skb)); + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), + th->source, th->dest, inet6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1530,6 +1553,7 @@ do_time_wait: struct sock *sk2; sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif(skb)); @@ -1899,7 +1923,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e9db0ff606b9..e670c45829bd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> +#include <net/addrconf.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> @@ -48,6 +49,7 @@ #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/busy_poll.h> +#include <net/sock_reuseport.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -77,34 +79,6 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) -{ - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - - /* if both are mapped, treat as IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) - return (!sk2_ipv6only && - (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr || - sk->sk_rcv_saddr == sk2->sk_rcv_saddr)); - - if (addr_type2 == IPV6_ADDR_ANY && - !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; - - if (addr_type == IPV6_ADDR_ANY && - !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; - - if (sk2_rcv_saddr6 && - ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) - return 1; - - return 0; -} - static u32 udp6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) @@ -236,28 +210,32 @@ static inline int compute_score2(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, unsigned int slot2) + struct udp_hslot *hslot2, unsigned int slot2, + struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; u32 hash = 0; -begin: result = NULL; badness = -1; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score2(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); + + result = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -265,40 +243,23 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot2) - goto begin; - - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score2(result, net, saddr, sport, - daddr, hnum, dif) < badness)) { - sock_put(result); - goto begin; - } - } return result; } +/* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, - int dif, struct udp_table *udptable) + int dif, struct udp_table *udptable, + struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; u32 hash = 0; - rcu_read_lock(); if (hslot->count > 10) { hash2 = udp6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -308,7 +269,7 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); if (!result) { hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; @@ -318,25 +279,28 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); } - rcu_read_unlock(); return result; } begin: result = NULL; badness = -1; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); + result = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -344,24 +308,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, saddr, sport, - daddr, dport, dif) < badness)) { - sock_put(result); - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__udp6_lib_lookup); @@ -378,15 +324,27 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, return sk; return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - udptable); + udptable, skb); } +/* Must be called under rcu_read_lock(). + * Does increment socket refcount. + */ +#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { - return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); + struct sock *sk; + + sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, + dif, &udp_table, NULL); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + return sk; } EXPORT_SYMBOL_GPL(udp6_lib_lookup); +#endif /* * This should be easy, if there is something there we @@ -551,8 +509,8 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int err; struct net *net = dev_net(skb->dev); - sk = __udp6_lib_lookup(net, daddr, uh->dest, - saddr, uh->source, inet6_iif(skb), udptable); + sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, + inet6_iif(skb), udptable, NULL); if (!sk) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); @@ -583,7 +541,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk->sk_err = err; sk->sk_error_report(sk); out: - sock_put(sk); + return; } int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -673,7 +631,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). */ - if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { if (up->pcrlen == 0) { /* full coverage was set */ net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n", @@ -687,10 +645,8 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } - if (rcu_access_pointer(sk->sk_filter)) { - if (udp_lib_checksum_complete(skb)) - goto csum_error; - } + if (udp_lib_checksum_complete(skb)) + goto csum_error; if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { UDP6_INC_STATS_BH(sock_net(sk), @@ -745,33 +701,6 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, return true; } -static void flush_stack(struct sock **stack, unsigned int count, - struct sk_buff *skb, unsigned int final) -{ - struct sk_buff *skb1 = NULL; - struct sock *sk; - unsigned int i; - - for (i = 0; i < count; i++) { - sk = stack[i]; - if (likely(!skb1)) - skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); - if (!skb1) { - atomic_inc(&sk->sk_drops); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - } - - if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) - skb1 = NULL; - sock_put(sk); - } - if (unlikely(skb1)) - kfree_skb(skb1); -} - static void udp6_csum_zero_error(struct sk_buff *skb) { /* RFC 2460 section 8.1 says that we SHOULD log @@ -790,46 +719,51 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, struct udp_table *udptable, int proto) { - struct sock *sk, *stack[256 / sizeof(struct sock *)]; + struct sock *sk, *first = NULL; const struct udphdr *uh = udp_hdr(skb); - struct hlist_nulls_node *node; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - int dif = inet6_iif(skb); - unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - bool inner_flushed = false; + int dif = inet6_iif(skb); + struct hlist_node *node; + struct sk_buff *nskb; if (use_hash2) { hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & - udp_table.mask; - hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask; + udptable->mask; + hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udp_table.hash2[hash2]; + hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } - spin_lock(&hslot->lock); - sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { - if (__udp_v6_is_mcast_sock(net, sk, - uh->dest, daddr, - uh->source, saddr, - dif, hnum) && - /* If zero checksum and no_check is not on for - * the socket then skip it. - */ - (uh->check || udp_sk(sk)->no_check6_rx)) { - if (unlikely(count == ARRAY_SIZE(stack))) { - flush_stack(stack, count, skb, ~0); - inner_flushed = true; - count = 0; - } - stack[count++] = sk; - sock_hold(sk); + sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { + if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, + uh->source, saddr, dif, hnum)) + continue; + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + if (!uh->check && !udp_sk(sk)->no_check6_rx) + continue; + if (!first) { + first = sk; + continue; + } + nskb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!nskb)) { + atomic_inc(&sk->sk_drops); + UDP6_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + continue; } - } - spin_unlock(&hslot->lock); + if (udpv6_queue_rcv_skb(sk, nskb) > 0) + consume_skb(nskb); + } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { @@ -837,13 +771,13 @@ start_lookup: goto start_lookup; } - if (count) { - flush_stack(stack, count, skb, count - 1); + if (first) { + if (udpv6_queue_rcv_skb(first, skb) > 0) + consume_skb(skb); } else { - if (!inner_flushed) - UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); - consume_skb(skb); + kfree_skb(skb); + UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -851,10 +785,10 @@ start_lookup: int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { + const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); - struct sock *sk; struct udphdr *uh; - const struct in6_addr *saddr, *daddr; + struct sock *sk; u32 ulen = 0; if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -908,7 +842,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int ret; if (!uh->check && !udp_sk(sk)->no_check6_rx) { - sock_put(sk); udp6_csum_zero_error(skb); goto csum_error; } @@ -918,7 +851,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); - sock_put(sk); /* a return value > 0 means to resubmit the input */ if (ret > 0) @@ -965,7 +897,6 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, int dif) { struct sock *sk; - struct hlist_nulls_node *hnode; unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; @@ -973,7 +904,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); - udp_portaddr_for_each_entry_rcu(sk, hnode, &hslot2->head) { + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (sk->sk_state == TCP_ESTABLISHED && INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif)) return sk; @@ -1364,7 +1295,7 @@ do_udp_sendmsg: } else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -1618,7 +1549,6 @@ struct proto udpv6_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index d1eaeeaa34d2..af2895c77ed6 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -50,7 +50,6 @@ struct proto udplitev6_prot = { .unhash = udp_lib_unhash, .get_port = udp_v6_get_port, .obj_size = sizeof(struct udp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 0eaab1fa6be5..b5789562aded 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -21,8 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm6_extract_header(skb); } -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t) { + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); @@ -48,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return -1; } -int xfrm6_rcv(struct sk_buff *skb) +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], - 0); + 0, t); } -EXPORT_SYMBOL(xfrm6_rcv); +EXPORT_SYMBOL(xfrm6_rcv_tnl); +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_tnl(skb, NULL); +} +EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 64862c5084ee..ef6cc9eb0e45 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -125,9 +125,7 @@ int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); -#ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -#endif return xfrm_output(sk, skb); } @@ -143,7 +141,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; - int mtu; + unsigned int mtu; bool toobig; #ifdef CONFIG_NETFILTER diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index f9d493c59d6c..07b7b2540579 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -239,7 +239,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 7cc9db38e1b6..0e8f8a3f7b23 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -839,7 +839,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) struct sock *sk = sock->sk; struct irda_sock *new, *self = irda_sk(sk); struct sock *newsk; - struct sk_buff *skb; + struct sk_buff *skb = NULL; int err; err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); @@ -907,7 +907,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) err = -EPERM; /* value does not seem to make sense. -arnd */ if (!new->tsap) { pr_debug("%s(), dup failed!\n", __func__); - kfree_skb(skb); goto out; } @@ -926,7 +925,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) /* Clean up the original one to keep it in listen state */ irttp_listen(self->tsap); - kfree_skb(skb); sk->sk_ack_backlog--; newsock->state = SS_CONNECTED; @@ -934,6 +932,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) irda_connect_response(new); err = 0; out: + kfree_skb(skb); release_sock(sk); return err; } diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 5984cc35d508..78a4b9dd6167 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1513,7 +1513,8 @@ static int iucv_sock_shutdown(struct socket *sock, int how) break; } - if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) { + if ((how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) && + sk->sk_state == IUCV_CONNECTED) { if (iucv->transport == AF_IUCV_TRANS_IUCV) { txmsg.class = 0; txmsg.tag = 0; @@ -1723,7 +1724,7 @@ static int iucv_callback_connreq(struct iucv_path *path, } /* Create the new socket */ - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); + nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0); if (!nsk) { err = pr_iucv->path_sever(path, user_data); iucv_path_free(path); @@ -1933,7 +1934,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) goto out; } - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); + nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0); bh_lock_sock(sk); if ((sk->sk_state != IUCV_LISTEN) || sk_acceptq_is_full(sk) || @@ -2392,6 +2393,13 @@ out: return err; } +static void afiucv_iucv_exit(void) +{ + device_unregister(af_iucv_dev); + driver_unregister(&af_iucv_driver); + pr_iucv->iucv_unregister(&af_iucv_handler, 0); +} + static int __init afiucv_init(void) { int err; @@ -2425,11 +2433,18 @@ static int __init afiucv_init(void) err = afiucv_iucv_init(); if (err) goto out_sock; - } else - register_netdevice_notifier(&afiucv_netdev_notifier); + } + + err = register_netdevice_notifier(&afiucv_netdev_notifier); + if (err) + goto out_notifier; + dev_add_pack(&iucv_packet_type); return 0; +out_notifier: + if (pr_iucv) + afiucv_iucv_exit(); out_sock: sock_unregister(PF_IUCV); out_proto: @@ -2443,12 +2458,11 @@ out: static void __exit afiucv_exit(void) { if (pr_iucv) { - device_unregister(af_iucv_dev); - driver_unregister(&af_iucv_driver); - pr_iucv->iucv_unregister(&af_iucv_handler, 0); + afiucv_iucv_exit(); symbol_put(iucv_if); - } else - unregister_netdevice_notifier(&afiucv_netdev_notifier); + } + + unregister_netdevice_notifier(&afiucv_netdev_notifier); dev_remove_pack(&iucv_packet_type); sock_unregister(PF_IUCV); proto_unregister(&iucv_proto); diff --git a/net/key/af_key.c b/net/key/af_key.c index b400a4cc265b..52ab3c3c11d8 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1863,6 +1863,13 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; + if ((xfilter->sadb_x_filter_splen >= + (sizeof(xfrm_address_t) << 3)) || + (xfilter->sadb_x_filter_dplen >= + (sizeof(xfrm_address_t) << 3))) { + mutex_unlock(&pfk->dump_lock); + return -EINVAL; + } filter = kmalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) { mutex_unlock(&pfk->dump_lock); @@ -2916,7 +2923,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t) break; if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); @@ -2934,7 +2941,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!ealg->pfkey_supported) continue; - if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { @@ -2945,7 +2952,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 429dbb064240..9a85b0133991 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -112,53 +112,19 @@ struct l2tp_net { spinlock_t l2tp_session_hlist_lock; }; -static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) { return sk->sk_user_data; } -static inline struct l2tp_net *l2tp_pernet(struct net *net) +static inline struct l2tp_net *l2tp_pernet(const struct net *net) { BUG_ON(!net); return net_generic(net, l2tp_net_id); } -/* Tunnel reference counts. Incremented per session that is added to - * the tunnel. - */ -static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel) -{ - atomic_inc(&tunnel->ref_count); -} - -static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel) -{ - if (atomic_dec_and_test(&tunnel->ref_count)) - l2tp_tunnel_free(tunnel); -} -#ifdef L2TP_REFCNT_DEBUG -#define l2tp_tunnel_inc_refcount(_t) \ -do { \ - pr_debug("l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", \ - __func__, __LINE__, (_t)->name, \ - atomic_read(&_t->ref_count)); \ - l2tp_tunnel_inc_refcount_1(_t); \ -} while (0) -#define l2tp_tunnel_dec_refcount(_t) \ -do { \ - pr_debug("l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", \ - __func__, __LINE__, (_t)->name, \ - atomic_read(&_t->ref_count)); \ - l2tp_tunnel_dec_refcount_1(_t); \ -} while (0) -#else -#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t) -#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t) -#endif - /* Session hash global list for L2TPv3. * The session_id SHOULD be random according to RFC3931, but several * L2TP implementations use incrementing session_ids. So we do a real @@ -216,27 +182,6 @@ static void l2tp_tunnel_sock_put(struct sock *sk) sock_put(sk); } -/* Lookup a session by id in the global session list - */ -static struct l2tp_session *l2tp_session_find_2(struct net *net, u32 session_id) -{ - struct l2tp_net *pn = l2tp_pernet(net); - struct hlist_head *session_list = - l2tp_session_id_hash_2(pn, session_id); - struct l2tp_session *session; - - rcu_read_lock_bh(); - hlist_for_each_entry_rcu(session, session_list, global_hlist) { - if (session->session_id == session_id) { - rcu_read_unlock_bh(); - return session; - } - } - rcu_read_unlock_bh(); - - return NULL; -} - /* Session hash list. * The session_id SHOULD be random according to RFC2661, but several * L2TP implementations (Cisco and Microsoft) use incrementing @@ -249,25 +194,67 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; } -/* Lookup a session by id +/* Lookup a tunnel. A new reference is held on the returned tunnel. */ +struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) +{ + const struct l2tp_net *pn = l2tp_pernet(net); + struct l2tp_tunnel *tunnel; + + rcu_read_lock_bh(); + list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { + if (tunnel->tunnel_id == tunnel_id) { + l2tp_tunnel_inc_refcount(tunnel); + rcu_read_unlock_bh(); + + return tunnel; + } + } + rcu_read_unlock_bh(); + + return NULL; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_get); + +/* Lookup a session. A new reference is held on the returned session. + * Optionally calls session->ref() too if do_ref is true. */ -struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id) +struct l2tp_session *l2tp_session_get(const struct net *net, + struct l2tp_tunnel *tunnel, + u32 session_id, bool do_ref) { struct hlist_head *session_list; struct l2tp_session *session; - /* In L2TPv3, session_ids are unique over all tunnels and we - * sometimes need to look them up before we know the - * tunnel. - */ - if (tunnel == NULL) - return l2tp_session_find_2(net, session_id); + if (!tunnel) { + struct l2tp_net *pn = l2tp_pernet(net); + + session_list = l2tp_session_id_hash_2(pn, session_id); + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu(session, session_list, global_hlist) { + if (session->session_id == session_id) { + l2tp_session_inc_refcount(session); + if (do_ref && session->ref) + session->ref(session); + rcu_read_unlock_bh(); + + return session; + } + } + rcu_read_unlock_bh(); + + return NULL; + } session_list = l2tp_session_id_hash(tunnel, session_id); read_lock_bh(&tunnel->hlist_lock); hlist_for_each_entry(session, session_list, hlist) { if (session->session_id == session_id) { + l2tp_session_inc_refcount(session); + if (do_ref && session->ref) + session->ref(session); read_unlock_bh(&tunnel->hlist_lock); + return session; } } @@ -275,7 +262,7 @@ struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunn return NULL; } -EXPORT_SYMBOL_GPL(l2tp_session_find); +EXPORT_SYMBOL_GPL(l2tp_session_get); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth, bool do_ref) @@ -306,7 +293,9 @@ EXPORT_SYMBOL_GPL(l2tp_session_get_nth); /* Lookup a session by interface name. * This is very inefficient but is only used by management interfaces. */ -struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname) +struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, + const char *ifname, + bool do_ref) { struct l2tp_net *pn = l2tp_pernet(net); int hash; @@ -316,7 +305,11 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname) for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) { hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) { if (!strcmp(session->ifname, ifname)) { + l2tp_session_inc_refcount(session); + if (do_ref && session->ref) + session->ref(session); rcu_read_unlock_bh(); + return session; } } @@ -326,11 +319,80 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname) return NULL; } -EXPORT_SYMBOL_GPL(l2tp_session_find_by_ifname); +EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname); + +int l2tp_session_register(struct l2tp_session *session, + struct l2tp_tunnel *tunnel) +{ + struct l2tp_session *session_walk; + struct hlist_head *g_head; + struct hlist_head *head; + struct l2tp_net *pn; + int err; + + head = l2tp_session_id_hash(tunnel, session->session_id); + + write_lock_bh(&tunnel->hlist_lock); + if (!tunnel->acpt_newsess) { + err = -ENODEV; + goto err_tlock; + } + + hlist_for_each_entry(session_walk, head, hlist) + if (session_walk->session_id == session->session_id) { + err = -EEXIST; + goto err_tlock; + } + + if (tunnel->version == L2TP_HDR_VER_3) { + pn = l2tp_pernet(tunnel->l2tp_net); + g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net), + session->session_id); + + spin_lock_bh(&pn->l2tp_session_hlist_lock); + + /* IP encap expects session IDs to be globally unique, while + * UDP encap doesn't. + */ + hlist_for_each_entry(session_walk, g_head, global_hlist) + if (session_walk->session_id == session->session_id && + (session_walk->tunnel->encap == L2TP_ENCAPTYPE_IP || + tunnel->encap == L2TP_ENCAPTYPE_IP)) { + err = -EEXIST; + goto err_tlock_pnlock; + } + + l2tp_tunnel_inc_refcount(tunnel); + sock_hold(tunnel->sock); + hlist_add_head_rcu(&session->global_hlist, g_head); + + spin_unlock_bh(&pn->l2tp_session_hlist_lock); + } else { + l2tp_tunnel_inc_refcount(tunnel); + sock_hold(tunnel->sock); + } + + hlist_add_head(&session->hlist, head); + write_unlock_bh(&tunnel->hlist_lock); + + /* Ignore management session in session count value */ + if (session->session_id != 0) + atomic_inc(&l2tp_session_count); + + return 0; + +err_tlock_pnlock: + spin_unlock_bh(&pn->l2tp_session_hlist_lock); +err_tlock: + write_unlock_bh(&tunnel->hlist_lock); + + return err; +} +EXPORT_SYMBOL_GPL(l2tp_session_register); /* Lookup a tunnel by id */ -struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id) +struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id) { struct l2tp_tunnel *tunnel; struct l2tp_net *pn = l2tp_pernet(net); @@ -348,7 +410,7 @@ struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id) } EXPORT_SYMBOL_GPL(l2tp_tunnel_find); -struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth) +struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel; @@ -636,6 +698,9 @@ discard: * a data (not control) frame before coming here. Fields up to the * session-id have already been parsed and ptr points to the data * after the session-id. + * + * session->ref() must have been called prior to l2tp_recv_common(). + * session->deref() will be called automatically after skb is processed. */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, @@ -645,14 +710,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, int offset; u32 ns, nr; - /* The ref count is increased since we now hold a pointer to - * the session. Take care to decrement the refcnt when exiting - * this function from now on... - */ - l2tp_session_inc_refcount(session); - if (session->ref) - (*session->ref)(session); - /* Parse and check optional cookie */ if (session->peer_cookie_len > 0) { if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) { @@ -803,8 +860,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, /* Try to dequeue as many skbs from reorder_q as we can. */ l2tp_recv_dequeue(session); - l2tp_session_dec_refcount(session); - return; discard: @@ -813,8 +868,6 @@ discard: if (session->deref) (*session->deref)(session); - - l2tp_session_dec_refcount(session); } EXPORT_SYMBOL(l2tp_recv_common); @@ -921,8 +974,14 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, } /* Find the session context */ - session = l2tp_session_find(tunnel->l2tp_net, tunnel, session_id); + session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id, true); if (!session || !session->recv_skb) { + if (session) { + if (session->deref) + session->deref(session); + l2tp_session_dec_refcount(session); + } + /* Not found? Pass to userspace to deal with */ l2tp_info(tunnel, L2TP_MSG_DATA, "%s: no session found (%u/%u). Passing up.\n", @@ -931,10 +990,13 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, } if (tunnel->version == L2TP_HDR_VER_3 && - l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) + l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) { + l2tp_session_dec_refcount(session); goto error; + } l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook); + l2tp_session_dec_refcount(session); return 0; @@ -1079,6 +1141,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, /* Queue the packet to IP for output */ skb->ignore_df = 1; + skb_dst_drop(skb); #if IS_ENABLED(CONFIG_IPV6) if (tunnel->sock->sk_family == PF_INET6 && !tunnel->v4mapped) error = inet6_csk_xmit(tunnel->sock, skb, NULL); @@ -1143,10 +1206,6 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len goto out_unlock; } - /* Get routing info from the tunnel socket */ - skb_dst_drop(skb); - skb_dst_set(skb, sk_dst_check(sk, 0)); - inet = inet_sk(sk); fl = &inet->cork.fl; switch (tunnel->encap) { @@ -1218,7 +1277,6 @@ static void l2tp_tunnel_destruct(struct sock *sk) /* Remove hooks into tunnel socket */ sk->sk_destruct = tunnel->old_sk_destruct; sk->sk_user_data = NULL; - tunnel->sock = NULL; /* Remove the tunnel struct from the tunnel list */ pn = l2tp_pernet(tunnel->l2tp_net); @@ -1228,6 +1286,8 @@ static void l2tp_tunnel_destruct(struct sock *sk) atomic_dec(&l2tp_tunnel_count); l2tp_tunnel_closeall(tunnel); + + tunnel->sock = NULL; l2tp_tunnel_dec_refcount(tunnel); /* Call the original destructor */ @@ -1252,6 +1312,7 @@ void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) tunnel->name); write_lock_bh(&tunnel->hlist_lock); + tunnel->acpt_newsess = false; for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { again: hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) { @@ -1262,6 +1323,9 @@ again: hlist_del_init(&session->hlist); + if (test_and_set_bit(0, &session->dead)) + goto again; + if (session->ref != NULL) (*session->ref)(session); @@ -1302,17 +1366,6 @@ static void l2tp_udp_encap_destroy(struct sock *sk) } } -/* Really kill the tunnel. - * Come here only when all sessions have been cleared from the tunnel. - */ -static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) -{ - BUG_ON(atomic_read(&tunnel->ref_count) != 0); - BUG_ON(tunnel->sock != NULL); - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: free...\n", tunnel->name); - kfree_rcu(tunnel, rcu); -} - /* Workqueue tunnel deletion function */ static void l2tp_tunnel_del_work(struct work_struct *work) { @@ -1524,6 +1577,8 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 tunnel_id, fd); goto err; } + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + goto err; switch (encap) { case L2TP_ENCAPTYPE_UDP: if (sk->sk_protocol != IPPROTO_UDP) { @@ -1563,6 +1618,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 tunnel->magic = L2TP_TUNNEL_MAGIC; sprintf(&tunnel->name[0], "tunl %u", tunnel_id); rwlock_init(&tunnel->hlist_lock); + tunnel->acpt_newsess = true; /* The net we belong to */ tunnel->l2tp_net = net; @@ -1710,6 +1766,9 @@ EXPORT_SYMBOL_GPL(__l2tp_session_unhash); */ int l2tp_session_delete(struct l2tp_session *session) { + if (test_and_set_bit(0, &session->dead)) + return 0; + if (session->ref) (*session->ref)(session); __l2tp_session_unhash(session); @@ -1800,37 +1859,12 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn l2tp_session_set_header_len(session, tunnel->version); - /* Bump the reference count. The session context is deleted - * only when this drops to zero. - */ l2tp_session_inc_refcount(session); - l2tp_tunnel_inc_refcount(tunnel); - - /* Ensure tunnel socket isn't deleted */ - sock_hold(tunnel->sock); - /* Add session to the tunnel's hash list */ - write_lock_bh(&tunnel->hlist_lock); - hlist_add_head(&session->hlist, - l2tp_session_id_hash(tunnel, session_id)); - write_unlock_bh(&tunnel->hlist_lock); - - /* And to the global session list if L2TPv3 */ - if (tunnel->version != L2TP_HDR_VER_2) { - struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); - - spin_lock_bh(&pn->l2tp_session_hlist_lock); - hlist_add_head_rcu(&session->global_hlist, - l2tp_session_id_hash_2(pn, session_id)); - spin_unlock_bh(&pn->l2tp_session_hlist_lock); - } - - /* Ignore management session in session count value */ - if (session->session_id != 0) - atomic_inc(&l2tp_session_count); + return session; } - return session; + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(l2tp_session_create); @@ -1864,6 +1898,9 @@ static __net_exit void l2tp_exit_net(struct net *net) l2tp_tunnel_delete(tunnel); } rcu_read_unlock_bh(); + + flush_workqueue(l2tp_wq); + rcu_barrier(); } static struct pernet_operations l2tp_net_ops = { diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index fad47e9d74bc..57da0f1d62dd 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -23,16 +23,6 @@ #define L2TP_HASH_BITS_2 8 #define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2) -/* Debug message categories for the DEBUG socket option */ -enum { - L2TP_MSG_DEBUG = (1 << 0), /* verbose debug (if - * compiled in) */ - L2TP_MSG_CONTROL = (1 << 1), /* userspace - kernel - * interface */ - L2TP_MSG_SEQ = (1 << 2), /* sequence numbers */ - L2TP_MSG_DATA = (1 << 3), /* data packets */ -}; - struct sk_buff; struct l2tp_stats { @@ -85,6 +75,7 @@ struct l2tp_session_cfg { struct l2tp_session { int magic; /* should be * L2TP_SESSION_MAGIC */ + long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel * context */ @@ -174,6 +165,10 @@ struct l2tp_tunnel { struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ + bool acpt_newsess; /* Indicates whether this + * tunnel accepts new sessions. + * Protected by hlist_lock. + */ struct hlist_head session_hlist[L2TP_HASH_SIZE]; /* hashed list of sessions, * hashed by id */ @@ -209,7 +204,9 @@ struct l2tp_tunnel { }; struct l2tp_nl_cmd_ops { - int (*session_create)(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); + int (*session_create)(struct net *net, struct l2tp_tunnel *tunnel, + u32 session_id, u32 peer_session_id, + struct l2tp_session_cfg *cfg); int (*session_delete)(struct l2tp_session *session); }; @@ -243,14 +240,18 @@ out: return tunnel; } -struct l2tp_session *l2tp_session_find(struct net *net, - struct l2tp_tunnel *tunnel, - u32 session_id); +struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); + +struct l2tp_session *l2tp_session_get(const struct net *net, + struct l2tp_tunnel *tunnel, + u32 session_id, bool do_ref); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth, bool do_ref); -struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname); -struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id); -struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth); +struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, + const char *ifname, + bool do_ref); +struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id); +struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth); int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, @@ -261,6 +262,9 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); +int l2tp_session_register(struct l2tp_session *session, + struct l2tp_tunnel *tunnel); + void __l2tp_session_unhash(struct l2tp_session *session); int l2tp_session_delete(struct l2tp_session *session); void l2tp_session_free(struct l2tp_session *session); @@ -279,6 +283,17 @@ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); +static inline void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) +{ + atomic_inc(&tunnel->ref_count); +} + +static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) +{ + if (atomic_dec_and_test(&tunnel->ref_count)) + kfree_rcu(tunnel, rcu); +} + /* Session reference counts. Incremented when code obtains a reference * to a session. */ diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index e253c26f31ac..e0a65ee1e830 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -30,6 +30,9 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> #include "l2tp_core.h" @@ -41,7 +44,6 @@ struct l2tp_eth { struct net_device *dev; struct sock *tunnel_sock; struct l2tp_session *session; - struct list_head list; atomic_long_t tx_bytes; atomic_long_t tx_packets; atomic_long_t tx_dropped; @@ -52,20 +54,9 @@ struct l2tp_eth { /* via l2tp_session_priv() */ struct l2tp_eth_sess { - struct net_device *dev; -}; - -/* per-net private data for this module */ -static unsigned int l2tp_eth_net_id; -struct l2tp_eth_net { - struct list_head l2tp_eth_dev_list; - spinlock_t l2tp_eth_lock; + struct net_device __rcu *dev; }; -static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) -{ - return net_generic(net, l2tp_eth_net_id); -} static struct lock_class_key l2tp_eth_tx_busylock; static int l2tp_eth_dev_init(struct net_device *dev) @@ -82,12 +73,13 @@ static int l2tp_eth_dev_init(struct net_device *dev) static void l2tp_eth_dev_uninit(struct net_device *dev) { struct l2tp_eth *priv = netdev_priv(dev); - struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev)); + struct l2tp_eth_sess *spriv; - spin_lock(&pn->l2tp_eth_lock); - list_del_init(&priv->list); - spin_unlock(&pn->l2tp_eth_lock); - dev_put(dev); + spriv = l2tp_session_priv(priv->session); + RCU_INIT_POINTER(spriv->dev, NULL); + /* No need for synchronize_net() here. We're called by + * unregister_netdev*(), which does the synchronisation for us. + */ } static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev) @@ -141,8 +133,8 @@ static void l2tp_eth_dev_setup(struct net_device *dev) static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) { struct l2tp_eth_sess *spriv = l2tp_session_priv(session); - struct net_device *dev = spriv->dev; - struct l2tp_eth *priv = netdev_priv(dev); + struct net_device *dev; + struct l2tp_eth *priv; if (session->debug & L2TP_MSG_DATA) { unsigned int length; @@ -166,16 +158,25 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, skb_dst_drop(skb); nf_reset(skb); + rcu_read_lock(); + dev = rcu_dereference(spriv->dev); + if (!dev) + goto error_rcu; + + priv = netdev_priv(dev); if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) { atomic_long_inc(&priv->rx_packets); atomic_long_add(data_len, &priv->rx_bytes); } else { atomic_long_inc(&priv->rx_errors); } + rcu_read_unlock(); + return; +error_rcu: + rcu_read_unlock(); error: - atomic_long_inc(&priv->rx_errors); kfree_skb(skb); } @@ -186,11 +187,15 @@ static void l2tp_eth_delete(struct l2tp_session *session) if (session) { spriv = l2tp_session_priv(session); - dev = spriv->dev; + + rtnl_lock(); + dev = rtnl_dereference(spriv->dev); if (dev) { - unregister_netdev(dev); - spriv->dev = NULL; + unregister_netdevice(dev); + rtnl_unlock(); module_put(THIS_MODULE); + } else { + rtnl_unlock(); } } } @@ -200,41 +205,89 @@ static void l2tp_eth_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; struct l2tp_eth_sess *spriv = l2tp_session_priv(session); - struct net_device *dev = spriv->dev; + struct net_device *dev; + + rcu_read_lock(); + dev = rcu_dereference(spriv->dev); + if (!dev) { + rcu_read_unlock(); + return; + } + dev_hold(dev); + rcu_read_unlock(); seq_printf(m, " interface %s\n", dev->name); + + dev_put(dev); } #endif -static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) +static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, + struct l2tp_session *session, + struct net_device *dev) +{ + unsigned int overhead = 0; + struct dst_entry *dst; + u32 l3_overhead = 0; + + /* if the encap is UDP, account for UDP header size */ + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { + overhead += sizeof(struct udphdr); + dev->needed_headroom += sizeof(struct udphdr); + } + if (session->mtu != 0) { + dev->mtu = session->mtu; + dev->needed_headroom += session->hdr_len; + return; + } + lock_sock(tunnel->sock); + l3_overhead = kernel_sock_ip_overhead(tunnel->sock); + release_sock(tunnel->sock); + if (l3_overhead == 0) { + /* L3 Overhead couldn't be identified, this could be + * because tunnel->sock was NULL or the socket's + * address family was not IPv4 or IPv6, + * dev mtu stays at 1500. + */ + return; + } + /* Adjust MTU, factor overhead - underlay L3, overlay L2 hdr + * UDP overhead, if any, was already factored in above. + */ + overhead += session->hdr_len + ETH_HLEN + l3_overhead; + + /* If PMTU discovery was enabled, use discovered MTU on L2TP device */ + dst = sk_dst_get(tunnel->sock); + if (dst) { + /* dst_mtu will use PMTU if found, else fallback to intf MTU */ + u32 pmtu = dst_mtu(dst); + + if (pmtu != 0) + dev->mtu = pmtu; + dst_release(dst); + } + session->mtu = dev->mtu - overhead; + dev->mtu = session->mtu; + dev->needed_headroom += session->hdr_len; +} + +static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, + u32 session_id, u32 peer_session_id, + struct l2tp_session_cfg *cfg) { struct net_device *dev; char name[IFNAMSIZ]; - struct l2tp_tunnel *tunnel; struct l2tp_session *session; struct l2tp_eth *priv; struct l2tp_eth_sess *spriv; int rc; - struct l2tp_eth_net *pn; - - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (!tunnel) { - rc = -ENODEV; - goto out; - } - - session = l2tp_session_find(net, tunnel, session_id); - if (session) { - rc = -EEXIST; - goto out; - } if (cfg->ifname) { dev = dev_get_by_name(net, cfg->ifname); if (dev) { dev_put(dev); rc = -EEXIST; - goto out; + goto err; } strlcpy(name, cfg->ifname, IFNAMSIZ); } else @@ -242,28 +295,24 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p session = l2tp_session_create(sizeof(*spriv), tunnel, session_id, peer_session_id, cfg); - if (!session) { - rc = -ENOMEM; - goto out; + if (IS_ERR(session)) { + rc = PTR_ERR(session); + goto err; } dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN, l2tp_eth_dev_setup); if (!dev) { rc = -ENOMEM; - goto out_del_session; + goto err_sess; } dev_net_set(dev, net); - if (session->mtu == 0) - session->mtu = dev->mtu - session->hdr_len; - dev->mtu = session->mtu; - dev->needed_headroom += session->hdr_len; + l2tp_eth_adjust_mtu(tunnel, session, dev); priv = netdev_priv(dev); priv->dev = dev; priv->session = session; - INIT_LIST_HEAD(&priv->list); priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; @@ -273,48 +322,50 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p #endif spriv = l2tp_session_priv(session); - spriv->dev = dev; - rc = register_netdev(dev); - if (rc < 0) - goto out_del_dev; + l2tp_session_inc_refcount(session); - __module_get(THIS_MODULE); - /* Must be done after register_netdev() */ - strlcpy(session->ifname, dev->name, IFNAMSIZ); + rtnl_lock(); - dev_hold(dev); - pn = l2tp_eth_pernet(dev_net(dev)); - spin_lock(&pn->l2tp_eth_lock); - list_add(&priv->list, &pn->l2tp_eth_dev_list); - spin_unlock(&pn->l2tp_eth_lock); + /* Register both device and session while holding the rtnl lock. This + * ensures that l2tp_eth_delete() will see that there's a device to + * unregister, even if it happened to run before we assign spriv->dev. + */ + rc = l2tp_session_register(session, tunnel); + if (rc < 0) { + rtnl_unlock(); + goto err_sess_dev; + } - return 0; + rc = register_netdevice(dev); + if (rc < 0) { + rtnl_unlock(); + l2tp_session_delete(session); + l2tp_session_dec_refcount(session); + free_netdev(dev); -out_del_dev: - free_netdev(dev); - spriv->dev = NULL; -out_del_session: - l2tp_session_delete(session); -out: - return rc; -} + return rc; + } -static __net_init int l2tp_eth_init_net(struct net *net) -{ - struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id); + strlcpy(session->ifname, dev->name, IFNAMSIZ); + rcu_assign_pointer(spriv->dev, dev); - INIT_LIST_HEAD(&pn->l2tp_eth_dev_list); - spin_lock_init(&pn->l2tp_eth_lock); + rtnl_unlock(); + + l2tp_session_dec_refcount(session); + + __module_get(THIS_MODULE); return 0; -} -static struct pernet_operations l2tp_eth_net_ops = { - .init = l2tp_eth_init_net, - .id = &l2tp_eth_net_id, - .size = sizeof(struct l2tp_eth_net), -}; +err_sess_dev: + l2tp_session_dec_refcount(session); + free_netdev(dev); +err_sess: + kfree(session); +err: + return rc; +} static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { @@ -329,25 +380,18 @@ static int __init l2tp_eth_init(void) err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); if (err) - goto out; - - err = register_pernet_device(&l2tp_eth_net_ops); - if (err) - goto out_unreg; + goto err; pr_info("L2TP ethernet pseudowire support (L2TPv3)\n"); return 0; -out_unreg: - l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); -out: +err: return err; } static void __exit l2tp_eth_exit(void) { - unregister_pernet_device(&l2tp_eth_net_ops); l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); } diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 7efb3cadc152..3a31370d568a 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -24,7 +24,6 @@ #include <net/icmp.h> #include <net/udp.h> #include <net/inet_common.h> -#include <net/inet_hashtables.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> @@ -122,6 +121,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; + struct iphdr *iph; int length; if (!pskb_may_pull(skb, 4)) @@ -142,19 +142,19 @@ static int l2tp_ip_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. */ - session = l2tp_session_find(net, NULL, session_id); - if (session == NULL) + session = l2tp_session_get(net, NULL, session_id, true); + if (!session) goto discard; tunnel = session->tunnel; - if (tunnel == NULL) - goto discard; + if (!tunnel) + goto discard_sess; /* Trace packet contents, if enabled */ if (tunnel->debug & L2TP_MSG_DATA) { length = min(32u, skb->len); if (!pskb_may_pull(skb, length)) - goto discard; + goto discard_sess; /* Point to L2TP header */ optr = ptr = skb->data; @@ -167,6 +167,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) goto discard; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook); + l2tp_session_dec_refcount(session); return 0; @@ -179,23 +180,16 @@ pass_up: goto discard; tunnel_id = ntohl(*(__be32 *) &skb->data[4]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel) { - sk = tunnel->sock; - sock_hold(sk); - } else { - struct iphdr *iph = (struct iphdr *) skb_network_header(skb); + iph = (struct iphdr *)skb_network_header(skb); - read_lock_bh(&l2tp_ip_lock); - sk = __l2tp_ip_bind_lookup(net, iph->daddr, 0, tunnel_id); - if (!sk) { - read_unlock_bh(&l2tp_ip_lock); - goto discard; - } - - sock_hold(sk); + read_lock_bh(&l2tp_ip_lock); + sk = __l2tp_ip_bind_lookup(net, iph->daddr, 0, tunnel_id); + if (!sk) { read_unlock_bh(&l2tp_ip_lock); + goto discard; } + sock_hold(sk); + read_unlock_bh(&l2tp_ip_lock); if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; @@ -204,6 +198,12 @@ pass_up: return sk_receive_skb(sk, skb, 1); +discard_sess: + if (session->deref) + session->deref(session); + l2tp_session_dec_refcount(session); + goto discard; + discard_put: sock_put(sk); @@ -212,15 +212,31 @@ discard: return 0; } -static int l2tp_ip_open(struct sock *sk) +static int l2tp_ip_hash(struct sock *sk) { - /* Prevent autobind. We don't have ports. */ - inet_sk(sk)->inet_num = IPPROTO_L2TP; + if (sk_unhashed(sk)) { + write_lock_bh(&l2tp_ip_lock); + sk_add_node(sk, &l2tp_ip_table); + write_unlock_bh(&l2tp_ip_lock); + } + return 0; +} +static void l2tp_ip_unhash(struct sock *sk) +{ + if (sk_unhashed(sk)) + return; write_lock_bh(&l2tp_ip_lock); - sk_add_node(sk, &l2tp_ip_table); + sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); +} +static int l2tp_ip_open(struct sock *sk) +{ + /* Prevent autobind. We don't have ports. */ + inet_sk(sk)->inet_num = IPPROTO_L2TP; + + l2tp_ip_hash(sk); return 0; } @@ -262,15 +278,9 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr->l2tp_family != AF_INET) return -EINVAL; - ret = -EADDRINUSE; - read_lock_bh(&l2tp_ip_lock); - if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, - sk->sk_bound_dev_if, addr->l2tp_conn_id)) - goto out_in_use; - - read_unlock_bh(&l2tp_ip_lock); - lock_sock(sk); + + ret = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out; @@ -287,14 +297,22 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ - sk_dst_reset(sk); + write_lock_bh(&l2tp_ip_lock); + if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, + sk->sk_bound_dev_if, addr->l2tp_conn_id)) { + write_unlock_bh(&l2tp_ip_lock); + ret = -EADDRINUSE; + goto out; + } + + sk_dst_reset(sk); l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id; - write_lock_bh(&l2tp_ip_lock); sk_add_bind_node(sk, &l2tp_ip_bind_table); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); + ret = 0; sock_reset_flag(sk, SOCK_ZAPPED); @@ -302,11 +320,6 @@ out: release_sock(sk); return ret; - -out_in_use: - read_unlock_bh(&l2tp_ip_lock); - - return ret; } static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -314,21 +327,24 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr; int rc; - if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ - return -EINVAL; - if (addr_len < sizeof(*lsa)) return -EINVAL; if (ipv4_is_multicast(lsa->l2tp_addr.s_addr)) return -EINVAL; - rc = ip4_datagram_connect(sk, uaddr, addr_len); - if (rc < 0) - return rc; - lock_sock(sk); + /* Must bind first - autobinding does not work */ + if (sock_flag(sk, SOCK_ZAPPED)) { + rc = -EINVAL; + goto out_sk; + } + + rc = __ip4_datagram_connect(sk, uaddr, addr_len); + if (rc < 0) + goto out_sk; + l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; write_lock_bh(&l2tp_ip_lock); @@ -336,7 +352,9 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len sk_add_bind_node(sk, &l2tp_ip_bind_table); write_unlock_bh(&l2tp_ip_lock); +out_sk: release_sock(sk); + return rc; } @@ -600,8 +618,8 @@ static struct proto l2tp_ip_prot = { .sendmsg = l2tp_ip_sendmsg, .recvmsg = l2tp_ip_recvmsg, .backlog_rcv = l2tp_ip_backlog_recv, - .hash = inet_hash, - .unhash = inet_unhash, + .hash = l2tp_ip_hash, + .unhash = l2tp_ip_unhash, .obj_size = sizeof(struct l2tp_ip_sock), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index c125478981ac..1a8a4e451a5f 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -24,7 +24,6 @@ #include <net/icmp.h> #include <net/udp.h> #include <net/inet_common.h> -#include <net/inet_hashtables.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> @@ -127,12 +126,14 @@ static inline struct sock *l2tp_ip6_bind_lookup(struct net *net, */ static int l2tp_ip6_recv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); struct sock *sk; u32 session_id; u32 tunnel_id; unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; + struct ipv6hdr *iph; int length; if (!pskb_may_pull(skb, 4)) @@ -153,19 +154,19 @@ static int l2tp_ip6_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. */ - session = l2tp_session_find(&init_net, NULL, session_id); - if (session == NULL) + session = l2tp_session_get(net, NULL, session_id, true); + if (!session) goto discard; tunnel = session->tunnel; - if (tunnel == NULL) - goto discard; + if (!tunnel) + goto discard_sess; /* Trace packet contents, if enabled */ if (tunnel->debug & L2TP_MSG_DATA) { length = min(32u, skb->len); if (!pskb_may_pull(skb, length)) - goto discard; + goto discard_sess; /* Point to L2TP header */ optr = ptr = skb->data; @@ -179,6 +180,8 @@ static int l2tp_ip6_recv(struct sk_buff *skb) l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook); + l2tp_session_dec_refcount(session); + return 0; pass_up: @@ -190,24 +193,16 @@ pass_up: goto discard; tunnel_id = ntohl(*(__be32 *) &skb->data[4]); - tunnel = l2tp_tunnel_find(&init_net, tunnel_id); - if (tunnel) { - sk = tunnel->sock; - sock_hold(sk); - } else { - struct ipv6hdr *iph = ipv6_hdr(skb); - - read_lock_bh(&l2tp_ip6_lock); - sk = __l2tp_ip6_bind_lookup(&init_net, &iph->daddr, - 0, tunnel_id); - if (!sk) { - read_unlock_bh(&l2tp_ip6_lock); - goto discard; - } + iph = ipv6_hdr(skb); - sock_hold(sk); + read_lock_bh(&l2tp_ip6_lock); + sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, 0, tunnel_id); + if (!sk) { read_unlock_bh(&l2tp_ip6_lock); + goto discard; } + sock_hold(sk); + read_unlock_bh(&l2tp_ip6_lock); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; @@ -216,6 +211,12 @@ pass_up: return sk_receive_skb(sk, skb, 1); +discard_sess: + if (session->deref) + session->deref(session); + l2tp_session_dec_refcount(session); + goto discard; + discard_put: sock_put(sk); @@ -224,15 +225,31 @@ discard: return 0; } -static int l2tp_ip6_open(struct sock *sk) +static int l2tp_ip6_hash(struct sock *sk) { - /* Prevent autobind. We don't have ports. */ - inet_sk(sk)->inet_num = IPPROTO_L2TP; + if (sk_unhashed(sk)) { + write_lock_bh(&l2tp_ip6_lock); + sk_add_node(sk, &l2tp_ip6_table); + write_unlock_bh(&l2tp_ip6_lock); + } + return 0; +} +static void l2tp_ip6_unhash(struct sock *sk) +{ + if (sk_unhashed(sk)) + return; write_lock_bh(&l2tp_ip6_lock); - sk_add_node(sk, &l2tp_ip6_table); + sk_del_node_init(sk); write_unlock_bh(&l2tp_ip6_lock); +} + +static int l2tp_ip6_open(struct sock *sk) +{ + /* Prevent autobind. We don't have ports. */ + inet_sk(sk)->inet_num = IPPROTO_L2TP; + l2tp_ip6_hash(sk); return 0; } @@ -267,7 +284,9 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr; + struct net *net = sock_net(sk); __be32 v4addr = 0; + int bound_dev_if; int addr_type; int err; @@ -286,13 +305,6 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr_type & IPV6_ADDR_MULTICAST) return -EADDRNOTAVAIL; - err = -EADDRINUSE; - read_lock_bh(&l2tp_ip6_lock); - if (__l2tp_ip6_bind_lookup(&init_net, &addr->l2tp_addr, - sk->sk_bound_dev_if, addr->l2tp_conn_id)) - goto out_in_use; - read_unlock_bh(&l2tp_ip6_lock); - lock_sock(sk); err = -EINVAL; @@ -302,28 +314,25 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (sk->sk_state != TCP_CLOSE) goto out_unlock; + bound_dev_if = sk->sk_bound_dev_if; + /* Check if the address belongs to the host. */ rcu_read_lock(); if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (addr_type & IPV6_ADDR_LINKLOCAL) { - if (addr_len >= sizeof(struct sockaddr_in6) && - addr->l2tp_scope_id) { - /* Override any existing binding, if another - * one is supplied by user. - */ - sk->sk_bound_dev_if = addr->l2tp_scope_id; - } + if (addr->l2tp_scope_id) + bound_dev_if = addr->l2tp_scope_id; /* Binding to link-local address requires an - interface */ - if (!sk->sk_bound_dev_if) + * interface. + */ + if (!bound_dev_if) goto out_unlock_rcu; err = -ENODEV; - dev = dev_get_by_index_rcu(sock_net(sk), - sk->sk_bound_dev_if); + dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if); if (!dev) goto out_unlock_rcu; } @@ -338,13 +347,22 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) } rcu_read_unlock(); - inet->inet_rcv_saddr = inet->inet_saddr = v4addr; + write_lock_bh(&l2tp_ip6_lock); + if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, bound_dev_if, + addr->l2tp_conn_id)) { + write_unlock_bh(&l2tp_ip6_lock); + err = -EADDRINUSE; + goto out_unlock; + } + + inet->inet_saddr = v4addr; + inet->inet_rcv_saddr = v4addr; + sk->sk_bound_dev_if = bound_dev_if; sk->sk_v6_rcv_saddr = addr->l2tp_addr; np->saddr = addr->l2tp_addr; l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id; - write_lock_bh(&l2tp_ip6_lock); sk_add_bind_node(sk, &l2tp_ip6_bind_table); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip6_lock); @@ -357,10 +375,7 @@ out_unlock_rcu: rcu_read_unlock(); out_unlock: release_sock(sk); - return err; -out_in_use: - read_unlock_bh(&l2tp_ip6_lock); return err; } @@ -373,9 +388,6 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_type; int rc; - if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ - return -EINVAL; - if (addr_len < sizeof(*lsa)) return -EINVAL; @@ -392,10 +404,18 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, return -EINVAL; } - rc = ip6_datagram_connect(sk, uaddr, addr_len); - lock_sock(sk); + /* Must bind first - autobinding does not work */ + if (sock_flag(sk, SOCK_ZAPPED)) { + rc = -EINVAL; + goto out_sk; + } + + rc = __ip6_datagram_connect(sk, uaddr, addr_len); + if (rc < 0) + goto out_sk; + l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; write_lock_bh(&l2tp_ip6_lock); @@ -403,6 +423,7 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, sk_add_bind_node(sk, &l2tp_ip6_bind_table); write_unlock_bh(&l2tp_ip6_lock); +out_sk: release_sock(sk); return rc; @@ -461,7 +482,7 @@ static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb) return 0; drop: - IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return -1; } @@ -608,9 +629,9 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; @@ -725,8 +746,8 @@ static struct proto l2tp_ip6_prot = { .sendmsg = l2tp_ip6_sendmsg, .recvmsg = l2tp_ip6_recvmsg, .backlog_rcv = l2tp_ip6_backlog_recv, - .hash = inet_hash, - .unhash = inet_unhash, + .hash = l2tp_ip6_hash, + .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index fb3248ff8b48..d3a84a181348 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -55,7 +55,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, /* Accessed under genl lock */ static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX]; -static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info) +static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info, + bool do_ref) { u32 tunnel_id; u32 session_id; @@ -66,14 +67,17 @@ static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info) if (info->attrs[L2TP_ATTR_IFNAME]) { ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); - session = l2tp_session_find_by_ifname(net, ifname); + session = l2tp_session_get_by_ifname(net, ifname, do_ref); } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) && (info->attrs[L2TP_ATTR_CONN_ID])) { tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel) - session = l2tp_session_find(net, tunnel, session_id); + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (tunnel) { + session = l2tp_session_get(net, tunnel, session_id, + do_ref); + l2tp_tunnel_dec_refcount(tunnel); + } } return session; @@ -276,8 +280,8 @@ static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel == NULL) { + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (!tunnel) { ret = -ENODEV; goto out; } @@ -287,6 +291,8 @@ static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info l2tp_tunnel_delete(tunnel); + l2tp_tunnel_dec_refcount(tunnel); + out: return ret; } @@ -304,8 +310,8 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel == NULL) { + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (!tunnel) { ret = -ENODEV; goto out; } @@ -316,6 +322,8 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); + l2tp_tunnel_dec_refcount(tunnel); + out: return ret; } @@ -420,34 +428,37 @@ static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; - goto out; + goto err; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel == NULL) { - ret = -ENODEV; - goto out; - } - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; - goto out; + goto err; + } + + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (!tunnel) { + ret = -ENODEV; + goto err_nlmsg; } ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, L2TP_CMD_TUNNEL_GET); if (ret < 0) - goto err_out; + goto err_nlmsg_tunnel; + + l2tp_tunnel_dec_refcount(tunnel); return genlmsg_unicast(net, msg, info->snd_portid); -err_out: +err_nlmsg_tunnel: + l2tp_tunnel_dec_refcount(tunnel); +err_nlmsg: nlmsg_free(msg); - -out: +err: return ret; } @@ -491,8 +502,9 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf ret = -EINVAL; goto out; } + tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); + tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; @@ -500,29 +512,24 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (!info->attrs[L2TP_ATTR_SESSION_ID]) { ret = -EINVAL; - goto out; + goto out_tunnel; } session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); - session = l2tp_session_find(net, tunnel, session_id); - if (session) { - ret = -EEXIST; - goto out; - } if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) { ret = -EINVAL; - goto out; + goto out_tunnel; } peer_session_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PW_TYPE]) { ret = -EINVAL; - goto out; + goto out_tunnel; } cfg.pw_type = nla_get_u16(info->attrs[L2TP_ATTR_PW_TYPE]); if (cfg.pw_type >= __L2TP_PWTYPE_MAX) { ret = -EINVAL; - goto out; + goto out_tunnel; } if (tunnel->version > 2) { @@ -544,7 +551,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); if (len > 8) { ret = -EINVAL; - goto out; + goto out_tunnel; } cfg.cookie_len = len; memcpy(&cfg.cookie[0], nla_data(info->attrs[L2TP_ATTR_COOKIE]), len); @@ -553,7 +560,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]); if (len > 8) { ret = -EINVAL; - goto out; + goto out_tunnel; } cfg.peer_cookie_len = len; memcpy(&cfg.peer_cookie[0], nla_data(info->attrs[L2TP_ATTR_PEER_COOKIE]), len); @@ -596,7 +603,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) || (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) { ret = -EPROTONOSUPPORT; - goto out; + goto out_tunnel; } /* Check that pseudowire-specific params are present */ @@ -606,7 +613,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf case L2TP_PWTYPE_ETH_VLAN: if (!info->attrs[L2TP_ATTR_VLAN_ID]) { ret = -EINVAL; - goto out; + goto out_tunnel; } break; case L2TP_PWTYPE_ETH: @@ -620,18 +627,22 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf break; } - ret = -EPROTONOSUPPORT; - if (l2tp_nl_cmd_ops[cfg.pw_type]->session_create) - ret = (*l2tp_nl_cmd_ops[cfg.pw_type]->session_create)(net, tunnel_id, - session_id, peer_session_id, &cfg); + ret = l2tp_nl_cmd_ops[cfg.pw_type]->session_create(net, tunnel, + session_id, + peer_session_id, + &cfg); if (ret >= 0) { - session = l2tp_session_find(net, tunnel, session_id); - if (session) + session = l2tp_session_get(net, tunnel, session_id, false); + if (session) { ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_CREATE); + l2tp_session_dec_refcount(session); + } } +out_tunnel: + l2tp_tunnel_dec_refcount(tunnel); out: return ret; } @@ -642,7 +653,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf struct l2tp_session *session; u16 pw_type; - session = l2tp_nl_session_find(info); + session = l2tp_nl_session_get(info, true); if (session == NULL) { ret = -ENODEV; goto out; @@ -656,6 +667,10 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session); + if (session->deref) + session->deref(session); + l2tp_session_dec_refcount(session); + out: return ret; } @@ -665,7 +680,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf int ret = 0; struct l2tp_session *session; - session = l2tp_nl_session_find(info); + session = l2tp_nl_session_get(info, false); if (session == NULL) { ret = -ENODEV; goto out; @@ -700,6 +715,8 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_MODIFY); + l2tp_session_dec_refcount(session); + out: return ret; } @@ -786,29 +803,34 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; int ret; - session = l2tp_nl_session_find(info); + session = l2tp_nl_session_get(info, false); if (session == NULL) { ret = -ENODEV; - goto out; + goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; - goto out; + goto err_ref; } ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, 0, session, L2TP_CMD_SESSION_GET); if (ret < 0) - goto err_out; + goto err_ref_msg; - return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); + ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); -err_out: - nlmsg_free(msg); + l2tp_session_dec_refcount(session); -out: + return ret; + +err_ref_msg: + nlmsg_free(msg); +err_ref: + l2tp_session_dec_refcount(session); +err: return ret; } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index d3f1222c1a8c..8ff5352bb0e3 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -122,8 +122,11 @@ struct pppol2tp_session { int owner; /* pid that opened the socket */ - struct sock *sock; /* Pointer to the session + struct mutex sk_lock; /* Protects .sk */ + struct sock __rcu *sk; /* Pointer to the session * PPPoX socket */ + struct sock *__sk; /* Copy of .sk, for cleanup */ + struct rcu_head rcu; /* For asynchronous release */ struct sock *tunnel_sock; /* Pointer to the tunnel UDP * socket */ int flags; /* accessed by PPPIOCGFLAGS. @@ -138,6 +141,24 @@ static const struct ppp_channel_ops pppol2tp_chan_ops = { static const struct proto_ops pppol2tp_ops; +/* Retrieves the pppol2tp socket associated to a session. + * A reference is held on the returned socket, so this function must be paired + * with sock_put(). + */ +static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session) +{ + struct pppol2tp_session *ps = l2tp_session_priv(session); + struct sock *sk; + + rcu_read_lock(); + sk = rcu_dereference(ps->sk); + if (sk) + sock_hold(sk); + rcu_read_unlock(); + + return sk; +} + /* Helpers to obtain tunnel/session contexts from sockets. */ static inline struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) @@ -177,7 +198,7 @@ static int pppol2tp_recv_payload_hook(struct sk_buff *skb) if (!pskb_may_pull(skb, 2)) return 1; - if ((skb->data[0] == 0xff) && (skb->data[1] == 0x03)) + if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI)) skb_pull(skb, 2); return 0; @@ -224,13 +245,14 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int /* If the socket is bound, send it in to PPP's input queue. Otherwise * queue it on the session socket. */ - sk = ps->sock; + rcu_read_lock(); + sk = rcu_dereference(ps->sk); if (sk == NULL) goto no_sock; if (sk->sk_state & PPPOX_BOUND) { struct pppox_sock *po; - l2tp_dbg(session, PPPOL2TP_MSG_DATA, + l2tp_dbg(session, L2TP_MSG_DATA, "%s: recv %d byte data frame, passing to ppp\n", session->name, data_len); @@ -253,7 +275,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int po = pppox_sk(sk); ppp_input(&po->chan, skb); } else { - l2tp_dbg(session, PPPOL2TP_MSG_DATA, + l2tp_dbg(session, L2TP_MSG_DATA, "%s: recv %d byte data frame, passing to L2TP socket\n", session->name, data_len); @@ -262,30 +284,16 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int kfree_skb(skb); } } + rcu_read_unlock(); return; no_sock: - l2tp_info(session, PPPOL2TP_MSG_DATA, "%s: no socket\n", session->name); + rcu_read_unlock(); + l2tp_info(session, L2TP_MSG_DATA, "%s: no socket\n", session->name); kfree_skb(skb); } -static void pppol2tp_session_sock_hold(struct l2tp_session *session) -{ - struct pppol2tp_session *ps = l2tp_session_priv(session); - - if (ps->sock) - sock_hold(ps->sock); -} - -static void pppol2tp_session_sock_put(struct l2tp_session *session) -{ - struct pppol2tp_session *ps = l2tp_session_priv(session); - - if (ps->sock) - sock_put(ps->sock); -} - /************************************************************************ * Transmit handling ***********************************************************************/ @@ -297,7 +305,6 @@ static void pppol2tp_session_sock_put(struct l2tp_session *session) static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { - static const unsigned char ppph[2] = { 0xff, 0x03 }; struct sock *sk = sock->sk; struct sk_buff *skb; int error; @@ -327,7 +334,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, error = -ENOMEM; skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len + - sizeof(ppph) + total_len, + 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ 0, GFP_KERNEL); if (!skb) goto error_put_sess_tun; @@ -340,8 +347,8 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, skb_reserve(skb, uhlen); /* Add PPP header */ - skb->data[0] = ppph[0]; - skb->data[1] = ppph[1]; + skb->data[0] = PPP_ALLSTATIONS; + skb->data[1] = PPP_UI; skb_put(skb, 2); /* Copy user data into skb */ @@ -384,7 +391,6 @@ error: */ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { - static const u8 ppph[2] = { 0xff, 0x03 }; struct sock *sk = (struct sock *) chan->private; struct sock *sk_tun; struct l2tp_session *session; @@ -413,14 +419,14 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) sizeof(struct iphdr) + /* IP header */ uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */ session->hdr_len + /* L2TP header */ - sizeof(ppph); /* PPP header */ + 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ if (skb_cow_head(skb, headroom)) goto abort_put_sess_tun; /* Setup PPP header */ - __skb_push(skb, sizeof(ppph)); - skb->data[0] = ppph[0]; - skb->data[1] = ppph[1]; + __skb_push(skb, 2); + skb->data[0] = PPP_ALLSTATIONS; + skb->data[1] = PPP_UI; local_bh_disable(); l2tp_xmit_skb(session, skb, session->hdr_len); @@ -448,16 +454,15 @@ abort: */ static void pppol2tp_session_close(struct l2tp_session *session) { - struct pppol2tp_session *ps = l2tp_session_priv(session); - struct sock *sk = ps->sock; - struct socket *sock = sk->sk_socket; + struct sock *sk; BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) { - inet_shutdown(sock, 2); - /* Don't let the session go away before our socket does */ - l2tp_session_inc_refcount(session); + sk = pppol2tp_session_get_sock(session); + if (sk) { + if (sk->sk_socket) + inet_shutdown(sk->sk_socket, SEND_SHUTDOWN); + sock_put(sk); } } @@ -478,6 +483,14 @@ static void pppol2tp_session_destruct(struct sock *sk) } } +static void pppol2tp_put_sk(struct rcu_head *head) +{ + struct pppol2tp_session *ps; + + ps = container_of(head, typeof(*ps), rcu); + sock_put(ps->__sk); +} + /* Called when the PPPoX socket (session) is closed. */ static int pppol2tp_release(struct socket *sock) @@ -503,11 +516,23 @@ static int pppol2tp_release(struct socket *sock) session = pppol2tp_sock_to_session(sk); - /* Purge any queued data */ if (session != NULL) { - __l2tp_session_unhash(session); - l2tp_session_queue_purge(session); - sock_put(sk); + struct pppol2tp_session *ps; + + l2tp_session_delete(session); + + ps = l2tp_session_priv(session); + mutex_lock(&ps->sk_lock); + ps->__sk = rcu_dereference_protected(ps->sk, + lockdep_is_held(&ps->sk_lock)); + RCU_INIT_POINTER(ps->sk, NULL); + mutex_unlock(&ps->sk_lock); + call_rcu(&ps->rcu, pppol2tp_put_sk); + + /* Rely on the sock_put() call at the end of the function for + * dropping the reference held by pppol2tp_sock_to_session(). + * The last reference will be dropped by pppol2tp_put_sk(). + */ } release_sock(sk); @@ -574,16 +599,47 @@ out: static void pppol2tp_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; - struct pppol2tp_session *ps = l2tp_session_priv(session); + struct sock *sk; + + sk = pppol2tp_session_get_sock(session); + if (sk) { + struct pppox_sock *po = pppox_sk(sk); - if (ps) { - struct pppox_sock *po = pppox_sk(ps->sock); - if (po) - seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); + seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); + sock_put(sk); } } #endif +static void pppol2tp_session_init(struct l2tp_session *session) +{ + struct pppol2tp_session *ps; + struct dst_entry *dst; + + session->recv_skb = pppol2tp_recv; + session->session_close = pppol2tp_session_close; +#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) + session->show = pppol2tp_show; +#endif + + ps = l2tp_session_priv(session); + mutex_init(&ps->sk_lock); + ps->tunnel_sock = session->tunnel->sock; + ps->owner = current->pid; + + /* If PMTU discovery was enabled, use the MTU that was discovered */ + dst = sk_dst_get(session->tunnel->sock); + if (dst) { + u32 pmtu = dst_mtu(dst); + + if (pmtu) { + session->mtu = pmtu - PPPOL2TP_HEADER_OVERHEAD; + session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD; + } + dst_release(dst); + } +} + /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, @@ -595,11 +651,11 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, struct l2tp_session *session = NULL; struct l2tp_tunnel *tunnel; struct pppol2tp_session *ps; - struct dst_entry *dst; struct l2tp_session_cfg cfg = { 0, }; int error = 0; u32 tunnel_id, peer_tunnel_id; u32 session_id, peer_session_id; + bool drop_refcnt = false; int ver = 2; int fd; @@ -708,65 +764,53 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (tunnel->peer_tunnel_id == 0) tunnel->peer_tunnel_id = peer_tunnel_id; - /* Create session if it doesn't already exist. We handle the - * case where a session was previously created by the netlink - * interface by checking that the session doesn't already have - * a socket and its tunnel socket are what we expect. If any - * of those checks fail, return EEXIST to the caller. - */ - session = l2tp_session_find(sock_net(sk), tunnel, session_id); - if (session == NULL) { - /* Default MTU must allow space for UDP/L2TP/PPP - * headers. + session = l2tp_session_get(sock_net(sk), tunnel, session_id, false); + if (session) { + drop_refcnt = true; + ps = l2tp_session_priv(session); + + /* Using a pre-existing session is fine as long as it hasn't + * been connected yet. */ - cfg.mtu = cfg.mru = 1500 - PPPOL2TP_HEADER_OVERHEAD; + mutex_lock(&ps->sk_lock); + if (rcu_dereference_protected(ps->sk, + lockdep_is_held(&ps->sk_lock))) { + mutex_unlock(&ps->sk_lock); + error = -EEXIST; + goto end; + } + + /* consistency checks */ + if (ps->tunnel_sock != tunnel->sock) { + mutex_unlock(&ps->sk_lock); + error = -EEXIST; + goto end; + } + } else { + /* Default MTU must allow space for UDP/L2TP/PPP headers */ + cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; + cfg.mru = cfg.mtu; - /* Allocate and initialize a new session context. */ session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, session_id, peer_session_id, &cfg); - if (session == NULL) { - error = -ENOMEM; + if (IS_ERR(session)) { + error = PTR_ERR(session); goto end; } - } else { + + pppol2tp_session_init(session); ps = l2tp_session_priv(session); - error = -EEXIST; - if (ps->sock != NULL) - goto end; + l2tp_session_inc_refcount(session); - /* consistency checks */ - if (ps->tunnel_sock != tunnel->sock) + mutex_lock(&ps->sk_lock); + error = l2tp_session_register(session, tunnel); + if (error < 0) { + mutex_unlock(&ps->sk_lock); + kfree(session); goto end; - } - - /* Associate session with its PPPoL2TP socket */ - ps = l2tp_session_priv(session); - ps->owner = current->pid; - ps->sock = sk; - ps->tunnel_sock = tunnel->sock; - - session->recv_skb = pppol2tp_recv; - session->session_close = pppol2tp_session_close; -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) - session->show = pppol2tp_show; -#endif - - /* We need to know each time a skb is dropped from the reorder - * queue. - */ - session->ref = pppol2tp_session_sock_hold; - session->deref = pppol2tp_session_sock_put; - - /* If PMTU discovery was enabled, use the MTU that was discovered */ - dst = sk_dst_get(tunnel->sock); - if (dst != NULL) { - u32 pmtu = dst_mtu(dst); - - if (pmtu != 0) - session->mtu = session->mru = pmtu - - PPPOL2TP_HEADER_OVERHEAD; - dst_release(dst); + } + drop_refcnt = true; } /* Special case: if source & dest session_id == 0x0000, this @@ -791,17 +835,30 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, po->chan.mtu = session->mtu; error = ppp_register_net_channel(sock_net(sk), &po->chan); - if (error) + if (error) { + mutex_unlock(&ps->sk_lock); goto end; + } out_no_ppp: /* This is how we get the session context from the socket. */ sk->sk_user_data = session; + rcu_assign_pointer(ps->sk, sk); + mutex_unlock(&ps->sk_lock); + + /* Keep the reference we've grabbed on the session: sk doesn't expect + * the session to disappear. pppol2tp_session_destruct() is responsible + * for dropping it. + */ + drop_refcnt = false; + sk->sk_state = PPPOX_CONNECTED; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n", session->name); end: + if (drop_refcnt) + l2tp_session_dec_refcount(session); release_sock(sk); return error; @@ -809,31 +866,19 @@ end: #ifdef CONFIG_L2TP_V3 -/* Called when creating sessions via the netlink interface. - */ -static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) +/* Called when creating sessions via the netlink interface. */ +static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, + u32 session_id, u32 peer_session_id, + struct l2tp_session_cfg *cfg) { int error; - struct l2tp_tunnel *tunnel; struct l2tp_session *session; - struct pppol2tp_session *ps; - - tunnel = l2tp_tunnel_find(net, tunnel_id); - - /* Error if we can't find the tunnel */ - error = -ENOENT; - if (tunnel == NULL) - goto out; /* Error if tunnel socket is not prepped */ - if (tunnel->sock == NULL) - goto out; - - /* Check that this session doesn't already exist */ - error = -EEXIST; - session = l2tp_session_find(net, tunnel, session_id); - if (session != NULL) - goto out; + if (!tunnel->sock) { + error = -ENOENT; + goto err; + } /* Default MTU values. */ if (cfg->mtu == 0) @@ -842,22 +887,25 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i cfg->mru = cfg->mtu; /* Allocate and initialize a new session context. */ - error = -ENOMEM; session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, session_id, peer_session_id, cfg); - if (session == NULL) - goto out; + if (IS_ERR(session)) { + error = PTR_ERR(session); + goto err; + } - ps = l2tp_session_priv(session); - ps->tunnel_sock = tunnel->sock; + pppol2tp_session_init(session); - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n", - session->name); + error = l2tp_session_register(session, tunnel); + if (error < 0) + goto err_sess; - error = 0; + return 0; -out: +err_sess: + kfree(session); +err: return error; } @@ -889,10 +937,8 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, pls = l2tp_session_priv(session); tunnel = l2tp_sock_to_tunnel(pls->tunnel_sock); - if (tunnel == NULL) { - error = -EBADF; + if (tunnel == NULL) goto end_put_sess; - } inet = inet_sk(tunnel->sock); if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) { @@ -970,12 +1016,11 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, } *usockaddr_len = len; + error = 0; sock_put(pls->tunnel_sock); end_put_sess: sock_put(sk); - error = 0; - end: return error; } @@ -1017,16 +1062,14 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, struct l2tp_tunnel *tunnel = session->tunnel; struct pppol2tp_ioc_stats stats; - l2tp_dbg(session, PPPOL2TP_MSG_CONTROL, + l2tp_dbg(session, L2TP_MSG_CONTROL, "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n", session->name, cmd, arg); - sk = ps->sock; + sk = pppol2tp_session_get_sock(session); if (!sk) return -EBADR; - sock_hold(sk); - switch (cmd) { case SIOCGIFMTU: err = -ENXIO; @@ -1040,7 +1083,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq))) break; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mtu=%d\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mtu=%d\n", session->name, session->mtu); err = 0; break; @@ -1056,7 +1099,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, session->mtu = ifr.ifr_mtu; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mtu=%d\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mtu=%d\n", session->name, session->mtu); err = 0; break; @@ -1070,7 +1113,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, if (put_user(session->mru, (int __user *) arg)) break; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mru=%d\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mru=%d\n", session->name, session->mru); err = 0; break; @@ -1085,7 +1128,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, break; session->mru = val; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mru=%d\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mru=%d\n", session->name, session->mru); err = 0; break; @@ -1095,7 +1138,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, if (put_user(ps->flags, (int __user *) arg)) break; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get flags=%d\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: get flags=%d\n", session->name, ps->flags); err = 0; break; @@ -1105,7 +1148,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, if (get_user(val, (int __user *) arg)) break; ps->flags = val; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set flags=%d\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: set flags=%d\n", session->name, ps->flags); err = 0; break; @@ -1122,7 +1165,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, if (copy_to_user((void __user *) arg, &stats, sizeof(stats))) break; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: get L2TP stats\n", session->name); err = 0; break; @@ -1150,7 +1193,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, struct sock *sk; struct pppol2tp_ioc_stats stats; - l2tp_dbg(tunnel, PPPOL2TP_MSG_CONTROL, + l2tp_dbg(tunnel, L2TP_MSG_CONTROL, "%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n", tunnel->name, cmd, arg); @@ -1171,11 +1214,18 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, if (stats.session_id != 0) { /* resend to session ioctl handler */ struct l2tp_session *session = - l2tp_session_find(sock_net(sk), tunnel, stats.session_id); - if (session != NULL) - err = pppol2tp_session_ioctl(session, cmd, arg); - else + l2tp_session_get(sock_net(sk), tunnel, + stats.session_id, true); + + if (session) { + err = pppol2tp_session_ioctl(session, cmd, + arg); + if (session->deref) + session->deref(session); + l2tp_session_dec_refcount(session); + } else { err = -EBADR; + } break; } #ifdef CONFIG_XFRM @@ -1186,7 +1236,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, err = -EFAULT; break; } - l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n", + l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get L2TP stats\n", tunnel->name); err = 0; break; @@ -1276,7 +1326,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_DEBUG: tunnel->debug = val; - l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n", + l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: set debug=%x\n", tunnel->name, tunnel->debug); break; @@ -1295,7 +1345,6 @@ static int pppol2tp_session_setsockopt(struct sock *sk, int optname, int val) { int err = 0; - struct pppol2tp_session *ps = l2tp_session_priv(session); switch (optname) { case PPPOL2TP_SO_RECVSEQ: @@ -1304,7 +1353,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; } session->recv_seq = val ? -1 : 0; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, + l2tp_info(session, L2TP_MSG_CONTROL, "%s: set recv_seq=%d\n", session->name, session->recv_seq); break; @@ -1316,13 +1365,13 @@ static int pppol2tp_session_setsockopt(struct sock *sk, } session->send_seq = val ? -1 : 0; { - struct sock *ssk = ps->sock; - struct pppox_sock *po = pppox_sk(ssk); + struct pppox_sock *po = pppox_sk(sk); + po->chan.hdrlen = val ? PPPOL2TP_L2TP_HDR_SIZE_SEQ : PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; } l2tp_session_set_header_len(session, session->tunnel->version); - l2tp_info(session, PPPOL2TP_MSG_CONTROL, + l2tp_info(session, L2TP_MSG_CONTROL, "%s: set send_seq=%d\n", session->name, session->send_seq); break; @@ -1333,20 +1382,20 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; } session->lns_mode = val ? -1 : 0; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, + l2tp_info(session, L2TP_MSG_CONTROL, "%s: set lns_mode=%d\n", session->name, session->lns_mode); break; case PPPOL2TP_SO_DEBUG: session->debug = val; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: set debug=%x\n", session->name, session->debug); break; case PPPOL2TP_SO_REORDERTO: session->reorder_timeout = msecs_to_jiffies(val); - l2tp_info(session, PPPOL2TP_MSG_CONTROL, + l2tp_info(session, L2TP_MSG_CONTROL, "%s: set reorder_timeout=%d\n", session->name, session->reorder_timeout); break; @@ -1427,7 +1476,7 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_DEBUG: *val = tunnel->debug; - l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get debug=%x\n", + l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get debug=%x\n", tunnel->name, tunnel->debug); break; @@ -1450,31 +1499,31 @@ static int pppol2tp_session_getsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_RECVSEQ: *val = session->recv_seq; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, + l2tp_info(session, L2TP_MSG_CONTROL, "%s: get recv_seq=%d\n", session->name, *val); break; case PPPOL2TP_SO_SENDSEQ: *val = session->send_seq; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, + l2tp_info(session, L2TP_MSG_CONTROL, "%s: get send_seq=%d\n", session->name, *val); break; case PPPOL2TP_SO_LNSMODE: *val = session->lns_mode; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, + l2tp_info(session, L2TP_MSG_CONTROL, "%s: get lns_mode=%d\n", session->name, *val); break; case PPPOL2TP_SO_DEBUG: *val = session->debug; - l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get debug=%d\n", + l2tp_info(session, L2TP_MSG_CONTROL, "%s: get debug=%d\n", session->name, *val); break; case PPPOL2TP_SO_REORDERTO: *val = (int) jiffies_to_msecs(session->reorder_timeout); - l2tp_info(session, PPPOL2TP_MSG_CONTROL, + l2tp_info(session, L2TP_MSG_CONTROL, "%s: get reorder_timeout=%d\n", session->name, *val); break; @@ -1653,8 +1702,9 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) { struct l2tp_session *session = v; struct l2tp_tunnel *tunnel = session->tunnel; - struct pppol2tp_session *ps = l2tp_session_priv(session); - struct pppox_sock *po = pppox_sk(ps->sock); + unsigned char state; + char user_data_ok; + struct sock *sk; u32 ip = 0; u16 port = 0; @@ -1664,6 +1714,15 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) port = ntohs(inet->inet_sport); } + sk = pppol2tp_session_get_sock(session); + if (sk) { + state = sk->sk_state; + user_data_ok = (session == sk->sk_user_data) ? 'Y' : 'N'; + } else { + state = 0; + user_data_ok = 'N'; + } + seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> " "%04X/%04X %d %c\n", session->name, ip, port, @@ -1671,9 +1730,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) session->session_id, tunnel->peer_tunnel_id, session->peer_session_id, - ps->sock->sk_state, - (session == ps->sock->sk_user_data) ? - 'Y' : 'N'); + state, user_data_ok); seq_printf(m, " %d/%d/%c/%c/%s %08x %u\n", session->mtu, session->mru, session->recv_seq ? 'R' : '-', @@ -1690,8 +1747,12 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) atomic_long_read(&session->stats.rx_bytes), atomic_long_read(&session->stats.rx_errors)); - if (po) + if (sk) { + struct pppox_sock *po = pppox_sk(sk); + seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); + sock_put(sk); + } } static int pppol2tp_seq_show(struct seq_file *m, void *v) diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c index ba4d015bd1a6..7cbb77b7479a 100644 --- a/net/lapb/lapb_out.c +++ b/net/lapb/lapb_out.c @@ -87,7 +87,8 @@ void lapb_kick(struct lapb_cb *lapb) skb = skb_dequeue(&lapb->write_queue); do { - if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skbn = skb_copy(skb, GFP_ATOMIC); + if (!skbn) { skb_queue_head(&lapb->write_queue, skb); break; } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c153fc2883a8..82b07bc43071 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -96,8 +96,16 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) { u8 rc = LLC_PDU_LEN_U; - if (addr->sllc_test || addr->sllc_xid) + if (addr->sllc_test) rc = LLC_PDU_LEN_U; + else if (addr->sllc_xid) + /* We need to expand header to sizeof(struct llc_xid_info) + * since llc_pdu_init_as_xid_cmd() sets 4,5,6 bytes of LLC header + * as XID PDU. In llc_ui_sendmsg() we reserved header size and then + * filled all other space with user data. If we won't reserve this + * bytes, llc_pdu_init_as_xid_cmd() will overwrite user data + */ + rc = LLC_PDU_LEN_U_XID; else if (sk->sk_type == SOCK_STREAM) rc = LLC_PDU_LEN_I; return rc; @@ -111,22 +119,26 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) * * Send data via reliable llc2 connection. * Returns 0 upon success, non-zero if action did not succeed. + * + * This function always consumes a reference to the skb. */ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) { struct llc_sock* llc = llc_sk(sk); - int rc = 0; if (unlikely(llc_data_accept_state(llc->state) || llc->remote_busy_flag || llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); + int rc; rc = llc_ui_wait_for_busy_core(sk, timeout); + if (rc) { + kfree_skb(skb); + return rc; + } } - if (unlikely(!rc)) - rc = llc_build_and_send_pkt(sk, skb); - return rc; + return llc_build_and_send_pkt(sk, skb); } static void llc_ui_sk_init(struct socket *sock, struct sock *sk) @@ -267,6 +279,10 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) if (!sock_flag(sk, SOCK_ZAPPED)) goto out; + if (!addr->sllc_arphrd) + addr->sllc_arphrd = ARPHRD_ETHER; + if (addr->sllc_arphrd != ARPHRD_ETHER) + goto out; rc = -ENODEV; if (sk->sk_bound_dev_if) { llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); @@ -324,15 +340,15 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; - if (unlikely(addr->sllc_family != AF_LLC)) + if (!addr->sllc_arphrd) + addr->sllc_arphrd = ARPHRD_ETHER; + if (unlikely(addr->sllc_family != AF_LLC || addr->sllc_arphrd != ARPHRD_ETHER)) goto out; rc = -ENODEV; rcu_read_lock(); if (sk->sk_bound_dev_if) { llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); if (llc->dev) { - if (!addr->sllc_arphrd) - addr->sllc_arphrd = llc->dev->type; if (is_zero_ether_addr(addr->sllc_mac)) memcpy(addr->sllc_mac, llc->dev->dev_addr, IFHWADDRLEN); @@ -896,7 +912,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; + struct sk_buff *skb = NULL; size_t size = 0; int rc = -EINVAL, copied = 0, hdrlen; @@ -905,10 +921,10 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) - goto release; + goto out; } else { if (llc_ui_addr_null(&llc->addr)) - goto release; + goto out; addr = &llc->addr; } /* must bind connection to sap if user hasn't done it. */ @@ -916,7 +932,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) /* bind to sap with null dev, exclusive. */ rc = llc_ui_autobind(sock, addr); if (rc) - goto release; + goto out; } hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); size = hdrlen + len; @@ -925,12 +941,12 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) copied = size - hdrlen; rc = -EINVAL; if (copied < 0) - goto release; + goto out; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) - goto release; + goto out; skb->dev = llc->dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hdrlen); @@ -940,29 +956,31 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } if (addr->sllc_test) { llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } if (addr->sllc_xid) { llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } rc = -ENOPROTOOPT; if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) goto out; rc = llc_ui_send_data(sk, skb, noblock); + skb = NULL; out: - if (rc) { - kfree_skb(skb); -release: + kfree_skb(skb); + if (rc) dprintk("%s: failed sending from %02X to %02X: %d\n", __func__, llc->laddr.lsap, llc->daddr.lsap, rc); - } release_sock(sk); return rc ? : copied; } diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index d861b74ad068..3b002ab68b29 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -55,6 +55,8 @@ int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ; * (executing it's actions and changing state), upper layer will be * indicated or confirmed, if needed. Returns 0 for success, 1 for * failure. The socket lock has to be held before calling this function. + * + * This function always consumes a reference to the skb. */ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) { @@ -62,12 +64,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(skb->sk); struct llc_conn_state_ev *ev = llc_conn_ev(skb); - /* - * We have to hold the skb, because llc_conn_service will kfree it in - * the sending path and we need to look at the skb->cb, where we encode - * llc_conn_state_ev. - */ - skb_get(skb); ev->ind_prim = ev->cfm_prim = 0; /* * Send event to state machine @@ -75,21 +71,12 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) rc = llc_conn_service(skb->sk, skb); if (unlikely(rc != 0)) { printk(KERN_ERR "%s: llc_conn_service failed\n", __func__); - goto out_kfree_skb; - } - - if (unlikely(!ev->ind_prim && !ev->cfm_prim)) { - /* indicate or confirm not required */ - if (!skb->next) - goto out_kfree_skb; goto out_skb_put; } - if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia */ - skb_get(skb); - switch (ev->ind_prim) { case LLC_DATA_PRIM: + skb_get(skb); llc_save_primitive(sk, skb, LLC_DATA_PRIM); if (unlikely(sock_queue_rcv_skb(sk, skb))) { /* @@ -106,6 +93,7 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) * skb->sk pointing to the newly created struct sock in * llc_conn_handler. -acme */ + skb_get(skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); break; @@ -121,7 +109,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) sk->sk_state_change(sk); } } - kfree_skb(skb); sock_put(sk); break; case LLC_RESET_PRIM: @@ -130,14 +117,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) * RESET is not being notified to upper layers for now */ printk(KERN_INFO "%s: received a reset ind!\n", __func__); - kfree_skb(skb); break; default: - if (ev->ind_prim) { + if (ev->ind_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->ind_prim); - kfree_skb(skb); - } /* No indication */ break; } @@ -179,15 +163,12 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) printk(KERN_INFO "%s: received a reset conf!\n", __func__); break; default: - if (ev->cfm_prim) { + if (ev->cfm_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->cfm_prim); - break; - } - goto out_skb_put; /* No confirmation */ + /* No confirmation */ + break; } -out_kfree_skb: - kfree_skb(skb); out_skb_put: kfree_skb(skb); return rc; diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 6daf391b3e84..fc4d2bd8816f 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -38,6 +38,8 @@ * closed and -EBUSY when sending data is not permitted in this state or * LLC has send an I pdu with p bit set to 1 and is waiting for it's * response. + * + * This function always consumes a reference to the skb. */ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) { @@ -46,20 +48,22 @@ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); if (unlikely(llc->state == LLC_CONN_STATE_ADM)) - goto out; + goto out_free; rc = -EBUSY; if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */ llc->p_flag)) { llc->failed_data_req = 1; - goto out; + goto out_free; } ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PRIM; ev->prim = LLC_DATA_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; skb->dev = llc->dev; - rc = llc_conn_state_process(sk, skb); -out: + return llc_conn_state_process(sk, skb); + +out_free: + kfree_skb(skb); return rc; } diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index 7ae4cc684d3a..9fa3342c7a82 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -79,7 +79,7 @@ int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) struct llc_sap_state_ev *ev = llc_sap_ev(skb); int rc; - llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, + llc_pdu_header_init(skb, LLC_PDU_TYPE_U_XID, ev->saddr.lsap, ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 204a8351efff..c29170e767a8 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -32,7 +32,7 @@ static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb) return LLC_PDU_IS_CMD(pdu) && /* command PDU */ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID && - !pdu->dsap ? 0 : 1; /* NULL DSAP value */ + !pdu->dsap; /* NULL DSAP value */ } static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb) @@ -42,7 +42,7 @@ static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb) return LLC_PDU_IS_CMD(pdu) && /* command PDU */ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST && - !pdu->dsap ? 0 : 1; /* NULL DSAP */ + !pdu->dsap; /* NULL DSAP */ } static int llc_station_ac_send_xid_r(struct sk_buff *skb) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 4932e9f243a2..3d49ffe8a34d 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -109,7 +109,7 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.addba_req.start_seq_num = cpu_to_le16(start_seq_num << 4); - ieee80211_tx_skb(sdata, skb); + ieee80211_tx_skb_tid(sdata, skb, tid); } void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1999a7eaa692..c5be6bf2f00d 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -875,50 +875,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) return 0; } -/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ -struct iapp_layer2_update { - u8 da[ETH_ALEN]; /* broadcast */ - u8 sa[ETH_ALEN]; /* STA addr */ - __be16 len; /* 6 */ - u8 dsap; /* 0 */ - u8 ssap; /* 0 */ - u8 control; - u8 xid_info[3]; -} __packed; - -static void ieee80211_send_layer2_update(struct sta_info *sta) -{ - struct iapp_layer2_update *msg; - struct sk_buff *skb; - - /* Send Level 2 Update Frame to update forwarding tables in layer 2 - * bridge devices */ - - skb = dev_alloc_skb(sizeof(*msg)); - if (!skb) - return; - msg = (struct iapp_layer2_update *)skb_put(skb, sizeof(*msg)); - - /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) - * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ - - eth_broadcast_addr(msg->da); - memcpy(msg->sa, sta->sta.addr, ETH_ALEN); - msg->len = htons(6); - msg->dsap = 0; - msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ - msg->control = 0xaf; /* XID response lsb.1111F101. - * F=0 (no poll command; unsolicited frame) */ - msg->xid_info[0] = 0x81; /* XID format identifier */ - msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ - msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ - - skb->dev = sta->sdata->dev; - skb->protocol = eth_type_trans(skb, sta->sdata->dev); - memset(skb->cb, 0, sizeof(skb->cb)); - netif_rx_ni(skb); -} - static int sta_apply_auth_flags(struct ieee80211_local *local, struct sta_info *sta, u32 mask, u32 set) @@ -1045,7 +1001,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, int ret = 0; struct ieee80211_supported_band *sband; struct ieee80211_sub_if_data *sdata = sta->sdata; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); u32 mask, set; sband = local->hw.wiphy->bands[band]; @@ -1194,7 +1150,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, struct sta_info *sta; struct ieee80211_sub_if_data *sdata; int err; - int layer2_update; if (params->vlan) { sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); @@ -1248,18 +1203,12 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, test_sta_flag(sta, WLAN_STA_ASSOC)) rate_control_rate_init(sta); - layer2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sdata->vif.type == NL80211_IFTYPE_AP; - err = sta_info_insert_rcu(sta); if (err) { rcu_read_unlock(); return err; } - if (layer2_update) - ieee80211_send_layer2_update(sta); - rcu_read_unlock(); return 0; @@ -1367,7 +1316,9 @@ static int ieee80211_change_station(struct wiphy *wiphy, atomic_inc(&sta->sdata->bss->num_mcast_sta); } - ieee80211_send_layer2_update(sta); + if (sta->sta_state == IEEE80211_STA_AUTHORIZED) + cfg80211_send_layer2_update(sta->sdata->dev, + sta->sta.addr); } err = sta_apply_parameters(local, sta, params); @@ -1815,6 +1766,7 @@ static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev) ieee80211_stop_mesh(sdata); mutex_lock(&sdata->local->mtx); ieee80211_vif_release_channel(sdata); + kfree(sdata->u.mesh.ie); mutex_unlock(&sdata->local->mtx); return 0; @@ -1826,7 +1778,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, struct bss_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - enum ieee80211_band band; + enum nl80211_band band; u32 changed = 0; if (!sdata_dereference(sdata->u.ap.beacon, sdata)) @@ -1845,7 +1797,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (!sdata->vif.bss_conf.use_short_slot && - band == IEEE80211_BAND_5GHZ) { + band == NL80211_BAND_5GHZ) { sdata->vif.bss_conf.use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } @@ -2069,12 +2021,12 @@ static int ieee80211_leave_ocb(struct wiphy *wiphy, struct net_device *dev) } static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, - int rate[IEEE80211_NUM_BANDS]) + int rate[NUM_NL80211_BANDS]) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); memcpy(sdata->vif.bss_conf.mcast_rate, rate, - sizeof(int) * IEEE80211_NUM_BANDS); + sizeof(int) * NUM_NL80211_BANDS); return 0; } @@ -2479,7 +2431,7 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, return ret; } - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband = wiphy->bands[i]; int j; @@ -2496,14 +2448,14 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, continue; for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { - if (~sdata->rc_rateidx_mcs_mask[i][j]) { + if (sdata->rc_rateidx_mcs_mask[i][j] != 0xff) { sdata->rc_has_mcs_mask[i] = true; break; } } for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { - if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) { + if (sdata->rc_rateidx_vht_mcs_mask[i][j] != 0xffff) { sdata->rc_has_vht_mcs_mask[i] = true; break; } @@ -3580,7 +3532,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_tx_info *info; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; - enum ieee80211_band band; + enum nl80211_band band; int ret; /* the lock is needed to assign the cookie later */ diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index e546a987a9d3..3e24d0ddb51b 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -125,6 +125,7 @@ static const char *hw_flag_names[] = { FLAG(TDLS_WIDER_BW), FLAG(SUPPORTS_AMSDU_IN_AMPDU), FLAG(BEACON_TX_STATUS), + FLAG(NEEDS_UNIQUE_STA_ADDR), #undef FLAG }; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 37ea30e0754c..a5ba739cd2a7 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -169,21 +169,21 @@ static ssize_t ieee80211_if_write_##name(struct file *file, \ IEEE80211_IF_FILE_R(name) /* common attributes */ -IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[IEEE80211_BAND_2GHZ], +IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[NL80211_BAND_2GHZ], HEX); -IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[IEEE80211_BAND_5GHZ], +IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[NL80211_BAND_5GHZ], HEX); IEEE80211_IF_FILE(rc_rateidx_mcs_mask_2ghz, - rc_rateidx_mcs_mask[IEEE80211_BAND_2GHZ], HEXARRAY); + rc_rateidx_mcs_mask[NL80211_BAND_2GHZ], HEXARRAY); IEEE80211_IF_FILE(rc_rateidx_mcs_mask_5ghz, - rc_rateidx_mcs_mask[IEEE80211_BAND_5GHZ], HEXARRAY); + rc_rateidx_mcs_mask[NL80211_BAND_5GHZ], HEXARRAY); static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_2ghz( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) { int i, len = 0; - const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_2GHZ]; + const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[NL80211_BAND_2GHZ]; for (i = 0; i < NL80211_VHT_NSS_MAX; i++) len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]); @@ -199,7 +199,7 @@ static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_5ghz( char *buf, int buflen) { int i, len = 0; - const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_5GHZ]; + const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[NL80211_BAND_5GHZ]; for (i = 0; i < NL80211_VHT_NSS_MAX; i++) len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]); diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index df2e4e311217..5d097ae26b70 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -128,8 +128,11 @@ int drv_sta_state(struct ieee80211_local *local, } else if (old_state == IEEE80211_STA_AUTH && new_state == IEEE80211_STA_ASSOC) { ret = drv_sta_add(local, sdata, &sta->sta); - if (ret == 0) + if (ret == 0) { sta->uploaded = true; + if (rcu_access_pointer(sta->sta.rates)) + drv_sta_rate_tbl_update(local, sdata, &sta->sta); + } } else if (old_state == IEEE80211_STA_ASSOC && new_state == IEEE80211_STA_AUTH) { drv_sta_remove(local, sdata, &sta->sta); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 0a35dd68f1a1..95fcf57e6567 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -128,7 +128,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, } } - if (sband->band == IEEE80211_BAND_2GHZ) { + if (sband->band == NL80211_BAND_2GHZ) { *pos++ = WLAN_EID_DS_PARAMS; *pos++ = 1; *pos++ = ieee80211_frequency_to_channel( @@ -350,11 +350,11 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, * * HT follows these specifications (IEEE 802.11-2012 20.3.18) */ - sdata->vif.bss_conf.use_short_slot = chan->band == IEEE80211_BAND_5GHZ; + sdata->vif.bss_conf.use_short_slot = chan->band == NL80211_BAND_5GHZ; bss_change |= BSS_CHANGED_ERP_SLOT; /* cf. IEEE 802.11 9.2.12 */ - if (chan->band == IEEE80211_BAND_2GHZ && have_higher_than_11mbit) + if (chan->band == NL80211_BAND_2GHZ && have_higher_than_11mbit) sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; else sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; @@ -991,7 +991,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel) { struct sta_info *sta; - enum ieee80211_band band = rx_status->band; + enum nl80211_band band = rx_status->band; enum nl80211_bss_scan_width scan_width; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; @@ -1113,7 +1113,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel; u64 beacon_timestamp, rx_timestamp; u32 supp_rates = 0; - enum ieee80211_band band = rx_status->band; + enum nl80211_band band = rx_status->band; channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); if (!channel) @@ -1860,6 +1860,8 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) /* remove beacon */ kfree(sdata->u.ibss.ie); + sdata->u.ibss.ie = NULL; + sdata->u.ibss.ie_len = 0; /* on the next join, re-program HT parameters */ memset(&ifibss->ht_capa, 0, sizeof(ifibss->ht_capa)); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2527294f96c8..3a91f32d1eda 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -51,12 +51,6 @@ struct ieee80211_local; #define IEEE80211_ENCRYPT_HEADROOM 8 #define IEEE80211_ENCRYPT_TAILROOM 18 -/* IEEE 802.11 (Ch. 9.5 Defragmentation) requires support for concurrent - * reception of at least three fragmented frames. This limit can be increased - * by changing this define, at the cost of slower frame reassembly and - * increased memory use (about 2 kB of RAM per entry). */ -#define IEEE80211_FRAGMENT_MAX 4 - /* power level hasn't been configured (or set to automatic) */ #define IEEE80211_UNSET_POWER_LEVEL INT_MIN @@ -85,18 +79,6 @@ struct ieee80211_local; #define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) -struct ieee80211_fragment_entry { - struct sk_buff_head skb_list; - unsigned long first_frag_time; - u16 seq; - u16 extra_len; - u16 last_frag; - u8 rx_queue; - bool check_sequential_pn; /* needed for CCMP/GCMP */ - u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ -}; - - struct ieee80211_bss { u32 device_ts_beacon, device_ts_presp; @@ -236,8 +218,15 @@ struct ieee80211_rx_data { */ int security_idx; - u32 tkip_iv32; - u16 tkip_iv16; + union { + struct { + u32 iv32; + u16 iv16; + } tkip; + struct { + u8 pn[IEEE80211_CCMP_PN_LEN]; + } ccm_gcm; + }; }; struct ieee80211_csa_settings { @@ -835,9 +824,7 @@ struct ieee80211_sub_if_data { char name[IFNAMSIZ]; - /* Fragment table for host-based reassembly */ - struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX]; - unsigned int fragment_next; + struct ieee80211_fragment_cache frags; /* TID bitmap for NoAck policy */ u16 noack_map; @@ -895,13 +882,13 @@ struct ieee80211_sub_if_data { struct ieee80211_if_ap *bss; /* bitmap of allowed (non-MCS) rate indexes for rate control */ - u32 rc_rateidx_mask[IEEE80211_NUM_BANDS]; + u32 rc_rateidx_mask[NUM_NL80211_BANDS]; - bool rc_has_mcs_mask[IEEE80211_NUM_BANDS]; - u8 rc_rateidx_mcs_mask[IEEE80211_NUM_BANDS][IEEE80211_HT_MCS_MASK_LEN]; + bool rc_has_mcs_mask[NUM_NL80211_BANDS]; + u8 rc_rateidx_mcs_mask[NUM_NL80211_BANDS][IEEE80211_HT_MCS_MASK_LEN]; - bool rc_has_vht_mcs_mask[IEEE80211_NUM_BANDS]; - u16 rc_rateidx_vht_mcs_mask[IEEE80211_NUM_BANDS][NL80211_VHT_NSS_MAX]; + bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS]; + u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX]; union { struct ieee80211_if_ap ap; @@ -956,10 +943,10 @@ sdata_assert_lock(struct ieee80211_sub_if_data *sdata) lockdep_assert_held(&sdata->wdev.mtx); } -static inline enum ieee80211_band +static inline enum nl80211_band ieee80211_get_sdata_band(struct ieee80211_sub_if_data *sdata) { - enum ieee80211_band band = IEEE80211_BAND_2GHZ; + enum nl80211_band band = NL80211_BAND_2GHZ; struct ieee80211_chanctx_conf *chanctx_conf; rcu_read_lock(); @@ -1028,6 +1015,7 @@ enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_FLUSH, IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID, + IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE, IEEE80211_QUEUE_STOP_REASONS, }; @@ -1230,7 +1218,7 @@ struct ieee80211_local { struct cfg80211_scan_request __rcu *scan_req; struct ieee80211_scan_request *hw_scan_req; struct cfg80211_chan_def scan_chandef; - enum ieee80211_band hw_scan_band; + enum nl80211_band hw_scan_band; int scan_channel_idx; int scan_ies_len; int hw_scan_ies_bufsize; @@ -1707,12 +1695,16 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta); enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta); void ieee80211_sta_set_rx_nss(struct sta_info *sta); +enum ieee80211_sta_rx_bandwidth +ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width); +enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta); +void ieee80211_sta_set_rx_nss(struct sta_info *sta); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band); + enum nl80211_band band); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band); + enum nl80211_band band); void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, @@ -1740,7 +1732,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, */ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, - enum ieee80211_band current_band, + enum nl80211_band current_band, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie); @@ -1765,7 +1757,7 @@ static inline int __ieee80211_resume(struct ieee80211_hw *hw) /* utility functions/constants */ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ -int ieee80211_frame_duration(enum ieee80211_band band, size_t len, +int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble, int shift); void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, @@ -1775,12 +1767,12 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum ieee80211_band band); + enum nl80211_band band); static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum ieee80211_band band) + enum nl80211_band band) { rcu_read_lock(); __ieee80211_tx_skb_tid_band(sdata, skb, tid, band); @@ -1949,7 +1941,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, - enum ieee80211_band band, u32 *basic_rates); + enum nl80211_band band, u32 *basic_rates); int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode); int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata, @@ -1972,10 +1964,10 @@ int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const u8 *srates, int srates_len, u32 *rates); int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, - enum ieee80211_band band); + enum nl80211_band band); int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, - enum ieee80211_band band); + enum nl80211_band band); u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ @@ -2072,4 +2064,7 @@ extern const struct ethtool_ops ieee80211_ethtool_ops; #define debug_noinline #endif +void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache); +void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache); + #endif /* IEEE80211_I_H */ diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 3678f2d5fcfe..dceaad91c1e0 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1083,16 +1083,12 @@ static void ieee80211_set_multicast_list(struct net_device *dev) */ static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata) { - int i; - /* free extra data */ ieee80211_free_keys(sdata, false); ieee80211_debugfs_remove_netdev(sdata); - for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) - __skb_queue_purge(&sdata->fragments[i].skb_list); - sdata->fragment_next = 0; + ieee80211_destroy_frag_cache(&sdata->frags); if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_rmc_free(sdata); @@ -1508,6 +1504,10 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, if (ret) return ret; + ieee80211_stop_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE); + synchronize_net(); + ieee80211_do_stop(sdata, false); ieee80211_teardown_sdata(sdata); @@ -1528,6 +1528,8 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, err = ieee80211_do_open(&sdata->wdev, false); WARN(err, "type change: do_open returned %d", err); + ieee80211_wake_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE); return ret; } @@ -1782,8 +1784,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, sdata->wdev.wiphy = local->hw.wiphy; sdata->local = local; - for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) - skb_queue_head_init(&sdata->fragments[i].skb_list); + ieee80211_init_frag_cache(&sdata->frags); INIT_LIST_HEAD(&sdata->key_list); @@ -1792,7 +1793,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, INIT_DELAYED_WORK(&sdata->dec_tailroom_needed_wk, ieee80211_delayed_tailroom_dec); - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[i]; sdata->rc_rateidx_mask[i] = diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 91a4e606edcd..a2050d5776ce 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -646,6 +646,7 @@ int ieee80211_key_link(struct ieee80211_key *key, struct sta_info *sta) { struct ieee80211_local *local = sdata->local; + static atomic_t key_color = ATOMIC_INIT(0); struct ieee80211_key *old_key; int idx = key->conf.keyidx; bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; @@ -680,6 +681,12 @@ int ieee80211_key_link(struct ieee80211_key *key, key->sdata = sdata; key->sta = sta; + /* + * Assign a unique ID to every key so we can easily prevent mixed + * key and fragment cache attacks. + */ + key->color = atomic_inc_return(&key_color); + increment_tailroom_need_count(sdata); ieee80211_key_replace(sdata, sta, pairwise, old_key, key); diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 9951ef06323e..9ac5c00dbe80 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -123,6 +123,8 @@ struct ieee80211_key { } debugfs; #endif + unsigned int color; + /* * key config, must be last because it contains key * material as variable length member diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 15d23aeea634..3752e43ef41b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -808,7 +808,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); int result, i; - enum ieee80211_band band; + enum nl80211_band band; int channels, max_bitrates; bool supp_ht, supp_vht; netdev_features_t feature_whitelist; @@ -881,7 +881,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) max_bitrates = 0; supp_ht = false; supp_vht = false; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[band]; @@ -889,8 +889,19 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) continue; if (!dflt_chandef.chan) { + /* + * Assign the first enabled channel to dflt_chandef + * from the list of channels + */ + for (i = 0; i < sband->n_channels; i++) + if (!(sband->channels[i].flags & + IEEE80211_CHAN_DISABLED)) + break; + /* if none found then use the first anyway */ + if (i == sband->n_channels) + i = 0; cfg80211_chandef_create(&dflt_chandef, - &sband->channels[0], + &sband->channels[i], NL80211_CHAN_NO_HT); /* init channel we're on */ if (!local->use_chanctx && !local->_oper_chandef.chan) { @@ -948,7 +959,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (!local->int_scan_req) return -ENOMEM; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band]) continue; local->int_scan_req->rates[band] = (u32) -1; @@ -1008,8 +1019,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (local->hw.wiphy->max_scan_ie_len) local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len; - WARN_ON(!ieee80211_cs_list_valid(local->hw.cipher_schemes, - local->hw.n_cipher_schemes)); + if (WARN_ON(!ieee80211_cs_list_valid(local->hw.cipher_schemes, + local->hw.n_cipher_schemes))) { + result = -EINVAL; + goto fail_workqueue; + } result = ieee80211_init_cipher_suites(local); if (result < 0) diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 4bd8f3f056d8..8d33125518de 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -416,7 +416,7 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u8 *pos; @@ -479,7 +479,7 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u8 *pos; @@ -681,7 +681,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) struct ieee80211_mgmt *mgmt; struct ieee80211_chanctx_conf *chanctx_conf; struct mesh_csa_settings *csa; - enum ieee80211_band band; + enum nl80211_band band; u8 *pos; struct ieee80211_sub_if_data *sdata; int hdr_len = offsetof(struct ieee80211_mgmt, u.beacon) + @@ -930,7 +930,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings params; struct ieee80211_csa_ie csa_ie; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); int err; u32 sta_flags; @@ -1084,7 +1084,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel; size_t baselen; int freq; - enum ieee80211_band band = rx_status->band; + enum nl80211_band band = rx_status->band; /* ignore ProbeResp to foreign address */ if (stype == IEEE80211_STYPE_PROBE_RESP && @@ -1114,8 +1114,15 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; - if (mesh_matches_local(sdata, &elems)) - mesh_neighbour_update(sdata, mgmt->sa, &elems); + if (mesh_matches_local(sdata, &elems)) { + mpl_dbg(sdata, "rssi_threshold=%d,rx_status->signal=%d\n", + sdata->u.mesh.mshcfg.rssi_threshold, rx_status->signal); + if (!sdata->u.mesh.user_mpm || + sdata->u.mesh.mshcfg.rssi_threshold == 0 || + sdata->u.mesh.mshcfg.rssi_threshold < rx_status->signal) + mesh_neighbour_update(sdata, mgmt->sa, &elems, + rx_status); + } if (ifmsh->sync_ops) ifmsh->sync_ops->rx_bcn_presp(sdata, diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 4a8019f79fb2..7274e6719e8b 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -289,7 +289,8 @@ int mesh_gate_num(struct ieee80211_sub_if_data *sdata); /* Mesh plinks */ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, - u8 *hw_addr, struct ieee802_11_elems *ie); + u8 *hw_addr, struct ieee802_11_elems *ie, + struct ieee80211_rx_status *rx_status); bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie); u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata); void mesh_plink_broken(struct sta_info *sta); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 466922f09d04..43edcba6d67b 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1112,7 +1112,8 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, } } - if (!(mpath->flags & MESH_PATH_RESOLVING)) + if (!(mpath->flags & MESH_PATH_RESOLVING) && + mesh_path_sel_is_hwmp(sdata)) mesh_queue_preq(mpath, PREQ_Q_F_START); if (skb_queue_len(&mpath->frame_queue) >= MESH_FRAME_QUEUE_LEN) diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 9f02e54ad2a5..51b5d98f6c5d 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -93,18 +93,18 @@ static inline void mesh_plink_fsm_restart(struct sta_info *sta) static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; struct sta_info *sta; u32 erp_rates = 0, changed = 0; int i; bool short_slot = false; - if (band == IEEE80211_BAND_5GHZ) { + if (band == NL80211_BAND_5GHZ) { /* (IEEE 802.11-2012 19.4.5) */ short_slot = true; goto out; - } else if (band != IEEE80211_BAND_2GHZ) + } else if (band != NL80211_BAND_2GHZ) goto out; for (i = 0; i < sband->n_bitrates; i++) @@ -247,7 +247,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.self_prot.action_code = action; if (action != WLAN_SP_MESH_PEERING_CLOSE) { - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); /* capability info */ pos = skb_put(skb, 2); @@ -383,7 +383,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, bool insert) { struct ieee80211_local *local = sdata->local; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u32 rates, basic_rates = 0, changed = 0; enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth; @@ -489,7 +489,8 @@ __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) static struct sta_info * mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, - struct ieee802_11_elems *elems) + struct ieee802_11_elems *elems, + struct ieee80211_rx_status *rx_status) { struct sta_info *sta = NULL; @@ -497,11 +498,17 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, if (sdata->u.mesh.user_mpm || sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) { if (mesh_peer_accepts_plinks(elems) && - mesh_plink_availables(sdata)) + mesh_plink_availables(sdata)) { + int sig = 0; + + if (ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM)) + sig = rx_status->signal; + cfg80211_notify_new_peer_candidate(sdata->dev, addr, elems->ie_start, elems->total_len, - GFP_KERNEL); + sig, GFP_KERNEL); + } } else sta = __mesh_sta_info_alloc(sdata, addr); @@ -514,13 +521,15 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, * @sdata: local meshif * @addr: peer's address * @elems: IEs from beacon or mesh peering frame. + * @rx_status: rx status for the frame for signal reporting * * Return existing or newly allocated sta_info under RCU read lock. * (re)initialize with given IEs. */ static struct sta_info * mesh_sta_info_get(struct ieee80211_sub_if_data *sdata, - u8 *addr, struct ieee802_11_elems *elems) __acquires(RCU) + u8 *addr, struct ieee802_11_elems *elems, + struct ieee80211_rx_status *rx_status) __acquires(RCU) { struct sta_info *sta = NULL; @@ -531,7 +540,7 @@ mesh_sta_info_get(struct ieee80211_sub_if_data *sdata, } else { rcu_read_unlock(); /* can't run atomic */ - sta = mesh_sta_info_alloc(sdata, addr, elems); + sta = mesh_sta_info_alloc(sdata, addr, elems, rx_status); if (!sta) { rcu_read_lock(); return NULL; @@ -552,17 +561,19 @@ mesh_sta_info_get(struct ieee80211_sub_if_data *sdata, * @sdata: local meshif * @addr: peer's address * @elems: IEs from beacon or mesh peering frame + * @rx_status: rx status for the frame for signal reporting * * Initiates peering if appropriate. */ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, u8 *hw_addr, - struct ieee802_11_elems *elems) + struct ieee802_11_elems *elems, + struct ieee80211_rx_status *rx_status) { struct sta_info *sta; u32 changed = 0; - sta = mesh_sta_info_get(sdata, hw_addr, elems); + sta = mesh_sta_info_get(sdata, hw_addr, elems, rx_status); if (!sta) goto out; @@ -1044,7 +1055,8 @@ out: static void mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, - struct ieee802_11_elems *elems) + struct ieee802_11_elems *elems, + struct ieee80211_rx_status *rx_status) { struct sta_info *sta; @@ -1109,7 +1121,7 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, if (event == OPN_ACPT) { rcu_read_unlock(); /* allocate sta entry if necessary and update info */ - sta = mesh_sta_info_get(sdata, mgmt->sa, elems); + sta = mesh_sta_info_get(sdata, mgmt->sa, elems, rx_status); if (!sta) { mpl_dbg(sdata, "Mesh plink: failed to init peer!\n"); goto unlock_rcu; @@ -1175,5 +1187,5 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, return; } ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems); - mesh_process_plink_frame(sdata, mgmt, &elems); + mesh_process_plink_frame(sdata, mgmt, &elems, rx_status); } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1e6dd3de116b..bae631cf549a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -693,7 +693,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) capab = WLAN_CAPABILITY_ESS; - if (sband->band == IEEE80211_BAND_2GHZ) { + if (sband->band == NL80211_BAND_2GHZ) { capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } @@ -1133,6 +1133,11 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_sub_if_data *sdata) sdata->vif.csa_active = false; ifmgd->csa_waiting_bcn = false; + /* + * If the CSA IE is still present on the beacon after the switch, + * we need to consider it as a new CSA (possibly to self). + */ + ifmgd->beacon_crc_valid = false; ret = drv_post_channel_switch(sdata); if (ret) { @@ -1182,7 +1187,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss = ifmgd->associated; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; - enum ieee80211_band current_band; + enum nl80211_band current_band; struct ieee80211_csa_ie csa_ie; struct ieee80211_channel_switch ch_switch; int res; @@ -1339,11 +1344,11 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, default: WARN_ON_ONCE(1); /* fall through */ - case IEEE80211_BAND_2GHZ: + case NL80211_BAND_2GHZ: case IEEE80211_BAND_60GHZ: chan_increment = 1; break; - case IEEE80211_BAND_5GHZ: + case NL80211_BAND_5GHZ: chan_increment = 4; break; } @@ -1955,7 +1960,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); - if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_5GHZ) + if (ieee80211_get_sdata_band(sdata) == NL80211_BAND_5GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { @@ -2283,7 +2288,7 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, if (!ieee80211_is_data(hdr->frame_control)) return; - if (ieee80211_is_nullfunc(hdr->frame_control) && + if (ieee80211_is_any_nullfunc(hdr->frame_control) && sdata->u.mgd.probe_send_count > 0) { if (ack) ieee80211_sta_reset_conn_monitor(sdata); @@ -4426,7 +4431,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, if (new_sta) { u32 rates = 0, basic_rates = 0; - bool have_higher_than_11mbit; + bool have_higher_than_11mbit = false; int min_rate = INT_MAX, min_rate_index = -1; struct ieee80211_chanctx_conf *chanctx_conf; const struct cfg80211_bss_ies *ies; @@ -4468,7 +4473,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.basic_rates = basic_rates; /* cf. IEEE 802.11 9.2.12 */ - if (cbss->channel->band == IEEE80211_BAND_2GHZ && + if (cbss->channel->band == NL80211_BAND_2GHZ && have_higher_than_11mbit) sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; else diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index b6be51940ead..af489405d5b3 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -308,11 +308,10 @@ void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free) /* was never transmitted */ if (roc->frame) { - cfg80211_mgmt_tx_status(&roc->sdata->wdev, - (unsigned long)roc->frame, + cfg80211_mgmt_tx_status(&roc->sdata->wdev, roc->mgmt_tx_cookie, roc->frame->data, roc->frame->len, false, GFP_KERNEL); - kfree_skb(roc->frame); + ieee80211_free_txskb(&roc->sdata->local->hw, roc->frame); } if (!roc->mgmt_tx_cookie) diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 24033c81f3d0..d9756da625f0 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -289,7 +289,7 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, u32 rate_flags = ieee80211_chandef_rate_flags(&hw->conf.chandef); - if ((sband->band == IEEE80211_BAND_2GHZ) && + if ((sband->band == NL80211_BAND_2GHZ) && (info->flags & IEEE80211_TX_CTL_NO_CCK_RATE)) rate_flags |= IEEE80211_RATE_ERP_G; @@ -890,7 +890,8 @@ int rate_control_set_rates(struct ieee80211_hw *hw, if (old) kfree_rcu(old, rcu_head); - drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); + if (sta->uploaded) + drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); return 0; } diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index b54f398cda5d..e91386919399 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -274,7 +274,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, success = !!(info->flags & IEEE80211_TX_STAT_ACK); for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { - if (ar[i].idx < 0) + if (ar[i].idx < 0 || !ar[i].count) break; ndx = rix_to_ndx(mi, ar[i].idx); @@ -287,12 +287,6 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, mi->r[ndx].stats.success += success; } - if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0)) - mi->sample_packets++; - - if (mi->sample_deferred > 0) - mi->sample_deferred--; - if (time_after(jiffies, mi->last_stats_update + (mp->update_interval * HZ) / 1000)) minstrel_update_stats(mp, mi); @@ -366,7 +360,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, #endif delta = (mi->total_packets * sampling_ratio / 100) - - (mi->sample_packets + mi->sample_deferred / 2); + mi->sample_packets; /* delta < 0: no sampling required */ prev_sample = mi->prev_sample; @@ -375,7 +369,6 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, return; if (mi->total_packets >= 10000) { - mi->sample_deferred = 0; mi->sample_packets = 0; mi->total_packets = 0; } else if (delta > mi->n_rates * 2) { @@ -400,19 +393,8 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * rate sampling method should be used. * Respect such rates that are not sampled for 20 interations. */ - if (mrr_capable && - msr->perfect_tx_time > mr->perfect_tx_time && - msr->stats.sample_skipped < 20) { - /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark - * packets that have the sampling rate deferred to the - * second MRR stage. Increase the sample counter only - * if the deferred sample rate was actually used. - * Use the sample_deferred counter to make sure that - * the sampling is not done in large bursts */ - info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; - rate++; - mi->sample_deferred++; - } else { + if (msr->perfect_tx_time < mr->perfect_tx_time || + msr->stats.sample_skipped >= 20) { if (!msr->sample_limit) return; @@ -432,11 +414,12 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, rate->idx = mi->r[ndx].rix; rate->count = minstrel_get_retry_count(&mi->r[ndx], info); + info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; } static void -calc_rate_durations(enum ieee80211_band band, +calc_rate_durations(enum nl80211_band band, struct minstrel_rate *d, struct ieee80211_rate *rate, struct cfg80211_chan_def *chandef) @@ -579,7 +562,7 @@ minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) if (!mi) return NULL; - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { sband = hw->wiphy->bands[i]; if (sband && sband->n_bitrates > max_rates) max_rates = sband->n_bitrates; @@ -621,7 +604,7 @@ minstrel_init_cck_rates(struct minstrel_priv *mp) u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); int i, j; - sband = mp->hw->wiphy->bands[IEEE80211_BAND_2GHZ]; + sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; if (!sband) return; diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index c230bbe93262..5a9e44f4fba4 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -105,7 +105,6 @@ struct minstrel_sta_info { u8 max_prob_rate; unsigned int total_packets; unsigned int sample_packets; - int sample_deferred; unsigned int sample_row; unsigned int sample_column; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index ff3b28e7dbce..c15427895d64 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -546,7 +546,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) /* (re)Initialize group rate indexes */ for(j = 0; j < MAX_THR_RATES; j++) - tmp_group_tp_rate[j] = group; + tmp_group_tp_rate[j] = MCS_GROUP_RATES * group; for (i = 0; i < MCS_GROUP_RATES; i++) { if (!(mg->supported & BIT(i))) @@ -1073,7 +1073,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, { int i; - if (sband->band != IEEE80211_BAND_2GHZ) + if (sband->band != NL80211_BAND_2GHZ) return; if (!ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) @@ -1271,7 +1271,7 @@ minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) int max_rates = 0; int i; - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { sband = hw->wiphy->bands[i]; if (sband && sband->n_bitrates > max_rates) max_rates = sband->n_bitrates; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6676b413d049..7c05981bdee9 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -321,7 +321,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, else if (status->flag & RX_FLAG_5MHZ) channel_flags |= IEEE80211_CHAN_QUARTER; - if (status->band == IEEE80211_BAND_5GHZ) + if (status->band == NL80211_BAND_5GHZ) channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ; else if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) channel_flags |= IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ; @@ -1111,8 +1111,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (ieee80211_is_ctl(hdr->frame_control) || - ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control) || + ieee80211_is_any_nullfunc(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; @@ -1488,8 +1487,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) * Drop (qos-)data::nullfunc frames silently, since they * are used only to control station power saving mode. */ - if (ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control)) { + if (ieee80211_is_any_nullfunc(hdr->frame_control)) { I802_DEBUG_INC(rx->local->rx_handlers_drop_nullfunc); /* @@ -1741,19 +1739,34 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) return result; } +void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cache->entries); i++) + skb_queue_head_init(&cache->entries[i].skb_list); +} + +void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cache->entries); i++) + __skb_queue_purge(&cache->entries[i].skb_list); +} + static inline struct ieee80211_fragment_entry * -ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata, +ieee80211_reassemble_add(struct ieee80211_fragment_cache *cache, unsigned int frag, unsigned int seq, int rx_queue, struct sk_buff **skb) { struct ieee80211_fragment_entry *entry; - entry = &sdata->fragments[sdata->fragment_next++]; - if (sdata->fragment_next >= IEEE80211_FRAGMENT_MAX) - sdata->fragment_next = 0; + entry = &cache->entries[cache->next++]; + if (cache->next >= IEEE80211_FRAGMENT_MAX) + cache->next = 0; - if (!skb_queue_empty(&entry->skb_list)) - __skb_queue_purge(&entry->skb_list); + __skb_queue_purge(&entry->skb_list); __skb_queue_tail(&entry->skb_list, *skb); /* no need for locking */ *skb = NULL; @@ -1768,14 +1781,14 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata, } static inline struct ieee80211_fragment_entry * -ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata, +ieee80211_reassemble_find(struct ieee80211_fragment_cache *cache, unsigned int frag, unsigned int seq, int rx_queue, struct ieee80211_hdr *hdr) { struct ieee80211_fragment_entry *entry; int i, idx; - idx = sdata->fragment_next; + idx = cache->next; for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) { struct ieee80211_hdr *f_hdr; @@ -1783,7 +1796,7 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata, if (idx < 0) idx = IEEE80211_FRAGMENT_MAX - 1; - entry = &sdata->fragments[idx]; + entry = &cache->entries[idx]; if (skb_queue_empty(&entry->skb_list) || entry->seq != seq || entry->rx_queue != rx_queue || entry->last_frag + 1 != frag) @@ -1810,16 +1823,27 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata, return NULL; } +static bool requires_sequential_pn(struct ieee80211_rx_data *rx, __le16 fc) +{ + return rx->key && + (rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) && + ieee80211_has_protected(fc); +} + static ieee80211_rx_result debug_noinline ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) { + struct ieee80211_fragment_cache *cache = &rx->sdata->frags; struct ieee80211_hdr *hdr; u16 sc; __le16 fc; unsigned int frag, seq; struct ieee80211_fragment_entry *entry; struct sk_buff *skb; - struct ieee80211_rx_status *status; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); hdr = (struct ieee80211_hdr *)rx->skb->data; fc = hdr->frame_control; @@ -1830,14 +1854,15 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) sc = le16_to_cpu(hdr->seq_ctrl); frag = sc & IEEE80211_SCTL_FRAG; - if (is_multicast_ether_addr(hdr->addr1)) { - I802_DEBUG_INC(rx->local->dot11MulticastReceivedFrameCount); - goto out_no_led; - } + if (rx->sta) + cache = &rx->sta->frags; if (likely(!ieee80211_has_morefrags(fc) && frag == 0)) goto out; + if (is_multicast_ether_addr(hdr->addr1)) + return RX_DROP_MONITOR; + I802_DEBUG_INC(rx->local->rx_handlers_fragments); if (skb_linearize(rx->skb)) @@ -1853,20 +1878,17 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) if (frag == 0) { /* This is the first fragment of a new frame. */ - entry = ieee80211_reassemble_add(rx->sdata, frag, seq, + entry = ieee80211_reassemble_add(cache, frag, seq, rx->seqno_idx, &(rx->skb)); - if (rx->key && - (rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP || - rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 || - rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP || - rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) && - ieee80211_has_protected(fc)) { + if (requires_sequential_pn(rx, fc)) { int queue = rx->security_idx; /* Store CCMP/GCMP PN so that we can verify that the * next fragment has a sequential PN value. */ entry->check_sequential_pn = true; + entry->is_protected = true; + entry->key_color = rx->key->color; memcpy(entry->last_pn, rx->key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN); @@ -1878,6 +1900,11 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) sizeof(rx->key->u.gcmp.rx_pn[queue])); BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN != IEEE80211_GCMP_PN_LEN); + } else if (rx->key && + (ieee80211_has_protected(fc) || + (status->flag & RX_FLAG_DECRYPTED))) { + entry->is_protected = true; + entry->key_color = rx->key->color; } return RX_QUEUED; } @@ -1885,7 +1912,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) /* This is a fragment for a frame that should already be pending in * fragment cache. Add this fragment to the end of the pending entry. */ - entry = ieee80211_reassemble_find(rx->sdata, frag, seq, + entry = ieee80211_reassemble_find(cache, frag, seq, rx->seqno_idx, hdr); if (!entry) { I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); @@ -1900,25 +1927,39 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) if (entry->check_sequential_pn) { int i; u8 pn[IEEE80211_CCMP_PN_LEN], *rpn; - int queue; - if (!rx->key || - (rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP && - rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 && - rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP && - rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256)) + if (!requires_sequential_pn(rx, fc)) + return RX_DROP_UNUSABLE; + + /* Prevent mixed key and fragment cache attacks */ + if (entry->key_color != rx->key->color) return RX_DROP_UNUSABLE; + memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN); for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) { pn[i]++; if (pn[i]) break; } - queue = rx->security_idx; - rpn = rx->key->u.ccmp.rx_pn[queue]; + + rpn = rx->ccm_gcm.pn; if (memcmp(pn, rpn, IEEE80211_CCMP_PN_LEN)) return RX_DROP_UNUSABLE; memcpy(entry->last_pn, pn, IEEE80211_CCMP_PN_LEN); + } else if (entry->is_protected && + (!rx->key || + (!ieee80211_has_protected(fc) && + !(status->flag & RX_FLAG_DECRYPTED)) || + rx->key->color != entry->key_color)) { + /* Drop this as a mixed key or fragment cache attack, even + * if for TKIP Michael MIC should protect us, and WEP is a + * lost cause anyway. + */ + return RX_DROP_UNUSABLE; + } else if (entry->is_protected && rx->key && + entry->key_color != rx->key->color && + (status->flag & RX_FLAG_DECRYPTED)) { + return RX_DROP_UNUSABLE; } skb_pull(rx->skb, ieee80211_hdrlen(fc)); @@ -1950,7 +1991,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) out: ieee80211_led_rx(rx->local); - out_no_led: if (rx->sta) rx->sta->rx_stats.packets++; return RX_CONTINUE; @@ -1966,6 +2006,7 @@ static int ieee80211_802_1x_port_control(struct ieee80211_rx_data *rx) static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) { + struct ieee80211_hdr *hdr = (void *)rx->skb->data; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); @@ -1976,9 +2017,34 @@ static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) if (status->flag & RX_FLAG_DECRYPTED) return 0; + /* check mesh EAPOL frames first */ + if (unlikely(rx->sta && ieee80211_vif_is_mesh(&rx->sdata->vif) && + ieee80211_is_data(fc))) { + struct ieee80211s_hdr *mesh_hdr; + u16 hdr_len = ieee80211_hdrlen(fc); + u16 ethertype_offset; + __be16 ethertype; + + if (!ether_addr_equal(hdr->addr1, rx->sdata->vif.addr)) + goto drop_check; + + /* make sure fixed part of mesh header is there, also checks skb len */ + if (!pskb_may_pull(rx->skb, hdr_len + 6)) + goto drop_check; + + mesh_hdr = (struct ieee80211s_hdr *)(skb->data + hdr_len); + ethertype_offset = hdr_len + ieee80211_get_mesh_hdrlen(mesh_hdr) + + sizeof(rfc1042_header); + + if (skb_copy_bits(rx->skb, ethertype_offset, ðertype, 2) == 0 && + ethertype == rx->sdata->control_port_protocol) + return 0; + } + +drop_check: /* Drop unencrypted frames if key is set. */ if (unlikely(!ieee80211_has_protected(fc) && - !ieee80211_is_nullfunc(fc) && + !ieee80211_is_any_nullfunc(fc) && ieee80211_is_data(fc) && rx->key)) return -EACCES; @@ -2081,13 +2147,13 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc) struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; /* - * Allow EAPOL frames to us/the PAE group address regardless - * of whether the frame was encrypted or not. + * Allow EAPOL frames to us/the PAE group address regardless of + * whether the frame was encrypted or not, and always disallow + * all other destination addresses for them. */ - if (ehdr->h_proto == rx->sdata->control_port_protocol && - (ether_addr_equal(ehdr->h_dest, rx->sdata->vif.addr) || - ether_addr_equal(ehdr->h_dest, pae_group_addr))) - return true; + if (unlikely(ehdr->h_proto == rx->sdata->control_port_protocol)) + return ether_addr_equal(ehdr->h_dest, rx->sdata->vif.addr) || + ether_addr_equal(ehdr->h_dest, pae_group_addr); if (ieee80211_802_1x_port_control(rx) || ieee80211_drop_unencrypted(rx, fc)) @@ -2116,6 +2182,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) if ((sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && + ehdr->h_proto != rx->sdata->control_port_protocol && (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) { if (is_multicast_ether_addr(ehdr->h_dest)) { /* @@ -2168,9 +2235,30 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) #endif if (skb) { + struct ethhdr *ehdr = (struct ethhdr *)skb->data; + /* deliver to local stack */ skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); + + /* + * 802.1X over 802.11 requires that the authenticator address + * be used for EAPOL frames. However, 802.1X allows the use of + * the PAE group address instead. If the interface is part of + * a bridge and we pass the frame with the PAE group address, + * then the bridge will forward it to the network (even if the + * client was not associated yet), which isn't supposed to + * happen. + * To avoid that, rewrite the destination address to our own + * address, so that the authenticator (e.g. hostapd) will see + * the frame, but bridge won't forward it anywhere else. Note + * that due to earlier filtering, the only other address can + * be the PAE group address. + */ + if (unlikely(skb->protocol == sdata->control_port_protocol && + !ether_addr_equal(ehdr->h_dest, sdata->vif.addr))) + ether_addr_copy(ehdr->h_dest, sdata->vif.addr); + if (rx->napi) napi_gro_receive(rx->napi, skb); else @@ -2234,6 +2322,23 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) if (skb_linearize(skb)) return RX_DROP_UNUSABLE; + if (rx->key) { + /* + * We should not receive A-MSDUs on pre-HT connections, + * and HT connections cannot use old ciphers. Thus drop + * them, as in those cases we couldn't even have SPP + * A-MSDUs or such. + */ + switch (rx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + case WLAN_CIPHER_SUITE_TKIP: + return RX_DROP_UNUSABLE; + default: + break; + } + } + ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr, rx->sdata->vif.type, rx->local->hw.extra_tx_headroom, true); @@ -2817,7 +2922,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) switch (mgmt->u.action.u.measurement.action_code) { case WLAN_ACTION_SPCT_MSR_REQ: - if (status->band != IEEE80211_BAND_5GHZ) + if (status->band != NL80211_BAND_5GHZ) break; if (len < (IEEE80211_MIN_ACTION_SIZE + @@ -3041,9 +3146,18 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP): /* process for all: mesh, mlme, ibss */ break; + case cpu_to_le16(IEEE80211_STYPE_DEAUTH): + if (is_multicast_ether_addr(mgmt->da) && + !is_broadcast_ether_addr(mgmt->da)) + return RX_DROP_MONITOR; + + /* process only for station/IBSS */ + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_ADHOC) + return RX_DROP_MONITOR; + break; case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP): - case cpu_to_le16(IEEE80211_STYPE_DEAUTH): case cpu_to_le16(IEEE80211_STYPE_DISASSOC): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) @@ -3334,7 +3448,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) if (!bssid) return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || - ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) + ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2) || + !is_valid_ether_addr(hdr->addr2)) return false; if (ieee80211_is_beacon(hdr->frame_control)) return true; @@ -3623,7 +3738,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, WARN_ON_ONCE(softirq_count() == 0); - if (WARN_ON(status->band >= IEEE80211_NUM_BANDS)) + if (WARN_ON(status->band >= NUM_NL80211_BANDS)) goto drop; sband = local->hw.wiphy->bands[status->band]; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index acbe182b75d1..efc28f6b25c3 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -270,7 +270,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) n_chans = req->n_channels; } else { do { - if (local->hw_scan_band == IEEE80211_NUM_BANDS) + if (local->hw_scan_band == NUM_NL80211_BANDS) return false; n_chans = 0; @@ -482,7 +482,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, int i; struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; - enum ieee80211_band band = local->hw.conf.chandef.chan->band; + enum nl80211_band band = local->hw.conf.chandef.chan->band; u32 tx_flags; scan_req = rcu_dereference_protected(local->scan_req, @@ -949,7 +949,7 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; int ret = -EBUSY, i, n_ch = 0; - enum ieee80211_band band; + enum nl80211_band band; mutex_lock(&local->mtx); @@ -961,7 +961,7 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, if (!channels) { int max_n; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band]) continue; @@ -1081,7 +1081,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; int ret, i, iebufsz, num_bands = 0; - u32 rate_masks[IEEE80211_NUM_BANDS] = {}; + u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; size_t len; @@ -1093,7 +1093,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, if (!local->ops->sched_scan_start) return -ENOTSUPP; - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { if (local->hw.wiphy->bands[i]) { bands_used |= BIT(i); rate_masks[i] = (u32) -1; diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 06e6ac8cc693..2ddc661f0988 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -23,11 +23,11 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, - enum ieee80211_band current_band, + enum nl80211_band current_band, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie) { - enum ieee80211_band new_band; + enum nl80211_band new_band; int new_freq; u8 new_chan_no; struct ieee80211_channel *new_chan; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index a450be6812b8..5f2c8aeb9bd3 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2,6 +2,7 @@ * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2018-2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -242,6 +243,24 @@ struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, */ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) { + /* + * If we had used sta_info_pre_move_state() then we might not + * have gone through the state transitions down again, so do + * it here now (and warn if it's inserted). + * + * This will clear state such as fast TX/RX that may have been + * allocated during state transitions. + */ + while (sta->sta_state > IEEE80211_STA_NONE) { + int ret; + + WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); + + ret = sta_info_move_state(sta, sta->sta_state - 1); + if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret)) + break; + } + if (sta->rate_ctrl) rate_control_free_sta(sta); @@ -337,6 +356,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->sdata = sdata; sta->rx_stats.last_rx = jiffies; + ieee80211_init_frag_cache(&sta->frags); + sta->sta_state = IEEE80211_STA_NONE; /* Mark TID as unreserved */ @@ -439,6 +460,19 @@ static int sta_info_insert_check(struct sta_info *sta) is_multicast_ether_addr(sta->sta.addr))) return -EINVAL; + /* Strictly speaking this isn't necessary as we hold the mutex, but + * the rhashtable code can't really deal with that distinction. We + * do require the mutex for correctness though. + */ + rcu_read_lock(); + lockdep_assert_held(&sdata->local->sta_mtx); + if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) && + ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) { + rcu_read_unlock(); + return -ENOTUNIQ; + } + rcu_read_unlock(); + return 0; } @@ -552,9 +586,10 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) out_drop_sta: local->num_sta--; synchronize_net(); - __cleanup_single_sta(sta); + cleanup_single_sta(sta); out_err: mutex_unlock(&local->sta_mtx); + kfree(sinfo); rcu_read_lock(); return err; } @@ -566,22 +601,17 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) might_sleep(); + mutex_lock(&local->sta_mtx); + err = sta_info_insert_check(sta); if (err) { + sta_info_free(local, sta); + mutex_unlock(&local->sta_mtx); rcu_read_lock(); - goto out_free; + return err; } - mutex_lock(&local->sta_mtx); - - err = sta_info_insert_finish(sta); - if (err) - goto out_free; - - return 0; - out_free: - sta_info_free(local, sta); - return err; + return sta_info_insert_finish(sta); } int sta_info_insert(struct sta_info *sta) @@ -905,6 +935,11 @@ static void __sta_info_destroy_part2(struct sta_info *sta) might_sleep(); lockdep_assert_held(&local->sta_mtx); + if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { + ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); + WARN_ON_ONCE(ret); + } + /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); @@ -942,6 +977,8 @@ static void __sta_info_destroy_part2(struct sta_info *sta) ieee80211_sta_debugfs_remove(sta); ieee80211_recalc_min_chandef(sdata); + ieee80211_destroy_frag_cache(&sta->frags); + cleanup_single_sta(sta); } @@ -1776,6 +1813,10 @@ int sta_info_move_state(struct sta_info *sta, set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_check_fast_xmit(sta); } + if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || + sta->sdata->vif.type == NL80211_IFTYPE_AP) + cfg80211_send_layer2_update(sta->sdata->dev, + sta->sta.addr); break; default: break; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 15b0150283b6..b2e5928b1f7b 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -324,6 +324,34 @@ struct mesh_sta { DECLARE_EWMA(signal, 1024, 8) +/* + * IEEE 802.11-2016 (10.6 "Defragmentation") recommends support for "concurrent + * reception of at least one MSDU per access category per associated STA" + * on APs, or "at least one MSDU per access category" on other interface types. + * + * This limit can be increased by changing this define, at the cost of slower + * frame reassembly and increased memory use while fragments are pending. + */ +#define IEEE80211_FRAGMENT_MAX 4 + +struct ieee80211_fragment_entry { + struct sk_buff_head skb_list; + unsigned long first_frag_time; + u16 seq; + u16 extra_len; + u16 last_frag; + u8 rx_queue; + u8 check_sequential_pn:1, /* needed for CCMP/GCMP */ + is_protected:1; + u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ + unsigned int key_color; +}; + +struct ieee80211_fragment_cache { + struct ieee80211_fragment_entry entries[IEEE80211_FRAGMENT_MAX]; + unsigned int next; +}; + /** * struct sta_info - STA information * @@ -384,6 +412,7 @@ DECLARE_EWMA(signal, 1024, 8) * @tx_stats: TX statistics * @rx_stats: RX statistics * @status_stats: TX status statistics + * @frags: fragment cache */ struct sta_info { /* General information, mostly static */ @@ -493,6 +522,8 @@ struct sta_info { struct cfg80211_chan_def tdls_chandef; + struct ieee80211_fragment_cache frags; + /* keep last! */ struct ieee80211_sta sta; }; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index d221300e59e5..618479e0d648 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -474,8 +474,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (sdata) { - if (ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control)) + if (ieee80211_is_any_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, GFP_ATOMIC); @@ -905,7 +904,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) I802_DEBUG_INC(local->dot11FailedCount); } - if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) && + if (ieee80211_is_any_nullfunc(fc) && ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index ce2ece424384..9eeacf4db494 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -4,7 +4,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015 - 2016 Intel Deutschland GmbH * * This file is GPLv2 as found in COPYING. */ @@ -15,6 +15,7 @@ #include <linux/rtnetlink.h> #include "ieee80211_i.h" #include "driver-ops.h" +#include "rate.h" /* give usermode some time for retries in setting up the TDLS session */ #define TDLS_PEER_SETUP_TIMEOUT (15 * HZ) @@ -46,7 +47,7 @@ static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata, NL80211_FEATURE_TDLS_CHANNEL_SWITCH; bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && !ifmgd->tdls_wider_bw_prohibited; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; bool vht = sband && sband->vht_cap.vht_supported; u8 *pos = (void *)skb_put(skb, 10); @@ -183,7 +184,7 @@ static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata, if (status_code != 0) return 0; - if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_2GHZ) { + if (ieee80211_get_sdata_band(sdata) == NL80211_BAND_2GHZ) { return WLAN_CAPABILITY_SHORT_SLOT_TIME | WLAN_CAPABILITY_SHORT_PREAMBLE; } @@ -302,7 +303,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, /* IEEE802.11ac-2013 Table E-4 */ u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 }; struct cfg80211_chan_def uc = sta->tdls_chandef; - enum nl80211_chan_width max_width = ieee80211_get_sta_bw(&sta->sta); + enum nl80211_chan_width max_width = ieee80211_sta_cap_chan_bw(sta); int i; /* only support upgrading non-narrow channels up to 80Mhz */ @@ -313,7 +314,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, if (max_width > NL80211_CHAN_WIDTH_80) max_width = NL80211_CHAN_WIDTH_80; - if (uc.width == max_width) + if (uc.width >= max_width) return; /* * Channel usage constrains in the IEEE802.11ac-2013 specification only @@ -324,6 +325,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, for (i = 0; i < ARRAY_SIZE(centers_80mhz); i++) if (abs(uc.chan->center_freq - centers_80mhz[i]) <= 30) { uc.center_freq1 = centers_80mhz[i]; + uc.center_freq2 = 0; uc.width = NL80211_CHAN_WIDTH_80; break; } @@ -332,7 +334,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, return; /* proceed to downgrade the chandef until usable or the same */ - while (uc.width > max_width && + while (uc.width > max_width || !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, sdata->wdev.iftype)) ieee80211_chandef_downgrade(&uc); @@ -355,7 +357,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, u8 action_code, bool initiator, const u8 *extra_ies, size_t extra_ies_len) { - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct ieee80211_sta_ht_cap ht_cap; @@ -542,7 +544,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t offset = 0, noffset; struct sta_info *sta, *ap_sta; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); u8 *pos; mutex_lock(&local->sta_mtx); @@ -609,7 +611,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); /* only include VHT-operation if not on the 2.4GHz band */ - if (band != IEEE80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) { + if (band != NL80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) { /* * if both peers support WIDER_BW, we can expand the chandef to * a wider compatible one, up to 80MHz @@ -1242,18 +1244,44 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, return ret; } -static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata) +static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; + enum nl80211_chan_width width; + struct ieee80211_supported_band *sband; mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (conf) { + width = conf->def.width; + sband = local->hw.wiphy->bands[conf->def.chan->band]; ctx = container_of(conf, struct ieee80211_chanctx, conf); ieee80211_recalc_chanctx_chantype(local, ctx); + + /* if width changed and a peer is given, update its BW */ + if (width != conf->def.width && sta && + test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) { + enum ieee80211_sta_rx_bandwidth bw; + + bw = ieee80211_chan_width_to_rx_bw(conf->def.width); + bw = min(bw, ieee80211_sta_cap_rx_bw(sta)); + if (bw != sta->sta.bandwidth) { + sta->sta.bandwidth = bw; + rate_control_rate_update(local, sband, sta, + IEEE80211_RC_BW_CHANGED); + /* + * if a TDLS peer BW was updated, we need to + * recalc the chandef width again, to get the + * correct chanctx min_def + */ + ieee80211_recalc_chanctx_chantype(local, ctx); + } + } + } mutex_unlock(&local->chanctx_mtx); } @@ -1350,8 +1378,6 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, break; } - iee80211_tdls_recalc_chanctx(sdata); - mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); if (!sta) { @@ -1360,6 +1386,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, break; } + iee80211_tdls_recalc_chanctx(sdata, sta); iee80211_tdls_recalc_ht_protection(sdata, sta); set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); @@ -1390,7 +1417,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, iee80211_tdls_recalc_ht_protection(sdata, NULL); mutex_unlock(&local->sta_mtx); - iee80211_tdls_recalc_chanctx(sdata); + iee80211_tdls_recalc_chanctx(sdata, NULL); break; default: ret = -ENOTSUPP; @@ -1746,7 +1773,7 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, u8 target_channel, oper_class; bool local_initiator; struct sta_info *sta; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_tdls_data *tf = (void *)skb->data; struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); int baselen = offsetof(typeof(*tf), u.chan_switch_req.variable); @@ -1778,10 +1805,10 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, if ((oper_class == 112 || oper_class == 2 || oper_class == 3 || oper_class == 4 || oper_class == 5 || oper_class == 6) && target_channel < 14) - band = IEEE80211_BAND_5GHZ; + band = NL80211_BAND_5GHZ; else - band = target_channel < 14 ? IEEE80211_BAND_2GHZ : - IEEE80211_BAND_5GHZ; + band = target_channel < 14 ? NL80211_BAND_2GHZ : + NL80211_BAND_5GHZ; freq = ieee80211_channel_to_frequency(target_channel, band); if (freq == 0) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 913e959b03cf..8000864ddfc6 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -396,7 +396,7 @@ TRACE_EVENT(drv_bss_info_changed, __field(u32, sync_device_ts) __field(u8, sync_dtim_count) __field(u32, basic_rates) - __array(int, mcast_rate, IEEE80211_NUM_BANDS) + __array(int, mcast_rate, NUM_NL80211_BANDS) __field(u16, ht_operation_mode) __field(s32, cqm_rssi_thold); __field(s32, cqm_rssi_hyst); @@ -1253,8 +1253,8 @@ TRACE_EVENT(drv_set_bitrate_mask, TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; - __entry->legacy_2g = mask->control[IEEE80211_BAND_2GHZ].legacy; - __entry->legacy_5g = mask->control[IEEE80211_BAND_5GHZ].legacy; + __entry->legacy_2g = mask->control[NL80211_BAND_2GHZ].legacy; + __entry->legacy_5g = mask->control[NL80211_BAND_5GHZ].legacy; ), TP_printk( diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1e43c597db17..89eb87474fdf 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -150,7 +150,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, rate = DIV_ROUND_UP(r->bitrate, 1 << shift); switch (sband->band) { - case IEEE80211_BAND_2GHZ: { + case NL80211_BAND_2GHZ: { u32 flag; if (tx->sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) flag = IEEE80211_RATE_MANDATORY_G; @@ -160,13 +160,13 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, mrate = r->bitrate; break; } - case IEEE80211_BAND_5GHZ: + case NL80211_BAND_5GHZ: if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; case IEEE80211_BAND_60GHZ: /* TODO, for now fall through */ - case IEEE80211_NUM_BANDS: + case NUM_NL80211_BANDS: WARN_ON(1); break; } @@ -291,7 +291,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) if (unlikely(test_bit(SCAN_SW_SCANNING, &tx->local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &tx->sdata->state) && !ieee80211_is_probe_req(hdr->frame_control) && - !ieee80211_is_nullfunc(hdr->frame_control)) + !ieee80211_is_any_nullfunc(hdr->frame_control)) /* * When software scanning only nullfunc frames (to notify * the sleep state to the AP) and probe requests (for the @@ -1600,19 +1600,24 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, /* device xmit handlers */ +enum ieee80211_encrypt { + ENCRYPT_NO, + ENCRYPT_MGMT, + ENCRYPT_DATA, +}; + static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, - int head_need, bool may_encrypt) + int head_need, + enum ieee80211_encrypt encrypt) { struct ieee80211_local *local = sdata->local; - struct ieee80211_hdr *hdr; bool enc_tailroom; int tail_need = 0; - hdr = (struct ieee80211_hdr *) skb->data; - enc_tailroom = may_encrypt && - (sdata->crypto_tx_tailroom_needed_cnt || - ieee80211_is_mgmt(hdr->frame_control)); + enc_tailroom = encrypt == ENCRYPT_MGMT || + (encrypt == ENCRYPT_DATA && + sdata->crypto_tx_tailroom_needed_cnt); if (enc_tailroom) { tail_need = IEEE80211_ENCRYPT_TAILROOM; @@ -1645,21 +1650,27 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; int headroom; - bool may_encrypt; + enum ieee80211_encrypt encrypt; - may_encrypt = !(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT); + if (info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT) + encrypt = ENCRYPT_NO; + else if (ieee80211_is_mgmt(hdr->frame_control)) + encrypt = ENCRYPT_MGMT; + else + encrypt = ENCRYPT_DATA; headroom = local->tx_headroom; - if (may_encrypt) + if (encrypt != ENCRYPT_NO) headroom += sdata->encrypt_headroom; headroom -= skb_headroom(skb); headroom = max_t(int, 0, headroom); - if (ieee80211_skb_resize(sdata, skb, headroom, may_encrypt)) { + if (ieee80211_skb_resize(sdata, skb, headroom, encrypt)) { ieee80211_free_txskb(&local->hw, skb); return; } + /* reload after potential resize */ hdr = (struct ieee80211_hdr *) skb->data; info->control.vif = &sdata->vif; @@ -2031,7 +2042,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, u16 info_id = 0; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_sub_if_data *ap_sdata; - enum ieee80211_band band; + enum nl80211_band band; int ret; if (IS_ERR(sta)) @@ -2352,7 +2363,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, head_need += sdata->encrypt_headroom; head_need += local->tx_headroom; head_need = max_t(int, 0, head_need); - if (ieee80211_skb_resize(sdata, skb, head_need, true)) { + if (ieee80211_skb_resize(sdata, skb, head_need, ENCRYPT_DATA)) { ieee80211_free_txskb(&local->hw, skb); skb = NULL; return ERR_PTR(-ENOMEM); @@ -2762,7 +2773,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (unlikely(ieee80211_skb_resize(sdata, skb, max_t(int, extra_head + hw_headroom - skb_headroom(skb), 0), - false))) { + ENCRYPT_NO))) { kfree_skb(skb); return true; } @@ -3336,7 +3347,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, struct sk_buff *skb = NULL; struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata = NULL; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_tx_rate_control txrc; struct ieee80211_chanctx_conf *chanctx_conf; int csa_off_base = 0; @@ -3904,7 +3915,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum ieee80211_band band) + enum nl80211_band band) { int ac = ieee802_1d_to_ac[tid & 7]; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index c8cc9bd7cac1..391de2fa08a8 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -59,7 +59,7 @@ void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) } } -int ieee80211_frame_duration(enum ieee80211_band band, size_t len, +int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble, int shift) { @@ -77,7 +77,7 @@ int ieee80211_frame_duration(enum ieee80211_band band, size_t len, * is assumed to be 0 otherwise. */ - if (band == IEEE80211_BAND_5GHZ || erp) { + if (band == NL80211_BAND_5GHZ || erp) { /* * OFDM: * @@ -129,7 +129,7 @@ int ieee80211_frame_duration(enum ieee80211_band band, size_t len, /* Exported duration function for driver use */ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - enum ieee80211_band band, + enum nl80211_band band, size_t frame_len, struct ieee80211_rate *rate) { @@ -939,16 +939,22 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elem_parse_failed = true; break; case WLAN_EID_VHT_OPERATION: - if (elen >= sizeof(struct ieee80211_vht_operation)) + if (elen >= sizeof(struct ieee80211_vht_operation)) { elems->vht_operation = (void *)pos; - else - elem_parse_failed = true; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = true; break; case WLAN_EID_OPMODE_NOTIF: - if (elen > 0) + if (elen > 0) { elems->opmode_notif = pos; - else - elem_parse_failed = true; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = true; break; case WLAN_EID_MESH_ID: elems->mesh_id = pos; @@ -1126,7 +1132,7 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); use_11b = (chanctx_conf && - chanctx_conf->def.chan->band == IEEE80211_BAND_2GHZ) && + chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) && !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE); rcu_read_unlock(); @@ -1298,7 +1304,7 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, u8 *buffer, size_t buffer_len, const u8 *ie, size_t ie_len, - enum ieee80211_band band, + enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, size_t *offset) @@ -1372,7 +1378,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, pos += ext_rates_len; } - if (chandef->chan && sband->band == IEEE80211_BAND_2GHZ) { + if (chandef->chan && sband->band == NL80211_BAND_2GHZ) { if (end - pos < 3) goto out_err; *pos++ = WLAN_EID_DS_PARAMS; @@ -1476,7 +1482,7 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, memset(ie_desc, 0, sizeof(*ie_desc)); - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { pos += ieee80211_build_preq_ies_band(local, buffer + pos, @@ -1519,7 +1525,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb; struct ieee80211_mgmt *mgmt; int ies_len; - u32 rate_masks[IEEE80211_NUM_BANDS] = {}; + u32 rate_masks[NUM_NL80211_BANDS] = {}; struct ieee80211_scan_ies dummy_ie_desc; /* @@ -1579,7 +1585,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, - enum ieee80211_band band, u32 *basic_rates) + enum nl80211_band band, u32 *basic_rates) { struct ieee80211_supported_band *sband; size_t num_rates; @@ -2472,7 +2478,7 @@ int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, - enum ieee80211_band band) + enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; @@ -2517,7 +2523,7 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, - enum ieee80211_band band) + enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index c38b2f07a919..47f708cd3e5b 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -299,7 +299,30 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta) return IEEE80211_STA_RX_BW_80; } -static enum ieee80211_sta_rx_bandwidth +enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta) +{ + struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; + u32 cap_width; + + if (!vht_cap->vht_supported) { + if (!sta->sta.ht_cap.ht_supported) + return NL80211_CHAN_WIDTH_20_NOHT; + + return sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? + NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20; + } + + cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; + + if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) + return NL80211_CHAN_WIDTH_160; + else if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) + return NL80211_CHAN_WIDTH_80P80; + + return NL80211_CHAN_WIDTH_80; +} + +enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) { switch (width) { @@ -327,10 +350,7 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) bw = ieee80211_sta_cap_rx_bw(sta); bw = min(bw, sta->cur_max_bandwidth); - - /* do not cap the BW of TDLS WIDER_BW peers by the bss */ - if (!test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) - bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); + bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); return bw; } @@ -378,7 +398,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta) u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band) + enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; @@ -427,7 +447,7 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band) + enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index cb439e06919f..3884bb1a59dd 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -161,8 +161,8 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) update_iv: /* update IV in key information to be able to detect replays */ - rx->key->u.tkip.rx[rx->security_idx].iv32 = rx->tkip_iv32; - rx->key->u.tkip.rx[rx->security_idx].iv16 = rx->tkip_iv16; + rx->key->u.tkip.rx[rx->security_idx].iv32 = rx->tkip.iv32; + rx->key->u.tkip.rx[rx->security_idx].iv16 = rx->tkip.iv16; return RX_CONTINUE; @@ -292,8 +292,8 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) key, skb->data + hdrlen, skb->len - hdrlen, rx->sta->sta.addr, hdr->addr1, hwaccel, rx->security_idx, - &rx->tkip_iv32, - &rx->tkip_iv16); + &rx->tkip.iv32, + &rx->tkip.iv16); if (res != TKIP_DECRYPT_OK) return RX_DROP_UNUSABLE; @@ -519,6 +519,9 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; @@ -553,6 +556,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, } memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN); + if (unlikely(ieee80211_is_frag(hdr))) + memcpy(rx->ccm_gcm.pn, pn, IEEE80211_CCMP_PN_LEN); } /* Remove CCMP header and MIC */ @@ -749,6 +754,9 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; @@ -784,6 +792,8 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) } memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN); + if (unlikely(ieee80211_is_frag(hdr))) + memcpy(rx->ccm_gcm.pn, pn, IEEE80211_CCMP_PN_LEN); } /* Remove GCMP header and MIC */ diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index a13d02b7cee4..55ed8a97b33f 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -158,7 +158,7 @@ err_tfm0: crypto_free_blkcipher(key->tfm0); err_tfm: for (i = 0; i < ARRAY_SIZE(key->tfm); i++) - if (key->tfm[i]) + if (!IS_ERR_OR_NULL(key->tfm[i])) crypto_free_aead(key->tfm[i]); kzfree(key); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index c2ce7dec5198..50d9138b2a1c 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -470,16 +470,15 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net, struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; - int err; if (!ipv6_stub) return ERR_PTR(-EAFNOSUPPORT); memset(&fl6, 0, sizeof(fl6)); memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); - err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6); - if (err) - return ERR_PTR(err); + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + return ERR_CAST(dst); dev = dst->dev; dev_hold(dev); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f83c255d7da2..c577907e3715 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -71,7 +71,7 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' depends on NETWORK_SECMARK - default m if NETFILTER_ADVANCED=n + default y if NETFILTER_ADVANCED=n help This option enables security markings to be applied to connections. Typically they are copied to connections from @@ -1327,22 +1327,6 @@ config NETFILTER_XT_MATCH_PKTTYPE To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_MATCH_QTAGUID - bool '"quota, tag, owner" match and stats support' - depends on NETFILTER_XT_MATCH_SOCKET - depends on NETFILTER_XT_MATCH_OWNER=n - help - This option replaces the `owner' match. In addition to matching - on uid, it keeps stats based on a tag assigned to a socket. - The full tag is comprised of a UID and an accounting tag. - The tags are assignable to sockets from user space (e.g. a download - manager can assign the socket to another UID for accounting). - Stats and control are done via /proc/net/xt_qtaguid/. - It replaces owner as it takes the same arguments, but should - really be recognized by the iptables tool. - - If unsure, say `N'. - config NETFILTER_XT_MATCH_QUOTA tristate '"quota" match support' depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index fcdc061130d7..6c8d78f4f248 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -157,7 +157,6 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o -obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid_print.o xt_qtaguid.o obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index b0bc475f641e..7d08a170ac27 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -66,9 +66,9 @@ mtype_destroy(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); - ip_set_free(map->members); if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); + ip_set_free(map->members); ip_set_free(map); set->data = NULL; @@ -81,7 +81,7 @@ mtype_flush(struct ip_set *set) if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); - memset(map->members, 0, map->memsize); + bitmap_zero(map->members, map->elements); } static int diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 4783efff0bde..a4c104a4977f 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -40,7 +40,7 @@ MODULE_ALIAS("ip_set_bitmap:ip"); /* Type structure */ struct bitmap_ip { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -222,7 +222,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, u32 first_ip, u32 last_ip, u32 elements, u32 hosts, u8 netmask) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -315,7 +315,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ip; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 29dde208381d..0e961690510d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -46,7 +46,7 @@ enum { /* Type structure */ struct bitmap_ipmac { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -297,7 +297,7 @@ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, u32 first_ip, u32 last_ip, u32 elements) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -361,7 +361,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 7f0c733358a4..6771b362a123 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -34,7 +34,7 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -207,7 +207,7 @@ static bool init_map_port(struct ip_set *set, struct bitmap_port *map, u16 first_port, u16 last_port) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(map->elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_port = first_port; @@ -250,7 +250,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], return -ENOMEM; map->elements = elements; - map->memsize = bitmap_bytes(0, map->elements); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_port; if (!init_map_port(set, map, first_port, last_port)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index caa26184f7e3..3231030a73ed 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -379,6 +379,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; + if (align < ip_set_extensions[id].align) + align = ip_set_extensions[id].align; len = ALIGN(len, ip_set_extensions[id].align); set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; @@ -1619,6 +1621,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; + u32 lineno; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME] || @@ -1635,7 +1638,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); - ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); + ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0); rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index e5336ab36d67..7b69d1ad8f3e 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -102,31 +102,17 @@ htable_size(u8 hbits) { size_t hsize; - /* We must fit both into u32 in jhash and size_t */ + /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */ if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) + if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } -/* Compute htable_bits from the user input parameter hashsize */ -static u8 -htable_bits(u32 hashsize) -{ - /* Assume that hashsize == 2^htable_bits */ - u8 bits = fls(hashsize - 1); - - if (jhash_size(bits) != hashsize) - /* Round up to the first 2^n value */ - bits = fls(hashsize); - - return bits; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -1309,7 +1295,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, get_random_bytes(&h->initval, sizeof(h->initval)); set->timeout = IPSET_NO_TIMEOUT; - hbits = htable_bits(hashsize); + /* Compute htable_bits from the user input parameter hashsize. + * Assume that hashsize == 2^htable_bits, + * otherwise round up to the first 2^n value. + */ + hbits = fls(hashsize - 1); hsize = htable_size(hbits); if (hsize == 0) { kfree(h); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index bbede95c9f68..085711b35a99 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -60,7 +60,7 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, /* Don't lookup sub-counters at all */ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) - opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; + opt->cmdflags |= IPSET_FLAG_SKIP_COUNTER_UPDATE; list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 85ca189bdc3d..de196dd95dcd 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1368,6 +1368,10 @@ int __init ip_vs_conn_init(void) int idx; /* Compute size and mask */ + if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { + pr_info("conn_tab_bits not in [8, 20]. Using default value\n"); + ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; + } ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 79f4ffe7291a..7065410b13ad 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1226,7 +1226,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); svc->port = u->port; svc->fwmark = u->fwmark; - svc->flags = u->flags; + svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; svc->ipvs = ipvs; @@ -2383,6 +2383,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Set timeout values for (tcp tcpfin udp) */ ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); goto out_unlock; + } else if (!len) { + /* No more commands with len == 0 below */ + ret = -EINVAL; + goto out_unlock; } usvc_compat = (struct ip_vs_service_user *)arg; @@ -2459,9 +2463,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) break; case IP_VS_SO_SET_DELDEST: ret = ip_vs_del_dest(svc, &udest); - break; - default: - ret = -EINVAL; } out_unlock: @@ -3921,6 +3922,11 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; +#ifdef CONFIG_IP_VS_DEBUG + /* Global sysctls must be ro in non-init netns */ + if (!net_eq(net, &init_net)) + tbl[idx++].mode = 0444; +#endif ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d637d151d0c7..d32a3fd4528e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -898,9 +898,9 @@ __nf_conntrack_alloc(struct net *net, /* Don't set timer yet: wait for confirmation */ setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); write_pnet(&ct->ct_net, net); - memset(&ct->__nfct_init_offset[0], 0, + memset(&ct->__nfct_init_offset, 0, offsetof(struct nf_conn, proto) - - offsetof(struct nf_conn, __nfct_init_offset[0])); + offsetof(struct nf_conn, __nfct_init_offset)); if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0) goto out_free; @@ -1600,7 +1600,7 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) } EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); -int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) { int i, bucket, rc; unsigned int hashsize, old_size; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 9511af04dc81..15495b956855 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1225,6 +1225,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = { { .name = "Q.931", .me = THIS_MODULE, + .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET6, .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), .tuple.dst.protonum = IPPROTO_TCP, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3a24c01cb909..b349b8410ec8 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1022,6 +1022,8 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (!tb[CTA_TUPLE_IP]) return -EINVAL; + if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6) + return -EOPNOTSUPP; tuple->src.l3num = l3num; err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple); @@ -3390,6 +3392,9 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) list_for_each_entry(net, net_exit_list, exit_list) ctnetlink_net_exit(net); + + /* wait for other cpus until they are done with ctnl_notifiers */ + synchronize_rcu(); } static struct pernet_operations ctnetlink_net_ops = { diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 5588c7ae1ac2..9f9f92d637ad 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -71,24 +71,32 @@ EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) /* PptpControlMessageType names */ -const char *const pptp_msg_name[] = { - "UNKNOWN_MESSAGE", - "START_SESSION_REQUEST", - "START_SESSION_REPLY", - "STOP_SESSION_REQUEST", - "STOP_SESSION_REPLY", - "ECHO_REQUEST", - "ECHO_REPLY", - "OUT_CALL_REQUEST", - "OUT_CALL_REPLY", - "IN_CALL_REQUEST", - "IN_CALL_REPLY", - "IN_CALL_CONNECT", - "CALL_CLEAR_REQUEST", - "CALL_DISCONNECT_NOTIFY", - "WAN_ERROR_NOTIFY", - "SET_LINK_INFO" +static const char *const pptp_msg_name_array[PPTP_MSG_MAX + 1] = { + [0] = "UNKNOWN_MESSAGE", + [PPTP_START_SESSION_REQUEST] = "START_SESSION_REQUEST", + [PPTP_START_SESSION_REPLY] = "START_SESSION_REPLY", + [PPTP_STOP_SESSION_REQUEST] = "STOP_SESSION_REQUEST", + [PPTP_STOP_SESSION_REPLY] = "STOP_SESSION_REPLY", + [PPTP_ECHO_REQUEST] = "ECHO_REQUEST", + [PPTP_ECHO_REPLY] = "ECHO_REPLY", + [PPTP_OUT_CALL_REQUEST] = "OUT_CALL_REQUEST", + [PPTP_OUT_CALL_REPLY] = "OUT_CALL_REPLY", + [PPTP_IN_CALL_REQUEST] = "IN_CALL_REQUEST", + [PPTP_IN_CALL_REPLY] = "IN_CALL_REPLY", + [PPTP_IN_CALL_CONNECT] = "IN_CALL_CONNECT", + [PPTP_CALL_CLEAR_REQUEST] = "CALL_CLEAR_REQUEST", + [PPTP_CALL_DISCONNECT_NOTIFY] = "CALL_DISCONNECT_NOTIFY", + [PPTP_WAN_ERROR_NOTIFY] = "WAN_ERROR_NOTIFY", + [PPTP_SET_LINK_INFO] = "SET_LINK_INFO" }; + +const char *pptp_msg_name(u_int16_t msg) +{ + if (msg > PPTP_MSG_MAX) + return pptp_msg_name_array[0]; + + return pptp_msg_name_array[msg]; +} EXPORT_SYMBOL(pptp_msg_name); #endif @@ -278,7 +286,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; msg = ntohs(ctlh->messageType); - pr_debug("inbound control message %s\n", pptp_msg_name[msg]); + pr_debug("inbound control message %s\n", pptp_msg_name(msg)); switch (msg) { case PPTP_START_SESSION_REPLY: @@ -313,7 +321,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, pcid = pptpReq->ocack.peersCallID; if (info->pns_call_id != pcid) goto invalid; - pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], + pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name(msg), ntohs(cid), ntohs(pcid)); if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) { @@ -330,7 +338,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, goto invalid; cid = pptpReq->icreq.callID; - pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid)); info->cstate = PPTP_CALL_IN_REQ; info->pac_call_id = cid; break; @@ -349,7 +357,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, if (info->pns_call_id != pcid) goto invalid; - pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); + pr_debug("%s, PCID=%X\n", pptp_msg_name(msg), ntohs(pcid)); info->cstate = PPTP_CALL_IN_CONF; /* we expect a GRE connection from PAC to PNS */ @@ -359,7 +367,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, case PPTP_CALL_DISCONNECT_NOTIFY: /* server confirms disconnect */ cid = pptpReq->disc.callID; - pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid)); info->cstate = PPTP_CALL_NONE; /* untrack this call id, unexpect GRE packets */ @@ -386,7 +394,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, invalid: pr_debug("invalid %s: type=%d cid=%u pcid=%u " "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + pptp_msg_name(msg), msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, ntohs(info->pns_call_id), ntohs(info->pac_call_id)); return NF_ACCEPT; @@ -406,7 +414,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; msg = ntohs(ctlh->messageType); - pr_debug("outbound control message %s\n", pptp_msg_name[msg]); + pr_debug("outbound control message %s\n", pptp_msg_name(msg)); switch (msg) { case PPTP_START_SESSION_REQUEST: @@ -428,7 +436,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, info->cstate = PPTP_CALL_OUT_REQ; /* track PNS call id */ cid = pptpReq->ocreq.callID; - pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid)); info->pns_call_id = cid; break; @@ -442,7 +450,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, pcid = pptpReq->icack.peersCallID; if (info->pac_call_id != pcid) goto invalid; - pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], + pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name(msg), ntohs(cid), ntohs(pcid)); if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) { @@ -482,7 +490,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, invalid: pr_debug("invalid %s: type=%d cid=%u pcid=%u " "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + pptp_msg_name(msg), msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, ntohs(info->pns_call_id), ntohs(info->pac_call_id)); return NF_ACCEPT; diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index e84a578dbe35..d76afafdc699 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -134,7 +134,7 @@ static int __init nf_nat_ftp_init(void) } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) +static int warn_set(const char *val, const struct kernel_param *kp) { printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index 1fb2258c3535..8039bcd60758 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -107,7 +107,7 @@ static int __init nf_nat_irc_init(void) } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) +static int warn_set(const char *val, const struct kernel_param *kp) { printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index c8a4a48bced9..8be604eb6961 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -34,6 +34,9 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, int length = (th->doff * 4) - sizeof(*th); u8 buf[40], *ptr; + if (unlikely(length < 0)) + return false; + ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); if (ptr == NULL) return false; @@ -50,6 +53,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, length--; continue; default: + if (length < 2) + return true; opsize = *ptr++; if (opsize < 2) return true; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a7967af0da82..6203995003a5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2849,12 +2849,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) - goto err2; + goto err3; list_add_tail_rcu(&set->list, &table->sets); table->use++; return 0; +err3: + ops->destroy(set); err2: kfree(set); err1: diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 99bc2f87a974..204be9374657 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -130,7 +130,7 @@ next_rule: list_for_each_entry_continue_rcu(rule, &chain->rules, list) { /* This rule is not active, skip. */ - if (unlikely(rule->genmask & (1 << gencursor))) + if (unlikely(rule->genmask & gencursor)) continue; rulenum++; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 044559c10e98..f01764b94b34 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -309,14 +309,14 @@ replay: #endif { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } @@ -406,7 +406,7 @@ ack: * pointing to the batch header. */ nfnl_err_reset(&err_list); - netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); + netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM); status |= NFNL_BATCH_FAILURE; goto done; } diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index ac143ae4f7b6..63a9d5fd00c0 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -355,10 +355,14 @@ static int nfnl_cthelper_update(const struct nlattr * const tb[], struct nf_conntrack_helper *helper) { + u32 size; int ret; - if (tb[NFCTH_PRIV_DATA_LEN]) - return -EBUSY; + if (tb[NFCTH_PRIV_DATA_LEN]) { + size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); + if (size != helper->data_len) + return -EBUSY; + } if (tb[NFCTH_POLICY]) { ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]); @@ -711,6 +715,8 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { [NFCTH_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, + [NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, }, + [NFCTH_STATUS] = { .type = NLA_U32, }, }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 54cde78c2718..ebce25080f7f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -486,7 +486,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nla_put_failure; if (indev && entskb->dev && - entskb->mac_header != entskb->network_header) { + skb_mac_header_was_set(entskb)) { struct nfqnl_msg_packet_hw phw; int len; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 0a5df0cbaa28..d6fcfc995420 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -121,6 +121,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return PTR_ERR(set); } + if (set->ops->update == NULL) + return -EOPNOTSUPP; + if (set->flags & NFT_SET_CONSTANT) return -EBUSY; @@ -186,8 +189,10 @@ static int nft_dynset_init(const struct nft_ctx *ctx, nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR, priv->expr->ops->size); if (set->flags & NFT_SET_TIMEOUT) { - if (timeout || set->timeout) + if (timeout || set->timeout) { + nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT); nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); + } } priv->timeout = timeout; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index ba7aed13e174..a81f6bf42d1f 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -34,6 +34,9 @@ static void nft_exthdr_eval(const struct nft_expr *expr, unsigned int offset = 0; int err; + if (pkt->skb->protocol != htons(ETH_P_IPV6)) + goto err; + err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); if (err < 0) goto err; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index a97a5bf716be..8a69c6fdced5 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -200,7 +200,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) goto err; - *dest = sk->sk_classid; + *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif default: diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index ee2d71753746..182704b980d1 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -135,7 +135,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->type = NF_NAT_MANIP_DST; break; default: - return -EINVAL; + return -EOPNOTSUPP; } err = nft_nat_validate(ctx, expr, NULL); @@ -157,7 +157,9 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip6); break; default: - return -EAFNOSUPPORT; + if (tb[NFTA_NAT_REG_ADDR_MIN]) + return -EAFNOSUPPORT; + break; } priv->family = family; @@ -206,7 +208,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (tb[NFTA_NAT_FLAGS]) { priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; + return -EOPNOTSUPP; } return 0; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 09b4b07eb676..ab3e7b14de09 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -74,7 +74,9 @@ static void nft_payload_eval(const struct nft_expr *expr, u32 *dest = ®s->data[priv->dreg]; int offset; - dest[priv->len / NFT_REG32_SIZE] = 0; + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index cdafbd38a456..2909c34dda7a 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -266,11 +266,66 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) } EXPORT_SYMBOL_GPL(xt_request_find_target); + +static int xt_obj_to_user(u16 __user *psize, u16 size, + void __user *pname, const char *name, + u8 __user *prev, u8 rev) +{ + if (put_user(size, psize)) + return -EFAULT; + if (copy_to_user(pname, name, strlen(name) + 1)) + return -EFAULT; + if (put_user(rev, prev)) + return -EFAULT; + + return 0; +} + +#define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE) \ + xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size, \ + U->u.user.name, K->u.kernel.TYPE->name, \ + &U->u.user.revision, K->u.kernel.TYPE->revision) + +int xt_data_to_user(void __user *dst, const void *src, + int usersize, int size) +{ + usersize = usersize ? : size; + if (copy_to_user(dst, src, usersize)) + return -EFAULT; + if (usersize != size && clear_user(dst + usersize, size - usersize)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(xt_data_to_user); + +#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ + xt_data_to_user(U->data, K->data, \ + K->u.kernel.TYPE->usersize, \ + C_SIZE ? : K->u.kernel.TYPE->TYPE##size) + +int xt_match_to_user(const struct xt_entry_match *m, + struct xt_entry_match __user *u) +{ + return XT_OBJ_TO_USER(u, m, match, 0) || + XT_DATA_TO_USER(u, m, match, 0); +} +EXPORT_SYMBOL_GPL(xt_match_to_user); + +int xt_target_to_user(const struct xt_entry_target *t, + struct xt_entry_target __user *u) +{ + return XT_OBJ_TO_USER(u, t, target, 0) || + XT_DATA_TO_USER(u, t, target, 0); +} +EXPORT_SYMBOL_GPL(xt_target_to_user); + static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_match *m; int have_rev = 0; + mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision > *bestp) @@ -279,6 +334,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) have_rev = 1; } } + mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return match_revfn(NFPROTO_UNSPEC, name, revision, bestp); @@ -291,6 +347,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) const struct xt_target *t; int have_rev = 0; + mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision > *bestp) @@ -299,6 +356,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) have_rev = 1; } } + mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return target_revfn(NFPROTO_UNSPEC, name, revision, bestp); @@ -312,12 +370,10 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, { int have_rev, best = -1; - mutex_lock(&xt[af].mutex); if (target == 1) have_rev = target_revfn(af, name, revision, &best); else have_rev = match_revfn(af, name, revision, &best); - mutex_unlock(&xt[af].mutex); /* Nothing at all? Return 0 to try loading module. */ if (best == -1) { @@ -566,7 +622,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, { const struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m; - int pad, off = xt_compat_match_offset(match); + int off = xt_compat_match_offset(match); u_int16_t msize = cm->u.user.match_size; char name[sizeof(m->u.user.name)]; @@ -576,9 +632,6 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, match->compat_from_user(m->data, cm->data); else memcpy(m->data, cm->data, msize - sizeof(*cm)); - pad = XT_ALIGN(match->matchsize) - match->matchsize; - if (pad > 0) - memset(m->data + match->matchsize, 0, pad); msize += off; m->u.user.match_size = msize; @@ -924,7 +977,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, { const struct xt_target *target = t->u.kernel.target; struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t; - int pad, off = xt_compat_target_offset(target); + int off = xt_compat_target_offset(target); u_int16_t tsize = ct->u.user.target_size; char name[sizeof(t->u.user.name)]; @@ -934,9 +987,6 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, target->compat_from_user(t->data, ct->data); else memcpy(t->data, ct->data, tsize - sizeof(*ct)); - pad = XT_ALIGN(target->targetsize) - target->targetsize; - if (pad > 0) - memset(t->data + target->targetsize, 0, pad); tsize += off; t->u.user.target_size = tsize; @@ -1144,6 +1194,9 @@ xt_replace_table(struct xt_table *table, smp_wmb(); table->private = newinfo; + /* make sure all cpus see new ->private value */ + smp_mb(); + /* * Even though table entries have now been swapped, other CPU's * may still be using the old entries. This is okay, because diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index febcfac7e3df..f2fe60ce286f 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -380,6 +380,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .name = "CT", .family = NFPROTO_UNSPEC, .targetsize = sizeof(struct xt_ct_target_info), + .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v0, .destroy = xt_ct_tg_destroy_v0, .target = xt_ct_target_v0, @@ -391,6 +392,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_UNSPEC, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v1, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -402,6 +404,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_UNSPEC, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v2, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index b0f4f1bca61f..18e4fd8aa166 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -505,6 +505,7 @@ static struct xt_target idletimer_tg __read_mostly = { .family = NFPROTO_UNSPEC, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 0858fe17e14a..2d1c5c169a26 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -198,6 +198,7 @@ static struct xt_target led_tg_reg __read_mostly = { .family = NFPROTO_UNSPEC, .target = led_tg, .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), .checkentry = led_tg_check, .destroy = led_tg_destroy, .me = THIS_MODULE, diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 0be96f8475f7..6768d4d2ffd0 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -107,6 +107,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } cfg; int ret; + if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) + return -ENAMETOOLONG; + if (unlikely(!rnd_inited)) { get_random_bytes(&jhash_rnd, sizeof(jhash_rnd)); rnd_inited = true; @@ -178,6 +181,7 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = { .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 3eff7b67cdf2..d597b504a82e 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -127,6 +127,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .family = NFPROTO_IPV4, .target = tee_tg4, .targetsize = sizeof(struct xt_tee_tginfo), + .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, @@ -138,6 +139,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .family = NFPROTO_IPV6, .target = tee_tg6, .targetsize = sizeof(struct xt_tee_tginfo), + .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 3ab591e73ec0..7f4414d26a66 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -105,19 +105,24 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) * belonging to established connections going through that one. */ static inline struct sock * -nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet_lookup_listener(net, &tcp_hashinfo, skb, + ip_hdrlen(skb) + + __tcp_hdrlen(tcph), saddr, sport, daddr, dport, in->ifindex); @@ -169,19 +174,23 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, #ifdef XT_TPROXY_HAVE_IPV6 static inline struct sock * -nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, + thoff + __tcp_hdrlen(tcph), saddr, sport, daddr, ntohs(dport), in->ifindex); @@ -267,7 +276,7 @@ tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); @@ -305,7 +314,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NFT_LOOKUP_ESTABLISHED); @@ -321,7 +330,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NFT_LOOKUP_LISTENER); @@ -429,7 +438,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(par->net, tproto, + sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), hp->source, @@ -472,7 +481,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, par->in, NFT_LOOKUP_ESTABLISHED); @@ -487,8 +496,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, - &iph->saddr, laddr, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, + tproto, &iph->saddr, laddr, hp->source, lport, par->in, NFT_LOOKUP_LISTENER); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index dffee9d47ec4..7e7746cc45d6 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -8,8 +8,10 @@ */ #include <linux/module.h> +#include <linux/syscalls.h> #include <linux/skbuff.h> #include <linux/filter.h> +#include <linux/bpf.h> #include <linux/netfilter/xt_bpf.h> #include <linux/netfilter/x_tables.h> @@ -20,15 +22,18 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_bpf"); MODULE_ALIAS("ip6t_bpf"); -static int bpf_mt_check(const struct xt_mtchk_param *par) +static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len, + struct bpf_prog **ret) { - struct xt_bpf_info *info = par->matchinfo; struct sock_fprog_kern program; - program.len = info->bpf_program_num_elem; - program.filter = info->bpf_program; + if (len > XT_BPF_MAX_NUM_INSTR) + return -EINVAL; - if (bpf_prog_create(&info->filter, &program)) { + program.len = len; + program.filter = insns; + + if (bpf_prog_create(ret, &program)) { pr_info("bpf: check failed: parse error\n"); return -EINVAL; } @@ -36,6 +41,53 @@ static int bpf_mt_check(const struct xt_mtchk_param *par) return 0; } +static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + *ret = prog; + return 0; +} + +static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) +{ + if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX) + return -EINVAL; + + *ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER); + return PTR_ERR_OR_ZERO(*ret); + +} + +static int bpf_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_bpf_info *info = par->matchinfo; + + return __bpf_mt_check_bytecode(info->bpf_program, + info->bpf_program_num_elem, + &info->filter); +} + +static int bpf_mt_check_v1(const struct xt_mtchk_param *par) +{ + struct xt_bpf_info_v1 *info = par->matchinfo; + + if (info->mode == XT_BPF_MODE_BYTECODE) + return __bpf_mt_check_bytecode(info->bpf_program, + info->bpf_program_num_elem, + &info->filter); + else if (info->mode == XT_BPF_MODE_FD_ELF) + return __bpf_mt_check_fd(info->fd, &info->filter); + else if (info->mode == XT_BPF_MODE_PATH_PINNED) + return __bpf_mt_check_path(info->path, &info->filter); + else + return -EINVAL; +} + static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; @@ -43,31 +95,60 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) return BPF_PROG_RUN(info->filter, skb); } +static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_bpf_info_v1 *info = par->matchinfo; + + return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb); +} + static void bpf_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_bpf_info *info = par->matchinfo; + + bpf_prog_destroy(info->filter); +} + +static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par) +{ + const struct xt_bpf_info_v1 *info = par->matchinfo; + bpf_prog_destroy(info->filter); } -static struct xt_match bpf_mt_reg __read_mostly = { - .name = "bpf", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = bpf_mt_check, - .match = bpf_mt, - .destroy = bpf_mt_destroy, - .matchsize = sizeof(struct xt_bpf_info), - .me = THIS_MODULE, +static struct xt_match bpf_mt_reg[] __read_mostly = { + { + .name = "bpf", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = bpf_mt_check, + .match = bpf_mt, + .destroy = bpf_mt_destroy, + .matchsize = sizeof(struct xt_bpf_info), + .usersize = offsetof(struct xt_bpf_info, filter), + .me = THIS_MODULE, + }, + { + .name = "bpf", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = bpf_mt_check_v1, + .match = bpf_mt_v1, + .destroy = bpf_mt_destroy_v1, + .matchsize = sizeof(struct xt_bpf_info_v1), + .usersize = offsetof(struct xt_bpf_info_v1, filter), + .me = THIS_MODULE, + }, }; static int __init bpf_mt_init(void) { - return xt_register_match(&bpf_mt_reg); + return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg)); } static void __exit bpf_mt_exit(void) { - xt_unregister_match(&bpf_mt_reg); + xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg)); } module_init(bpf_mt_init); diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index a1d126f29463..54eaeb45ce99 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -42,7 +42,8 @@ cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) if (skb->sk == NULL || !sk_fullsock(skb->sk)) return false; - return (info->id == skb->sk->sk_classid) ^ info->invert; + return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ + info->invert; } static struct xt_match cgroup_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 99bbc829868d..aab11f7ab547 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -437,6 +437,7 @@ static struct xt_match connlimit_mt_reg __read_mostly = { .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), + .usersize = offsetof(struct xt_connlimit_info, data), .destroy = connlimit_mt_destroy, .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 7381be0cdcdf..d893cc133de4 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -726,6 +726,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, @@ -737,6 +738,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index bef850596558..e84de7656289 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -193,6 +193,7 @@ static struct xt_match limit_mt_reg __read_mostly = { .compat_from_user = limit_mt_compat_from_user, .compat_to_user = limit_mt_compat_to_user, #endif + .usersize = offsetof(struct xt_rateinfo, prev), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 3048a7e3a90a..e9adf6ebca30 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -62,6 +62,7 @@ static struct xt_match nfacct_mt_reg __read_mostly = { .match = nfacct_mt, .destroy = nfacct_mt_destroy, .matchsize = sizeof(struct xt_nfacct_match_info), + .usersize = offsetof(struct xt_nfacct_match_info, nfacct), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c deleted file mode 100644 index e82524b21d07..000000000000 --- a/net/netfilter/xt_qtaguid.c +++ /dev/null @@ -1,3015 +0,0 @@ -/* - * Kernel iptables module to track stats for packets based on user tags. - * - * (C) 2011 Google, Inc - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* - * There are run-time debug flags enabled via the debug_mask module param, or - * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h. - */ -#define DEBUG - -#include <linux/file.h> -#include <linux/inetdevice.h> -#include <linux/module.h> -#include <linux/miscdevice.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter/xt_qtaguid.h> -#include <linux/ratelimit.h> -#include <linux/seq_file.h> -#include <linux/skbuff.h> -#include <linux/workqueue.h> -#include <net/addrconf.h> -#include <net/sock.h> -#include <net/tcp.h> -#include <net/udp.h> - -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) -#include <linux/netfilter_ipv6/ip6_tables.h> -#endif - -#include <linux/netfilter/xt_socket.h> -#include "xt_qtaguid_internal.h" -#include "xt_qtaguid_print.h" -#include "../../fs/proc/internal.h" - -/* - * We only use the xt_socket funcs within a similar context to avoid unexpected - * return values. - */ -#define XT_SOCKET_SUPPORTED_HOOKS \ - ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) - - -static const char *module_procdirname = "xt_qtaguid"; -static struct proc_dir_entry *xt_qtaguid_procdir; - -static unsigned int proc_iface_perms = S_IRUGO; -module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR); - -static struct proc_dir_entry *xt_qtaguid_stats_file; -static unsigned int proc_stats_perms = S_IRUGO; -module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR); - -static struct proc_dir_entry *xt_qtaguid_ctrl_file; - -/* Everybody can write. But proc_ctrl_write_limited is true by default which - * limits what can be controlled. See the can_*() functions. - */ -static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUGO; -module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR); - -/* Limited by default, so the gid of the ctrl and stats proc entries - * will limit what can be done. See the can_*() functions. - */ -static bool proc_stats_readall_limited = true; -static bool proc_ctrl_write_limited = true; - -module_param_named(stats_readall_limited, proc_stats_readall_limited, bool, - S_IRUGO | S_IWUSR); -module_param_named(ctrl_write_limited, proc_ctrl_write_limited, bool, - S_IRUGO | S_IWUSR); - -/* - * Limit the number of active tags (via socket tags) for a given UID. - * Multiple processes could share the UID. - */ -static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS; -module_param(max_sock_tags, int, S_IRUGO | S_IWUSR); - -/* - * After the kernel has initiallized this module, it is still possible - * to make it passive. - * Setting passive to Y: - * - the iface stats handling will not act on notifications. - * - iptables matches will never match. - * - ctrl commands silently succeed. - * - stats are always empty. - * This is mostly usefull when a bug is suspected. - */ -static bool module_passive; -module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR); - -/* - * Control how qtaguid data is tracked per proc/uid. - * Setting tag_tracking_passive to Y: - * - don't create proc specific structs to track tags - * - don't check that active tag stats exceed some limits. - * - don't clean up socket tags on process exits. - * This is mostly usefull when a bug is suspected. - */ -static bool qtu_proc_handling_passive; -module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool, - S_IRUGO | S_IWUSR); - -#define QTU_DEV_NAME "xt_qtaguid" - -uint qtaguid_debug_mask = DEFAULT_DEBUG_MASK; -module_param_named(debug_mask, qtaguid_debug_mask, uint, S_IRUGO | S_IWUSR); - -/*---------------------------------------------------------------------------*/ -static const char *iface_stat_procdirname = "iface_stat"; -static struct proc_dir_entry *iface_stat_procdir; -/* - * The iface_stat_all* will go away once userspace gets use to the new fields - * that have a format line. - */ -static const char *iface_stat_all_procfilename = "iface_stat_all"; -static struct proc_dir_entry *iface_stat_all_procfile; -static const char *iface_stat_fmt_procfilename = "iface_stat_fmt"; -static struct proc_dir_entry *iface_stat_fmt_procfile; - - -static LIST_HEAD(iface_stat_list); -static DEFINE_SPINLOCK(iface_stat_list_lock); - -static struct rb_root sock_tag_tree = RB_ROOT; -static DEFINE_SPINLOCK(sock_tag_list_lock); - -static struct rb_root tag_counter_set_tree = RB_ROOT; -static DEFINE_SPINLOCK(tag_counter_set_list_lock); - -static struct rb_root uid_tag_data_tree = RB_ROOT; -static DEFINE_SPINLOCK(uid_tag_data_tree_lock); - -static struct rb_root proc_qtu_data_tree = RB_ROOT; -/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */ - -static struct qtaguid_event_counts qtu_events; -/*----------------------------------------------*/ -static bool can_manipulate_uids(void) -{ - /* root pwnd */ - return in_egroup_p(xt_qtaguid_ctrl_file->gid) - || unlikely(!from_kuid(&init_user_ns, current_fsuid())) || unlikely(!proc_ctrl_write_limited) - || unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid)); -} - -static bool can_impersonate_uid(kuid_t uid) -{ - return uid_eq(uid, current_fsuid()) || can_manipulate_uids(); -} - -static bool can_read_other_uid_stats(kuid_t uid) -{ - /* root pwnd */ - return in_egroup_p(xt_qtaguid_stats_file->gid) - || unlikely(!from_kuid(&init_user_ns, current_fsuid())) || uid_eq(uid, current_fsuid()) - || unlikely(!proc_stats_readall_limited) - || unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid)); -} - -static inline void dc_add_byte_packets(struct data_counters *counters, int set, - enum ifs_tx_rx direction, - enum ifs_proto ifs_proto, - int bytes, - int packets) -{ - counters->bpc[set][direction][ifs_proto].bytes += bytes; - counters->bpc[set][direction][ifs_proto].packets += packets; -} - -static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag) -{ - struct rb_node *node = root->rb_node; - - while (node) { - struct tag_node *data = rb_entry(node, struct tag_node, node); - int result; - RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): " - " node=%p data=%p\n", tag, node, data); - result = tag_compare(tag, data->tag); - RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): " - " data.tag=0x%llx (uid=%u) res=%d\n", - tag, data->tag, get_uid_from_tag(data->tag), result); - if (result < 0) - node = node->rb_left; - else if (result > 0) - node = node->rb_right; - else - return data; - } - return NULL; -} - -static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - - /* Figure out where to put new node */ - while (*new) { - struct tag_node *this = rb_entry(*new, struct tag_node, - node); - int result = tag_compare(data->tag, this->tag); - RB_DEBUG("qtaguid: %s(): tag=0x%llx" - " (uid=%u)\n", __func__, - this->tag, - get_uid_from_tag(this->tag)); - parent = *new; - if (result < 0) - new = &((*new)->rb_left); - else if (result > 0) - new = &((*new)->rb_right); - else - BUG(); - } - - /* Add new node and rebalance tree. */ - rb_link_node(&data->node, parent, new); - rb_insert_color(&data->node, root); -} - -static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root) -{ - tag_node_tree_insert(&data->tn, root); -} - -static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag) -{ - struct tag_node *node = tag_node_tree_search(root, tag); - if (!node) - return NULL; - return rb_entry(&node->node, struct tag_stat, tn.node); -} - -static void tag_counter_set_tree_insert(struct tag_counter_set *data, - struct rb_root *root) -{ - tag_node_tree_insert(&data->tn, root); -} - -static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root, - tag_t tag) -{ - struct tag_node *node = tag_node_tree_search(root, tag); - if (!node) - return NULL; - return rb_entry(&node->node, struct tag_counter_set, tn.node); - -} - -static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root) -{ - tag_node_tree_insert(&data->tn, root); -} - -static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag) -{ - struct tag_node *node = tag_node_tree_search(root, tag); - if (!node) - return NULL; - return rb_entry(&node->node, struct tag_ref, tn.node); -} - -static struct sock_tag *sock_tag_tree_search(struct rb_root *root, - const struct sock *sk) -{ - struct rb_node *node = root->rb_node; - - while (node) { - struct sock_tag *data = rb_entry(node, struct sock_tag, - sock_node); - if (sk < data->sk) - node = node->rb_left; - else if (sk > data->sk) - node = node->rb_right; - else - return data; - } - return NULL; -} - -static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - - /* Figure out where to put new node */ - while (*new) { - struct sock_tag *this = rb_entry(*new, struct sock_tag, - sock_node); - parent = *new; - if (data->sk < this->sk) - new = &((*new)->rb_left); - else if (data->sk > this->sk) - new = &((*new)->rb_right); - else - BUG(); - } - - /* Add new node and rebalance tree. */ - rb_link_node(&data->sock_node, parent, new); - rb_insert_color(&data->sock_node, root); -} - -static void sock_tag_tree_erase(struct rb_root *st_to_free_tree) -{ - struct rb_node *node; - struct sock_tag *st_entry; - - node = rb_first(st_to_free_tree); - while (node) { - st_entry = rb_entry(node, struct sock_tag, sock_node); - node = rb_next(node); - CT_DEBUG("qtaguid: %s(): " - "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__, - st_entry->sk, - st_entry->tag, - get_uid_from_tag(st_entry->tag)); - rb_erase(&st_entry->sock_node, st_to_free_tree); - sock_put(st_entry->sk); - kfree(st_entry); - } -} - -static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root, - const pid_t pid) -{ - struct rb_node *node = root->rb_node; - - while (node) { - struct proc_qtu_data *data = rb_entry(node, - struct proc_qtu_data, - node); - if (pid < data->pid) - node = node->rb_left; - else if (pid > data->pid) - node = node->rb_right; - else - return data; - } - return NULL; -} - -static void proc_qtu_data_tree_insert(struct proc_qtu_data *data, - struct rb_root *root) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - - /* Figure out where to put new node */ - while (*new) { - struct proc_qtu_data *this = rb_entry(*new, - struct proc_qtu_data, - node); - parent = *new; - if (data->pid < this->pid) - new = &((*new)->rb_left); - else if (data->pid > this->pid) - new = &((*new)->rb_right); - else - BUG(); - } - - /* Add new node and rebalance tree. */ - rb_link_node(&data->node, parent, new); - rb_insert_color(&data->node, root); -} - -static void uid_tag_data_tree_insert(struct uid_tag_data *data, - struct rb_root *root) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - - /* Figure out where to put new node */ - while (*new) { - struct uid_tag_data *this = rb_entry(*new, - struct uid_tag_data, - node); - parent = *new; - if (data->uid < this->uid) - new = &((*new)->rb_left); - else if (data->uid > this->uid) - new = &((*new)->rb_right); - else - BUG(); - } - - /* Add new node and rebalance tree. */ - rb_link_node(&data->node, parent, new); - rb_insert_color(&data->node, root); -} - -static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root, - uid_t uid) -{ - struct rb_node *node = root->rb_node; - - while (node) { - struct uid_tag_data *data = rb_entry(node, - struct uid_tag_data, - node); - if (uid < data->uid) - node = node->rb_left; - else if (uid > data->uid) - node = node->rb_right; - else - return data; - } - return NULL; -} - -/* - * Allocates a new uid_tag_data struct if needed. - * Returns a pointer to the found or allocated uid_tag_data. - * Returns a PTR_ERR on failures, and lock is not held. - * If found is not NULL: - * sets *found to true if not allocated. - * sets *found to false if allocated. - */ -struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res) -{ - struct uid_tag_data *utd_entry; - - /* Look for top level uid_tag_data for the UID */ - utd_entry = uid_tag_data_tree_search(&uid_tag_data_tree, uid); - DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd_entry); - - if (found_res) - *found_res = utd_entry; - if (utd_entry) - return utd_entry; - - utd_entry = kzalloc(sizeof(*utd_entry), GFP_ATOMIC); - if (!utd_entry) { - pr_err("qtaguid: get_uid_data(%u): " - "tag data alloc failed\n", uid); - return ERR_PTR(-ENOMEM); - } - - utd_entry->uid = uid; - utd_entry->tag_ref_tree = RB_ROOT; - uid_tag_data_tree_insert(utd_entry, &uid_tag_data_tree); - DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd_entry); - return utd_entry; -} - -/* Never returns NULL. Either PTR_ERR or a valid ptr. */ -static struct tag_ref *new_tag_ref(tag_t new_tag, - struct uid_tag_data *utd_entry) -{ - struct tag_ref *tr_entry; - int res; - - if (utd_entry->num_active_tags + 1 > max_sock_tags) { - pr_info("qtaguid: new_tag_ref(0x%llx): " - "tag ref alloc quota exceeded. max=%d\n", - new_tag, max_sock_tags); - res = -EMFILE; - goto err_res; - - } - - tr_entry = kzalloc(sizeof(*tr_entry), GFP_ATOMIC); - if (!tr_entry) { - pr_err("qtaguid: new_tag_ref(0x%llx): " - "tag ref alloc failed\n", - new_tag); - res = -ENOMEM; - goto err_res; - } - tr_entry->tn.tag = new_tag; - /* tr_entry->num_sock_tags handled by caller */ - utd_entry->num_active_tags++; - tag_ref_tree_insert(tr_entry, &utd_entry->tag_ref_tree); - DR_DEBUG("qtaguid: new_tag_ref(0x%llx): " - " inserted new tag ref %p\n", - new_tag, tr_entry); - return tr_entry; - -err_res: - return ERR_PTR(res); -} - -static struct tag_ref *lookup_tag_ref(tag_t full_tag, - struct uid_tag_data **utd_res) -{ - struct uid_tag_data *utd_entry; - struct tag_ref *tr_entry; - bool found_utd; - uid_t uid = get_uid_from_tag(full_tag); - - DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n", - full_tag, uid); - - utd_entry = get_uid_data(uid, &found_utd); - if (IS_ERR_OR_NULL(utd_entry)) { - if (utd_res) - *utd_res = utd_entry; - return NULL; - } - - tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag); - if (utd_res) - *utd_res = utd_entry; - DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n", - full_tag, utd_entry, tr_entry); - return tr_entry; -} - -/* Never returns NULL. Either PTR_ERR or a valid ptr. */ -static struct tag_ref *get_tag_ref(tag_t full_tag, - struct uid_tag_data **utd_res) -{ - struct uid_tag_data *utd_entry; - struct tag_ref *tr_entry; - - DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n", - full_tag); - tr_entry = lookup_tag_ref(full_tag, &utd_entry); - BUG_ON(IS_ERR_OR_NULL(utd_entry)); - if (!tr_entry) - tr_entry = new_tag_ref(full_tag, utd_entry); - - if (utd_res) - *utd_res = utd_entry; - DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n", - full_tag, utd_entry, tr_entry); - return tr_entry; -} - -/* Checks and maybe frees the UID Tag Data entry */ -static void put_utd_entry(struct uid_tag_data *utd_entry) -{ - /* Are we done with the UID tag data entry? */ - if (RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) && - !utd_entry->num_pqd) { - DR_DEBUG("qtaguid: %s(): " - "erase utd_entry=%p uid=%u " - "by pid=%u tgid=%u uid=%u\n", __func__, - utd_entry, utd_entry->uid, - current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - BUG_ON(utd_entry->num_active_tags); - rb_erase(&utd_entry->node, &uid_tag_data_tree); - kfree(utd_entry); - } else { - DR_DEBUG("qtaguid: %s(): " - "utd_entry=%p still has %d tags %d proc_qtu_data\n", - __func__, utd_entry, utd_entry->num_active_tags, - utd_entry->num_pqd); - BUG_ON(!(utd_entry->num_active_tags || - utd_entry->num_pqd)); - } -} - -/* - * If no sock_tags are using this tag_ref, - * decrements refcount of utd_entry, removes tr_entry - * from utd_entry->tag_ref_tree and frees. - */ -static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry, - struct uid_tag_data *utd_entry) -{ - DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__, - tr_entry, tr_entry->tn.tag, - get_uid_from_tag(tr_entry->tn.tag)); - if (!tr_entry->num_sock_tags) { - BUG_ON(!utd_entry->num_active_tags); - utd_entry->num_active_tags--; - rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree); - DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry); - kfree(tr_entry); - } -} - -static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry) -{ - struct rb_node *node; - struct tag_ref *tr_entry; - tag_t acct_tag; - - DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__, - full_tag, get_uid_from_tag(full_tag)); - acct_tag = get_atag_from_tag(full_tag); - node = rb_first(&utd_entry->tag_ref_tree); - while (node) { - tr_entry = rb_entry(node, struct tag_ref, tn.node); - node = rb_next(node); - if (!acct_tag || tr_entry->tn.tag == full_tag) - free_tag_ref_from_utd_entry(tr_entry, utd_entry); - } -} - -static ssize_t read_proc_u64(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - uint64_t *valuep = PDE_DATA(file_inode(file)); - char tmp[24]; - size_t tmp_size; - - tmp_size = scnprintf(tmp, sizeof(tmp), "%llu\n", *valuep); - return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size); -} - -static ssize_t read_proc_bool(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - bool *valuep = PDE_DATA(file_inode(file)); - char tmp[24]; - size_t tmp_size; - - tmp_size = scnprintf(tmp, sizeof(tmp), "%u\n", *valuep); - return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size); -} - -static int get_active_counter_set(tag_t tag) -{ - int active_set = 0; - struct tag_counter_set *tcs; - - MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)" - " (uid=%u)\n", - tag, get_uid_from_tag(tag)); - /* For now we only handle UID tags for active sets */ - tag = get_utag_from_tag(tag); - spin_lock_bh(&tag_counter_set_list_lock); - tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag); - if (tcs) - active_set = tcs->active_set; - spin_unlock_bh(&tag_counter_set_list_lock); - return active_set; -} - -/* - * Find the entry for tracking the specified interface. - * Caller must hold iface_stat_list_lock - */ -static struct iface_stat *get_iface_entry(const char *ifname) -{ - struct iface_stat *iface_entry; - - /* Find the entry for tracking the specified tag within the interface */ - if (ifname == NULL) { - pr_info("qtaguid: iface_stat: get() NULL device name\n"); - return NULL; - } - - /* Iterate over interfaces */ - list_for_each_entry(iface_entry, &iface_stat_list, list) { - if (!strcmp(ifname, iface_entry->ifname)) - goto done; - } - iface_entry = NULL; -done: - return iface_entry; -} - -/* This is for fmt2 only */ -static void pp_iface_stat_header(struct seq_file *m) -{ - seq_puts(m, - "ifname " - "total_skb_rx_bytes total_skb_rx_packets " - "total_skb_tx_bytes total_skb_tx_packets " - "rx_tcp_bytes rx_tcp_packets " - "rx_udp_bytes rx_udp_packets " - "rx_other_bytes rx_other_packets " - "tx_tcp_bytes tx_tcp_packets " - "tx_udp_bytes tx_udp_packets " - "tx_other_bytes tx_other_packets\n" - ); -} - -static void pp_iface_stat_line(struct seq_file *m, - struct iface_stat *iface_entry) -{ - struct data_counters *cnts; - int cnt_set = 0; /* We only use one set for the device */ - cnts = &iface_entry->totals_via_skb; - seq_printf(m, "%s %llu %llu %llu %llu %llu %llu %llu %llu " - "%llu %llu %llu %llu %llu %llu %llu %llu\n", - iface_entry->ifname, - dc_sum_bytes(cnts, cnt_set, IFS_RX), - dc_sum_packets(cnts, cnt_set, IFS_RX), - dc_sum_bytes(cnts, cnt_set, IFS_TX), - dc_sum_packets(cnts, cnt_set, IFS_TX), - cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes, - cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets, - cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes, - cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets, - cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes, - cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets, - cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes, - cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets, - cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes, - cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, - cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, - cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); -} - -struct proc_iface_stat_fmt_info { - int fmt; -}; - -static void *iface_stat_fmt_proc_start(struct seq_file *m, loff_t *pos) -{ - struct proc_iface_stat_fmt_info *p = m->private; - loff_t n = *pos; - - /* - * This lock will prevent iface_stat_update() from changing active, - * and in turn prevent an interface from unregistering itself. - */ - spin_lock_bh(&iface_stat_list_lock); - - if (unlikely(module_passive)) - return NULL; - - if (!n && p->fmt == 2) - pp_iface_stat_header(m); - - return seq_list_start(&iface_stat_list, n); -} - -static void *iface_stat_fmt_proc_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &iface_stat_list, pos); -} - -static void iface_stat_fmt_proc_stop(struct seq_file *m, void *p) -{ - spin_unlock_bh(&iface_stat_list_lock); -} - -static int iface_stat_fmt_proc_show(struct seq_file *m, void *v) -{ - struct proc_iface_stat_fmt_info *p = m->private; - struct iface_stat *iface_entry; - struct rtnl_link_stats64 dev_stats, *stats; - struct rtnl_link_stats64 no_dev_stats = {0}; - - - CT_DEBUG("qtaguid:proc iface_stat_fmt pid=%u tgid=%u uid=%u\n", - current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - - iface_entry = list_entry(v, struct iface_stat, list); - - if (iface_entry->active) { - stats = dev_get_stats(iface_entry->net_dev, - &dev_stats); - } else { - stats = &no_dev_stats; - } - /* - * If the meaning of the data changes, then update the fmtX - * string. - */ - if (p->fmt == 1) { - seq_printf(m, "%s %d %llu %llu %llu %llu %llu %llu %llu %llu\n", - iface_entry->ifname, - iface_entry->active, - iface_entry->totals_via_dev[IFS_RX].bytes, - iface_entry->totals_via_dev[IFS_RX].packets, - iface_entry->totals_via_dev[IFS_TX].bytes, - iface_entry->totals_via_dev[IFS_TX].packets, - stats->rx_bytes, stats->rx_packets, - stats->tx_bytes, stats->tx_packets - ); - } else { - pp_iface_stat_line(m, iface_entry); - } - return 0; -} - -static const struct file_operations read_u64_fops = { - .read = read_proc_u64, - .llseek = default_llseek, -}; - -static const struct file_operations read_bool_fops = { - .read = read_proc_bool, - .llseek = default_llseek, -}; - -static void iface_create_proc_worker(struct work_struct *work) -{ - struct proc_dir_entry *proc_entry; - struct iface_stat_work *isw = container_of(work, struct iface_stat_work, - iface_work); - struct iface_stat *new_iface = isw->iface_entry; - - /* iface_entries are not deleted, so safe to manipulate. */ - proc_entry = proc_mkdir(new_iface->ifname, iface_stat_procdir); - if (IS_ERR_OR_NULL(proc_entry)) { - pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n"); - kfree(isw); - return; - } - - new_iface->proc_ptr = proc_entry; - - proc_create_data("tx_bytes", proc_iface_perms, proc_entry, - &read_u64_fops, - &new_iface->totals_via_dev[IFS_TX].bytes); - proc_create_data("rx_bytes", proc_iface_perms, proc_entry, - &read_u64_fops, - &new_iface->totals_via_dev[IFS_RX].bytes); - proc_create_data("tx_packets", proc_iface_perms, proc_entry, - &read_u64_fops, - &new_iface->totals_via_dev[IFS_TX].packets); - proc_create_data("rx_packets", proc_iface_perms, proc_entry, - &read_u64_fops, - &new_iface->totals_via_dev[IFS_RX].packets); - proc_create_data("active", proc_iface_perms, proc_entry, - &read_bool_fops, &new_iface->active); - - IF_DEBUG("qtaguid: iface_stat: create_proc(): done " - "entry=%p dev=%s\n", new_iface, new_iface->ifname); - kfree(isw); -} - -/* - * Will set the entry's active state, and - * update the net_dev accordingly also. - */ -static void _iface_stat_set_active(struct iface_stat *entry, - struct net_device *net_dev, - bool activate) -{ - if (activate) { - entry->net_dev = net_dev; - entry->active = true; - IF_DEBUG("qtaguid: %s(%s): " - "enable tracking. rfcnt=%d\n", __func__, - entry->ifname, - __this_cpu_read(*net_dev->pcpu_refcnt)); - } else { - entry->active = false; - entry->net_dev = NULL; - IF_DEBUG("qtaguid: %s(%s): " - "disable tracking. rfcnt=%d\n", __func__, - entry->ifname, - __this_cpu_read(*net_dev->pcpu_refcnt)); - - } -} - -/* Caller must hold iface_stat_list_lock */ -static struct iface_stat *iface_alloc(struct net_device *net_dev) -{ - struct iface_stat *new_iface; - struct iface_stat_work *isw; - - new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC); - if (new_iface == NULL) { - pr_err("qtaguid: iface_stat: create(%s): " - "iface_stat alloc failed\n", net_dev->name); - return NULL; - } - new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC); - if (new_iface->ifname == NULL) { - pr_err("qtaguid: iface_stat: create(%s): " - "ifname alloc failed\n", net_dev->name); - kfree(new_iface); - return NULL; - } - spin_lock_init(&new_iface->tag_stat_list_lock); - new_iface->tag_stat_tree = RB_ROOT; - _iface_stat_set_active(new_iface, net_dev, true); - - /* - * ipv6 notifier chains are atomic :( - * No create_proc_read_entry() for you! - */ - isw = kmalloc(sizeof(*isw), GFP_ATOMIC); - if (!isw) { - pr_err("qtaguid: iface_stat: create(%s): " - "work alloc failed\n", new_iface->ifname); - _iface_stat_set_active(new_iface, net_dev, false); - kfree(new_iface->ifname); - kfree(new_iface); - return NULL; - } - isw->iface_entry = new_iface; - INIT_WORK(&isw->iface_work, iface_create_proc_worker); - schedule_work(&isw->iface_work); - list_add(&new_iface->list, &iface_stat_list); - return new_iface; -} - -static void iface_check_stats_reset_and_adjust(struct net_device *net_dev, - struct iface_stat *iface) -{ - struct rtnl_link_stats64 dev_stats, *stats; - bool stats_rewound; - - stats = dev_get_stats(net_dev, &dev_stats); - /* No empty packets */ - stats_rewound = - (stats->rx_bytes < iface->last_known[IFS_RX].bytes) - || (stats->tx_bytes < iface->last_known[IFS_TX].bytes); - - IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p " - "bytes rx/tx=%llu/%llu " - "active=%d last_known=%d " - "stats_rewound=%d\n", __func__, - net_dev ? net_dev->name : "?", - iface, net_dev, - stats->rx_bytes, stats->tx_bytes, - iface->active, iface->last_known_valid, stats_rewound); - - if (iface->active && iface->last_known_valid && stats_rewound) { - pr_warn_once("qtaguid: iface_stat: %s(%s): " - "iface reset its stats unexpectedly\n", __func__, - net_dev->name); - - iface->totals_via_dev[IFS_TX].bytes += - iface->last_known[IFS_TX].bytes; - iface->totals_via_dev[IFS_TX].packets += - iface->last_known[IFS_TX].packets; - iface->totals_via_dev[IFS_RX].bytes += - iface->last_known[IFS_RX].bytes; - iface->totals_via_dev[IFS_RX].packets += - iface->last_known[IFS_RX].packets; - iface->last_known_valid = false; - IF_DEBUG("qtaguid: %s(%s): iface=%p " - "used last known bytes rx/tx=%llu/%llu\n", __func__, - iface->ifname, iface, iface->last_known[IFS_RX].bytes, - iface->last_known[IFS_TX].bytes); - } -} - -/* - * Create a new entry for tracking the specified interface. - * Do nothing if the entry already exists. - * Called when an interface is configured with a valid IP address. - */ -static void iface_stat_create(struct net_device *net_dev, - struct in_ifaddr *ifa) -{ - struct in_device *in_dev = NULL; - const char *ifname; - struct iface_stat *entry; - __be32 ipaddr = 0; - struct iface_stat *new_iface; - - IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n", - net_dev ? net_dev->name : "?", - ifa, net_dev); - if (!net_dev) { - pr_err("qtaguid: iface_stat: create(): no net dev\n"); - return; - } - - ifname = net_dev->name; - if (!ifa) { - in_dev = in_dev_get(net_dev); - if (!in_dev) { - pr_err("qtaguid: iface_stat: create(%s): no inet dev\n", - ifname); - return; - } - IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n", - ifname, in_dev); - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - IF_DEBUG("qtaguid: iface_stat: create(%s): " - "ifa=%p ifa_label=%s\n", - ifname, ifa, - ifa->ifa_label); - if (!strcmp(ifname, ifa->ifa_label)) - break; - } - } - - if (!ifa) { - IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n", - ifname); - goto done_put; - } - ipaddr = ifa->ifa_local; - - spin_lock_bh(&iface_stat_list_lock); - entry = get_iface_entry(ifname); - if (entry != NULL) { - IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n", - ifname, entry); - iface_check_stats_reset_and_adjust(net_dev, entry); - _iface_stat_set_active(entry, net_dev, true); - IF_DEBUG("qtaguid: %s(%s): " - "tracking now %d on ip=%pI4\n", __func__, - entry->ifname, true, &ipaddr); - goto done_unlock_put; - } - - new_iface = iface_alloc(net_dev); - IF_DEBUG("qtaguid: iface_stat: create(%s): done " - "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr); -done_unlock_put: - spin_unlock_bh(&iface_stat_list_lock); -done_put: - if (in_dev) - in_dev_put(in_dev); -} - -static void iface_stat_create_ipv6(struct net_device *net_dev, - struct inet6_ifaddr *ifa) -{ - struct in_device *in_dev; - const char *ifname; - struct iface_stat *entry; - struct iface_stat *new_iface; - int addr_type; - - IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n", - ifa, net_dev, net_dev ? net_dev->name : ""); - if (!net_dev) { - pr_err("qtaguid: iface_stat: create6(): no net dev!\n"); - return; - } - ifname = net_dev->name; - - in_dev = in_dev_get(net_dev); - if (!in_dev) { - pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n", - ifname); - return; - } - - IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n", - ifname, in_dev); - - if (!ifa) { - IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n", - ifname); - goto done_put; - } - addr_type = ipv6_addr_type(&ifa->addr); - - spin_lock_bh(&iface_stat_list_lock); - entry = get_iface_entry(ifname); - if (entry != NULL) { - IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, - ifname, entry); - iface_check_stats_reset_and_adjust(net_dev, entry); - _iface_stat_set_active(entry, net_dev, true); - IF_DEBUG("qtaguid: %s(%s): " - "tracking now %d on ip=%pI6c\n", __func__, - entry->ifname, true, &ifa->addr); - goto done_unlock_put; - } - - new_iface = iface_alloc(net_dev); - IF_DEBUG("qtaguid: iface_stat: create6(%s): done " - "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr); - -done_unlock_put: - spin_unlock_bh(&iface_stat_list_lock); -done_put: - in_dev_put(in_dev); -} - -static struct sock_tag *get_sock_stat_nl(const struct sock *sk) -{ - MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk); - return sock_tag_tree_search(&sock_tag_tree, sk); -} - -static struct sock_tag *get_sock_stat(const struct sock *sk) -{ - struct sock_tag *sock_tag_entry; - MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk); - if (!sk) - return NULL; - spin_lock_bh(&sock_tag_list_lock); - sock_tag_entry = get_sock_stat_nl(sk); - spin_unlock_bh(&sock_tag_list_lock); - return sock_tag_entry; -} - -static int ipx_proto(const struct sk_buff *skb, - struct xt_action_param *par) -{ - int thoff = 0, tproto; - - switch (par->family) { - case NFPROTO_IPV6: - tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); - if (tproto < 0) - MT_DEBUG("%s(): transport header not found in ipv6" - " skb=%p\n", __func__, skb); - break; - case NFPROTO_IPV4: - tproto = ip_hdr(skb)->protocol; - break; - default: - tproto = IPPROTO_RAW; - } - return tproto; -} - -static void -data_counters_update(struct data_counters *dc, int set, - enum ifs_tx_rx direction, int proto, int bytes) -{ - switch (proto) { - case IPPROTO_TCP: - dc_add_byte_packets(dc, set, direction, IFS_TCP, bytes, 1); - break; - case IPPROTO_UDP: - dc_add_byte_packets(dc, set, direction, IFS_UDP, bytes, 1); - break; - case IPPROTO_IP: - default: - dc_add_byte_packets(dc, set, direction, IFS_PROTO_OTHER, bytes, - 1); - break; - } -} - -/* - * Update stats for the specified interface. Do nothing if the entry - * does not exist (when a device was never configured with an IP address). - * Called when an device is being unregistered. - */ -static void iface_stat_update(struct net_device *net_dev, bool stash_only) -{ - struct rtnl_link_stats64 dev_stats, *stats; - struct iface_stat *entry; - - stats = dev_get_stats(net_dev, &dev_stats); - spin_lock_bh(&iface_stat_list_lock); - entry = get_iface_entry(net_dev->name); - if (entry == NULL) { - IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n", - net_dev->name); - spin_unlock_bh(&iface_stat_list_lock); - return; - } - - IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, - net_dev->name, entry); - if (!entry->active) { - IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__, - net_dev->name); - spin_unlock_bh(&iface_stat_list_lock); - return; - } - - if (stash_only) { - entry->last_known[IFS_TX].bytes = stats->tx_bytes; - entry->last_known[IFS_TX].packets = stats->tx_packets; - entry->last_known[IFS_RX].bytes = stats->rx_bytes; - entry->last_known[IFS_RX].packets = stats->rx_packets; - entry->last_known_valid = true; - IF_DEBUG("qtaguid: %s(%s): " - "dev stats stashed rx/tx=%llu/%llu\n", __func__, - net_dev->name, stats->rx_bytes, stats->tx_bytes); - spin_unlock_bh(&iface_stat_list_lock); - return; - } - entry->totals_via_dev[IFS_TX].bytes += stats->tx_bytes; - entry->totals_via_dev[IFS_TX].packets += stats->tx_packets; - entry->totals_via_dev[IFS_RX].bytes += stats->rx_bytes; - entry->totals_via_dev[IFS_RX].packets += stats->rx_packets; - /* We don't need the last_known[] anymore */ - entry->last_known_valid = false; - _iface_stat_set_active(entry, net_dev, false); - IF_DEBUG("qtaguid: %s(%s): " - "disable tracking. rx/tx=%llu/%llu\n", __func__, - net_dev->name, stats->rx_bytes, stats->tx_bytes); - spin_unlock_bh(&iface_stat_list_lock); -} - -/* Guarantied to return a net_device that has a name */ -static void get_dev_and_dir(const struct sk_buff *skb, - struct xt_action_param *par, - enum ifs_tx_rx *direction, - const struct net_device **el_dev) -{ - BUG_ON(!direction || !el_dev); - - if (par->in) { - *el_dev = par->in; - *direction = IFS_RX; - } else if (par->out) { - *el_dev = par->out; - *direction = IFS_TX; - } else { - pr_err("qtaguid[%d]: %s(): no par->in/out?!!\n", - par->hooknum, __func__); - BUG(); - } - if (skb->dev && *el_dev != skb->dev) { - MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs par->%s=%p %s\n", - par->hooknum, skb->dev, skb->dev->name, - *direction == IFS_RX ? "in" : "out", *el_dev, - (*el_dev)->name); - } -} - -/* - * Update stats for the specified interface from the skb. - * Do nothing if the entry - * does not exist (when a device was never configured with an IP address). - * Called on each sk. - */ -static void iface_stat_update_from_skb(const struct sk_buff *skb, - struct xt_action_param *par) -{ - struct iface_stat *entry; - const struct net_device *el_dev; - enum ifs_tx_rx direction; - int bytes = skb->len; - int proto; - - get_dev_and_dir(skb, par, &direction, &el_dev); - proto = ipx_proto(skb, par); - MT_DEBUG("qtaguid[%d]: iface_stat: %s(%s): " - "type=%d fam=%d proto=%d dir=%d\n", - par->hooknum, __func__, el_dev->name, el_dev->type, - par->family, proto, direction); - - spin_lock_bh(&iface_stat_list_lock); - entry = get_iface_entry(el_dev->name); - if (entry == NULL) { - IF_DEBUG("qtaguid[%d]: iface_stat: %s(%s): not tracked\n", - par->hooknum, __func__, el_dev->name); - spin_unlock_bh(&iface_stat_list_lock); - return; - } - - IF_DEBUG("qtaguid[%d]: %s(%s): entry=%p\n", par->hooknum, __func__, - el_dev->name, entry); - - data_counters_update(&entry->totals_via_skb, 0, direction, proto, - bytes); - spin_unlock_bh(&iface_stat_list_lock); -} - -static void tag_stat_update(struct tag_stat *tag_entry, - enum ifs_tx_rx direction, int proto, int bytes) -{ - int active_set; - active_set = get_active_counter_set(tag_entry->tn.tag); - MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d " - "dir=%d proto=%d bytes=%d)\n", - tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag), - active_set, direction, proto, bytes); - data_counters_update(&tag_entry->counters, active_set, direction, - proto, bytes); - if (tag_entry->parent_counters) - data_counters_update(tag_entry->parent_counters, active_set, - direction, proto, bytes); -} - -/* - * Create a new entry for tracking the specified {acct_tag,uid_tag} within - * the interface. - * iface_entry->tag_stat_list_lock should be held. - */ -static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry, - tag_t tag) -{ - struct tag_stat *new_tag_stat_entry = NULL; - IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx" - " (uid=%u)\n", __func__, - iface_entry, tag, get_uid_from_tag(tag)); - new_tag_stat_entry = kzalloc(sizeof(*new_tag_stat_entry), GFP_ATOMIC); - if (!new_tag_stat_entry) { - pr_err("qtaguid: iface_stat: tag stat alloc failed\n"); - goto done; - } - new_tag_stat_entry->tn.tag = tag; - tag_stat_tree_insert(new_tag_stat_entry, &iface_entry->tag_stat_tree); -done: - return new_tag_stat_entry; -} - -static void if_tag_stat_update(const char *ifname, uid_t uid, - const struct sock *sk, enum ifs_tx_rx direction, - int proto, int bytes) -{ - struct tag_stat *tag_stat_entry; - tag_t tag, acct_tag; - tag_t uid_tag; - struct data_counters *uid_tag_counters; - struct sock_tag *sock_tag_entry; - struct iface_stat *iface_entry; - struct tag_stat *new_tag_stat = NULL; - MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s " - "uid=%u sk=%p dir=%d proto=%d bytes=%d)\n", - ifname, uid, sk, direction, proto, bytes); - - spin_lock_bh(&iface_stat_list_lock); - iface_entry = get_iface_entry(ifname); - if (!iface_entry) { - pr_err_ratelimited("qtaguid: tag_stat: stat_update() " - "%s not found\n", ifname); - spin_unlock_bh(&iface_stat_list_lock); - return; - } - /* It is ok to process data when an iface_entry is inactive */ - - MT_DEBUG("qtaguid: tag_stat: stat_update() dev=%s entry=%p\n", - ifname, iface_entry); - - /* - * Look for a tagged sock. - * It will have an acct_uid. - */ - sock_tag_entry = get_sock_stat(sk); - if (sock_tag_entry) { - tag = sock_tag_entry->tag; - acct_tag = get_atag_from_tag(tag); - uid_tag = get_utag_from_tag(tag); - } else { - acct_tag = make_atag_from_value(0); - tag = combine_atag_with_uid(acct_tag, uid); - uid_tag = make_tag_from_uid(uid); - } - MT_DEBUG("qtaguid: tag_stat: stat_update(): " - " looking for tag=0x%llx (uid=%u) in ife=%p\n", - tag, get_uid_from_tag(tag), iface_entry); - /* Loop over tag list under this interface for {acct_tag,uid_tag} */ - spin_lock_bh(&iface_entry->tag_stat_list_lock); - - tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree, - tag); - if (tag_stat_entry) { - /* - * Updating the {acct_tag, uid_tag} entry handles both stats: - * {0, uid_tag} will also get updated. - */ - tag_stat_update(tag_stat_entry, direction, proto, bytes); - goto unlock; - } - - /* Loop over tag list under this interface for {0,uid_tag} */ - tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree, - uid_tag); - if (!tag_stat_entry) { - /* Here: the base uid_tag did not exist */ - /* - * No parent counters. So - * - No {0, uid_tag} stats and no {acc_tag, uid_tag} stats. - */ - new_tag_stat = create_if_tag_stat(iface_entry, uid_tag); - if (!new_tag_stat) - goto unlock; - uid_tag_counters = &new_tag_stat->counters; - } else { - uid_tag_counters = &tag_stat_entry->counters; - } - - if (acct_tag) { - /* Create the child {acct_tag, uid_tag} and hook up parent. */ - new_tag_stat = create_if_tag_stat(iface_entry, tag); - if (!new_tag_stat) - goto unlock; - new_tag_stat->parent_counters = uid_tag_counters; - } else { - /* - * For new_tag_stat to be still NULL here would require: - * {0, uid_tag} exists - * and {acct_tag, uid_tag} doesn't exist - * AND acct_tag == 0. - * Impossible. This reassures us that new_tag_stat - * below will always be assigned. - */ - BUG_ON(!new_tag_stat); - } - tag_stat_update(new_tag_stat, direction, proto, bytes); -unlock: - spin_unlock_bh(&iface_entry->tag_stat_list_lock); - spin_unlock_bh(&iface_stat_list_lock); -} - -static int iface_netdev_event_handler(struct notifier_block *nb, - unsigned long event, void *ptr) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (unlikely(module_passive)) - return NOTIFY_DONE; - - IF_DEBUG("qtaguid: iface_stat: netdev_event(): " - "ev=0x%lx/%s netdev=%p->name=%s\n", - event, netdev_evt_str(event), dev, dev ? dev->name : ""); - - switch (event) { - case NETDEV_UP: - iface_stat_create(dev, NULL); - atomic64_inc(&qtu_events.iface_events); - break; - case NETDEV_DOWN: - case NETDEV_UNREGISTER: - iface_stat_update(dev, event == NETDEV_DOWN); - atomic64_inc(&qtu_events.iface_events); - break; - } - return NOTIFY_DONE; -} - -static int iface_inet6addr_event_handler(struct notifier_block *nb, - unsigned long event, void *ptr) -{ - struct inet6_ifaddr *ifa = ptr; - struct net_device *dev; - - if (unlikely(module_passive)) - return NOTIFY_DONE; - - IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): " - "ev=0x%lx/%s ifa=%p\n", - event, netdev_evt_str(event), ifa); - - switch (event) { - case NETDEV_UP: - BUG_ON(!ifa || !ifa->idev); - dev = (struct net_device *)ifa->idev->dev; - iface_stat_create_ipv6(dev, ifa); - atomic64_inc(&qtu_events.iface_events); - break; - case NETDEV_DOWN: - case NETDEV_UNREGISTER: - BUG_ON(!ifa || !ifa->idev); - dev = (struct net_device *)ifa->idev->dev; - iface_stat_update(dev, event == NETDEV_DOWN); - atomic64_inc(&qtu_events.iface_events); - break; - } - return NOTIFY_DONE; -} - -static int iface_inetaddr_event_handler(struct notifier_block *nb, - unsigned long event, void *ptr) -{ - struct in_ifaddr *ifa = ptr; - struct net_device *dev; - - if (unlikely(module_passive)) - return NOTIFY_DONE; - - IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): " - "ev=0x%lx/%s ifa=%p\n", - event, netdev_evt_str(event), ifa); - - switch (event) { - case NETDEV_UP: - BUG_ON(!ifa || !ifa->ifa_dev); - dev = ifa->ifa_dev->dev; - iface_stat_create(dev, ifa); - atomic64_inc(&qtu_events.iface_events); - break; - case NETDEV_DOWN: - case NETDEV_UNREGISTER: - BUG_ON(!ifa || !ifa->ifa_dev); - dev = ifa->ifa_dev->dev; - iface_stat_update(dev, event == NETDEV_DOWN); - atomic64_inc(&qtu_events.iface_events); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block iface_netdev_notifier_blk = { - .notifier_call = iface_netdev_event_handler, -}; - -static struct notifier_block iface_inetaddr_notifier_blk = { - .notifier_call = iface_inetaddr_event_handler, -}; - -static struct notifier_block iface_inet6addr_notifier_blk = { - .notifier_call = iface_inet6addr_event_handler, -}; - -static const struct seq_operations iface_stat_fmt_proc_seq_ops = { - .start = iface_stat_fmt_proc_start, - .next = iface_stat_fmt_proc_next, - .stop = iface_stat_fmt_proc_stop, - .show = iface_stat_fmt_proc_show, -}; - -static int proc_iface_stat_fmt_open(struct inode *inode, struct file *file) -{ - struct proc_iface_stat_fmt_info *s; - - s = __seq_open_private(file, &iface_stat_fmt_proc_seq_ops, - sizeof(struct proc_iface_stat_fmt_info)); - if (!s) - return -ENOMEM; - - s->fmt = (uintptr_t)PDE_DATA(inode); - return 0; -} - -static const struct file_operations proc_iface_stat_fmt_fops = { - .open = proc_iface_stat_fmt_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -static int __init iface_stat_init(struct proc_dir_entry *parent_procdir) -{ - int err; - - iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir); - if (!iface_stat_procdir) { - pr_err("qtaguid: iface_stat: init failed to create proc entry\n"); - err = -1; - goto err; - } - - iface_stat_all_procfile = proc_create_data(iface_stat_all_procfilename, - proc_iface_perms, - parent_procdir, - &proc_iface_stat_fmt_fops, - (void *)1 /* fmt1 */); - if (!iface_stat_all_procfile) { - pr_err("qtaguid: iface_stat: init " - " failed to create stat_old proc entry\n"); - err = -1; - goto err_zap_entry; - } - - iface_stat_fmt_procfile = proc_create_data(iface_stat_fmt_procfilename, - proc_iface_perms, - parent_procdir, - &proc_iface_stat_fmt_fops, - (void *)2 /* fmt2 */); - if (!iface_stat_fmt_procfile) { - pr_err("qtaguid: iface_stat: init " - " failed to create stat_all proc entry\n"); - err = -1; - goto err_zap_all_stats_entry; - } - - - err = register_netdevice_notifier(&iface_netdev_notifier_blk); - if (err) { - pr_err("qtaguid: iface_stat: init " - "failed to register dev event handler\n"); - goto err_zap_all_stats_entries; - } - err = register_inetaddr_notifier(&iface_inetaddr_notifier_blk); - if (err) { - pr_err("qtaguid: iface_stat: init " - "failed to register ipv4 dev event handler\n"); - goto err_unreg_nd; - } - - err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk); - if (err) { - pr_err("qtaguid: iface_stat: init " - "failed to register ipv6 dev event handler\n"); - goto err_unreg_ip4_addr; - } - return 0; - -err_unreg_ip4_addr: - unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk); -err_unreg_nd: - unregister_netdevice_notifier(&iface_netdev_notifier_blk); -err_zap_all_stats_entries: - remove_proc_entry(iface_stat_fmt_procfilename, parent_procdir); -err_zap_all_stats_entry: - remove_proc_entry(iface_stat_all_procfilename, parent_procdir); -err_zap_entry: - remove_proc_entry(iface_stat_procdirname, parent_procdir); -err: - return err; -} - -static struct sock *qtaguid_find_sk(const struct sk_buff *skb, - struct xt_action_param *par) -{ - struct sock *sk; - unsigned int hook_mask = (1 << par->hooknum); - - MT_DEBUG("qtaguid[%d]: find_sk(skb=%p) family=%d\n", - par->hooknum, skb, par->family); - - /* - * Let's not abuse the the xt_socket_get*_sk(), or else it will - * return garbage SKs. - */ - if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS)) - return NULL; - - switch (par->family) { - case NFPROTO_IPV6: - sk = xt_socket_lookup_slow_v6(dev_net(skb->dev), skb, par->in); - break; - case NFPROTO_IPV4: - sk = xt_socket_lookup_slow_v4(dev_net(skb->dev), skb, par->in); - break; - default: - return NULL; - } - - if (sk) { - MT_DEBUG("qtaguid[%d]: %p->sk_proto=%u->sk_state=%d\n", - par->hooknum, sk, sk->sk_protocol, sk->sk_state); - /* - * When in TCP_TIME_WAIT the sk is not a "struct sock" but - * "struct inet_timewait_sock" which is missing fields. - */ - if (!sk_fullsock(sk) || sk->sk_state == TCP_TIME_WAIT) { - sock_gen_put(sk); - sk = NULL; - } - } - return sk; -} - -static void account_for_uid(const struct sk_buff *skb, - const struct sock *alternate_sk, uid_t uid, - struct xt_action_param *par) -{ - const struct net_device *el_dev; - enum ifs_tx_rx direction; - int proto; - - get_dev_and_dir(skb, par, &direction, &el_dev); - proto = ipx_proto(skb, par); - MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d dir=%d\n", - par->hooknum, el_dev->name, el_dev->type, - par->family, proto, direction); - - if_tag_stat_update(el_dev->name, uid, - skb->sk ? skb->sk : alternate_sk, - direction, - proto, skb->len); -} - -static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) -{ - const struct xt_qtaguid_match_info *info = par->matchinfo; - const struct file *filp; - bool got_sock = false; - struct sock *sk; - kuid_t sock_uid; - bool res; - bool set_sk_callback_lock = false; - /* - * TODO: unhack how to force just accounting. - * For now we only do tag stats when the uid-owner is not requested - */ - bool do_tag_stat = !(info->match & XT_QTAGUID_UID); - - if (unlikely(module_passive)) - return (info->match ^ info->invert) == 0; - - MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n", - par->hooknum, skb, par->in, par->out, par->family); - - atomic64_inc(&qtu_events.match_calls); - if (skb == NULL) { - res = (info->match ^ info->invert) == 0; - goto ret_res; - } - - switch (par->hooknum) { - case NF_INET_PRE_ROUTING: - case NF_INET_POST_ROUTING: - atomic64_inc(&qtu_events.match_calls_prepost); - iface_stat_update_from_skb(skb, par); - /* - * We are done in pre/post. The skb will get processed - * further alter. - */ - res = (info->match ^ info->invert); - goto ret_res; - break; - /* default: Fall through and do UID releated work */ - } - - sk = skb_to_full_sk(skb); - /* - * When in TCP_TIME_WAIT the sk is not a "struct sock" but - * "struct inet_timewait_sock" which is missing fields. - * So we ignore it. - */ - if (sk && sk->sk_state == TCP_TIME_WAIT) - sk = NULL; - if (sk == NULL) { - /* - * A missing sk->sk_socket happens when packets are in-flight - * and the matching socket is already closed and gone. - */ - sk = qtaguid_find_sk(skb, par); - /* - * If we got the socket from the find_sk(), we will need to put - * it back, as nf_tproxy_get_sock_v4() got it. - */ - got_sock = sk; - if (sk) - atomic64_inc(&qtu_events.match_found_sk_in_ct); - else - atomic64_inc(&qtu_events.match_found_no_sk_in_ct); - } else { - atomic64_inc(&qtu_events.match_found_sk); - } - MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", - par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); - - if (!sk) { - /* - * Here, the qtaguid_find_sk() using connection tracking - * couldn't find the owner, so for now we just count them - * against the system. - */ - if (do_tag_stat) - account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); - res = (info->match ^ info->invert) == 0; - atomic64_inc(&qtu_events.match_no_sk); - goto put_sock_ret_res; - } else if (info->match & info->invert & XT_QTAGUID_SOCKET) { - res = false; - goto put_sock_ret_res; - } - sock_uid = sk->sk_uid; - if (do_tag_stat) - account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), - par); - - /* - * The following two tests fail the match when: - * id not in range AND no inverted condition requested - * or id in range AND inverted condition requested - * Thus (!a && b) || (a && !b) == a ^ b - */ - if (info->match & XT_QTAGUID_UID) { - kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); - kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); - - if ((uid_gte(sock_uid, uid_min) && - uid_lte(sock_uid, uid_max)) ^ - !(info->invert & XT_QTAGUID_UID)) { - MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", - par->hooknum); - res = false; - goto put_sock_ret_res; - } - } - if (info->match & XT_QTAGUID_GID) { - kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); - kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); - set_sk_callback_lock = true; - read_lock_bh(&sk->sk_callback_lock); - MT_DEBUG("qtaguid[%d]: sk=%pK->sk_socket=%pK->file=%pK\n", - par->hooknum, sk, sk->sk_socket, - sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - if (!filp) { - res = ((info->match ^ info->invert) & - XT_QTAGUID_GID) == 0; - atomic64_inc(&qtu_events.match_no_sk_gid); - goto put_sock_ret_res; - } - MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? - from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); - if ((gid_gte(filp->f_cred->fsgid, gid_min) && - gid_lte(filp->f_cred->fsgid, gid_max)) ^ - !(info->invert & XT_QTAGUID_GID)) { - MT_DEBUG("qtaguid[%d]: leaving gid not matching\n", - par->hooknum); - res = false; - goto put_sock_ret_res; - } - } - MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum); - res = true; - -put_sock_ret_res: - if (got_sock) - sock_gen_put(sk); - if (set_sk_callback_lock) - read_unlock_bh(&sk->sk_callback_lock); -ret_res: - MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res); - return res; -} - -#ifdef DDEBUG -/* - * This function is not in xt_qtaguid_print.c because of locks visibility. - * The lock of sock_tag_list must be aquired before calling this function - */ -static void prdebug_full_state_locked(int indent_level, const char *fmt, ...) -{ - va_list args; - char *fmt_buff; - char *buff; - - if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) - return; - - fmt_buff = kasprintf(GFP_ATOMIC, - "qtaguid: %s(): %s {\n", __func__, fmt); - BUG_ON(!fmt_buff); - va_start(args, fmt); - buff = kvasprintf(GFP_ATOMIC, - fmt_buff, args); - BUG_ON(!buff); - pr_debug("%s", buff); - kfree(fmt_buff); - kfree(buff); - va_end(args); - - prdebug_sock_tag_tree(indent_level, &sock_tag_tree); - - spin_lock_bh(&uid_tag_data_tree_lock); - prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree); - prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree); - spin_unlock_bh(&uid_tag_data_tree_lock); - - spin_lock_bh(&iface_stat_list_lock); - prdebug_iface_stat_list(indent_level, &iface_stat_list); - spin_unlock_bh(&iface_stat_list_lock); - - pr_debug("qtaguid: %s(): }\n", __func__); -} -#else -static void prdebug_full_state_locked(int indent_level, const char *fmt, ...) {} -#endif - -struct proc_ctrl_print_info { - struct sock *sk; /* socket found by reading to sk_pos */ - loff_t sk_pos; -}; - -static void *qtaguid_ctrl_proc_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct proc_ctrl_print_info *pcpi = m->private; - struct sock_tag *sock_tag_entry = v; - struct rb_node *node; - - (*pos)++; - - if (!v || v == SEQ_START_TOKEN) - return NULL; - - node = rb_next(&sock_tag_entry->sock_node); - if (!node) { - pcpi->sk = NULL; - sock_tag_entry = SEQ_START_TOKEN; - } else { - sock_tag_entry = rb_entry(node, struct sock_tag, sock_node); - pcpi->sk = sock_tag_entry->sk; - } - pcpi->sk_pos = *pos; - return sock_tag_entry; -} - -static void *qtaguid_ctrl_proc_start(struct seq_file *m, loff_t *pos) -{ - struct proc_ctrl_print_info *pcpi = m->private; - struct sock_tag *sock_tag_entry; - struct rb_node *node; - - spin_lock_bh(&sock_tag_list_lock); - - if (unlikely(module_passive)) - return NULL; - - if (*pos == 0) { - pcpi->sk_pos = 0; - node = rb_first(&sock_tag_tree); - if (!node) { - pcpi->sk = NULL; - return SEQ_START_TOKEN; - } - sock_tag_entry = rb_entry(node, struct sock_tag, sock_node); - pcpi->sk = sock_tag_entry->sk; - } else { - sock_tag_entry = (pcpi->sk ? get_sock_stat_nl(pcpi->sk) : - NULL) ?: SEQ_START_TOKEN; - if (*pos != pcpi->sk_pos) { - /* seq_read skipped a next call */ - *pos = pcpi->sk_pos; - return qtaguid_ctrl_proc_next(m, sock_tag_entry, pos); - } - } - return sock_tag_entry; -} - -static void qtaguid_ctrl_proc_stop(struct seq_file *m, void *v) -{ - spin_unlock_bh(&sock_tag_list_lock); -} - -/* - * Procfs reader to get all active socket tags using style "1)" as described in - * fs/proc/generic.c - */ -static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) -{ - struct sock_tag *sock_tag_entry = v; - uid_t uid; - - CT_DEBUG("qtaguid: proc ctrl pid=%u tgid=%u uid=%u\n", - current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - - if (sock_tag_entry != SEQ_START_TOKEN) { - int sk_ref_count; - uid = get_uid_from_tag(sock_tag_entry->tag); - CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) " - "pid=%u\n", - sock_tag_entry->sk, - sock_tag_entry->tag, - uid, - sock_tag_entry->pid - ); - sk_ref_count = atomic_read( - &sock_tag_entry->sk->sk_refcnt); - seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u " - "f_count=%d\n", - sock_tag_entry->sk, - sock_tag_entry->tag, uid, - sock_tag_entry->pid, sk_ref_count); - } else { - seq_printf(m, "events: sockets_tagged=%llu " - "sockets_untagged=%llu " - "counter_set_changes=%llu " - "delete_cmds=%llu " - "iface_events=%llu " - "match_calls=%llu " - "match_calls_prepost=%llu " - "match_found_sk=%llu " - "match_found_sk_in_ct=%llu " - "match_found_no_sk_in_ct=%llu " - "match_no_sk=%llu " - "match_no_sk_gid=%llu\n", - (u64)atomic64_read(&qtu_events.sockets_tagged), - (u64)atomic64_read(&qtu_events.sockets_untagged), - (u64)atomic64_read(&qtu_events.counter_set_changes), - (u64)atomic64_read(&qtu_events.delete_cmds), - (u64)atomic64_read(&qtu_events.iface_events), - (u64)atomic64_read(&qtu_events.match_calls), - (u64)atomic64_read(&qtu_events.match_calls_prepost), - (u64)atomic64_read(&qtu_events.match_found_sk), - (u64)atomic64_read(&qtu_events.match_found_sk_in_ct), - (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct), - (u64)atomic64_read(&qtu_events.match_no_sk), - (u64)atomic64_read(&qtu_events.match_no_sk_gid)); - - /* Count the following as part of the last item_index. No need - * to lock the sock_tag_list here since it is already locked when - * starting the seq_file operation - */ - prdebug_full_state_locked(0, "proc ctrl"); - } - - return 0; -} - -/* - * Delete socket tags, and stat tags associated with a given - * accouting tag and uid. - */ -static int ctrl_cmd_delete(const char *input) -{ - char cmd; - int uid_int; - kuid_t uid; - uid_t entry_uid; - tag_t acct_tag; - tag_t tag; - int res, argc; - struct iface_stat *iface_entry; - struct rb_node *node; - struct sock_tag *st_entry; - struct rb_root st_to_free_tree = RB_ROOT; - struct tag_stat *ts_entry; - struct tag_counter_set *tcs_entry; - struct tag_ref *tr_entry; - struct uid_tag_data *utd_entry; - - argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid_int); - uid = make_kuid(&init_user_ns, uid_int); - CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c " - "user_tag=0x%llx uid=%u\n", input, argc, cmd, - acct_tag, uid_int); - if (argc < 2) { - res = -EINVAL; - goto err; - } - if (!valid_atag(acct_tag)) { - pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input); - res = -EINVAL; - goto err; - } - if (argc < 3) { - uid = current_fsuid(); - uid_int = from_kuid(&init_user_ns, uid); - } else if (!can_impersonate_uid(uid)) { - pr_info("qtaguid: ctrl_delete(%s): " - "insufficient priv from pid=%u tgid=%u uid=%u\n", - input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - res = -EPERM; - goto err; - } - - tag = combine_atag_with_uid(acct_tag, uid_int); - CT_DEBUG("qtaguid: ctrl_delete(%s): " - "looking for tag=0x%llx (uid=%u)\n", - input, tag, uid_int); - - /* Delete socket tags */ - spin_lock_bh(&sock_tag_list_lock); - spin_lock_bh(&uid_tag_data_tree_lock); - node = rb_first(&sock_tag_tree); - while (node) { - st_entry = rb_entry(node, struct sock_tag, sock_node); - entry_uid = get_uid_from_tag(st_entry->tag); - node = rb_next(node); - if (entry_uid != uid_int) - continue; - - CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n", - input, st_entry->tag, entry_uid); - - if (!acct_tag || st_entry->tag == tag) { - rb_erase(&st_entry->sock_node, &sock_tag_tree); - /* Can't sockfd_put() within spinlock, do it later. */ - sock_tag_tree_insert(st_entry, &st_to_free_tree); - tr_entry = lookup_tag_ref(st_entry->tag, NULL); - BUG_ON(tr_entry->num_sock_tags <= 0); - tr_entry->num_sock_tags--; - /* - * TODO: remove if, and start failing. - * This is a hack to work around the fact that in some - * places we have "if (IS_ERR_OR_NULL(pqd_entry))" - * and are trying to work around apps - * that didn't open the /dev/xt_qtaguid. - */ - if (st_entry->list.next && st_entry->list.prev) - list_del(&st_entry->list); - } - } - spin_unlock_bh(&uid_tag_data_tree_lock); - spin_unlock_bh(&sock_tag_list_lock); - - sock_tag_tree_erase(&st_to_free_tree); - - /* Delete tag counter-sets */ - spin_lock_bh(&tag_counter_set_list_lock); - /* Counter sets are only on the uid tag, not full tag */ - tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag); - if (tcs_entry) { - CT_DEBUG("qtaguid: ctrl_delete(%s): " - "erase tcs: tag=0x%llx (uid=%u) set=%d\n", - input, - tcs_entry->tn.tag, - get_uid_from_tag(tcs_entry->tn.tag), - tcs_entry->active_set); - rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree); - kfree(tcs_entry); - } - spin_unlock_bh(&tag_counter_set_list_lock); - - /* - * If acct_tag is 0, then all entries belonging to uid are - * erased. - */ - spin_lock_bh(&iface_stat_list_lock); - list_for_each_entry(iface_entry, &iface_stat_list, list) { - spin_lock_bh(&iface_entry->tag_stat_list_lock); - node = rb_first(&iface_entry->tag_stat_tree); - while (node) { - ts_entry = rb_entry(node, struct tag_stat, tn.node); - entry_uid = get_uid_from_tag(ts_entry->tn.tag); - node = rb_next(node); - - CT_DEBUG("qtaguid: ctrl_delete(%s): " - "ts tag=0x%llx (uid=%u)\n", - input, ts_entry->tn.tag, entry_uid); - - if (entry_uid != uid_int) - continue; - if (!acct_tag || ts_entry->tn.tag == tag) { - CT_DEBUG("qtaguid: ctrl_delete(%s): " - "erase ts: %s 0x%llx %u\n", - input, iface_entry->ifname, - get_atag_from_tag(ts_entry->tn.tag), - entry_uid); - rb_erase(&ts_entry->tn.node, - &iface_entry->tag_stat_tree); - kfree(ts_entry); - } - } - spin_unlock_bh(&iface_entry->tag_stat_list_lock); - } - spin_unlock_bh(&iface_stat_list_lock); - - /* Cleanup the uid_tag_data */ - spin_lock_bh(&uid_tag_data_tree_lock); - node = rb_first(&uid_tag_data_tree); - while (node) { - utd_entry = rb_entry(node, struct uid_tag_data, node); - entry_uid = utd_entry->uid; - node = rb_next(node); - - CT_DEBUG("qtaguid: ctrl_delete(%s): " - "utd uid=%u\n", - input, entry_uid); - - if (entry_uid != uid_int) - continue; - /* - * Go over the tag_refs, and those that don't have - * sock_tags using them are freed. - */ - put_tag_ref_tree(tag, utd_entry); - put_utd_entry(utd_entry); - } - spin_unlock_bh(&uid_tag_data_tree_lock); - - atomic64_inc(&qtu_events.delete_cmds); - res = 0; - -err: - return res; -} - -static int ctrl_cmd_counter_set(const char *input) -{ - char cmd; - uid_t uid = 0; - tag_t tag; - int res, argc; - struct tag_counter_set *tcs; - int counter_set; - - argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid); - CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c " - "set=%d uid=%u\n", input, argc, cmd, - counter_set, uid); - if (argc != 3) { - res = -EINVAL; - goto err; - } - if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) { - pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n", - input); - res = -EINVAL; - goto err; - } - if (!can_manipulate_uids()) { - pr_info("qtaguid: ctrl_counterset(%s): " - "insufficient priv from pid=%u tgid=%u uid=%u\n", - input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - res = -EPERM; - goto err; - } - - tag = make_tag_from_uid(uid); - spin_lock_bh(&tag_counter_set_list_lock); - tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag); - if (!tcs) { - tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC); - if (!tcs) { - spin_unlock_bh(&tag_counter_set_list_lock); - pr_err("qtaguid: ctrl_counterset(%s): " - "failed to alloc counter set\n", - input); - res = -ENOMEM; - goto err; - } - tcs->tn.tag = tag; - tag_counter_set_tree_insert(tcs, &tag_counter_set_tree); - CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx " - "(uid=%u) set=%d\n", - input, tag, get_uid_from_tag(tag), counter_set); - } - tcs->active_set = counter_set; - spin_unlock_bh(&tag_counter_set_list_lock); - atomic64_inc(&qtu_events.counter_set_changes); - res = 0; - -err: - return res; -} - -static int ctrl_cmd_tag(const char *input) -{ - char cmd; - int sock_fd = 0; - kuid_t uid; - unsigned int uid_int = 0; - tag_t acct_tag = make_atag_from_value(0); - tag_t full_tag; - struct socket *el_socket; - int res, argc; - struct sock_tag *sock_tag_entry; - struct tag_ref *tag_ref_entry; - struct uid_tag_data *uid_tag_data_entry; - struct proc_qtu_data *pqd_entry; - - /* Unassigned args will get defaulted later. */ - argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid_int); - uid = make_kuid(&init_user_ns, uid_int); - CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d " - "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd, - acct_tag, uid_int); - if (argc < 2) { - res = -EINVAL; - goto err; - } - el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */ - if (!el_socket) { - pr_info("qtaguid: ctrl_tag(%s): failed to lookup" - " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n", - input, sock_fd, res, current->pid, current->tgid, - from_kuid(&init_user_ns, current_fsuid())); - goto err; - } - CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->sk_refcnt=%d ->sk=%p\n", - input, atomic_read(&el_socket->sk->sk_refcnt), - el_socket->sk); - if (argc < 3) { - acct_tag = make_atag_from_value(0); - } else if (!valid_atag(acct_tag)) { - pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input); - res = -EINVAL; - goto err_put; - } - CT_DEBUG("qtaguid: ctrl_tag(%s): " - "pid=%u tgid=%u uid=%u euid=%u fsuid=%u " - "ctrl.gid=%u in_group()=%d in_egroup()=%d\n", - input, current->pid, current->tgid, - from_kuid(&init_user_ns, current_uid()), - from_kuid(&init_user_ns, current_euid()), - from_kuid(&init_user_ns, current_fsuid()), - from_kgid(&init_user_ns, xt_qtaguid_ctrl_file->gid), - in_group_p(xt_qtaguid_ctrl_file->gid), - in_egroup_p(xt_qtaguid_ctrl_file->gid)); - if (argc < 4) { - uid = current_fsuid(); - uid_int = from_kuid(&init_user_ns, uid); - } else if (!can_impersonate_uid(uid)) { - pr_info("qtaguid: ctrl_tag(%s): " - "insufficient priv from pid=%u tgid=%u uid=%u\n", - input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - res = -EPERM; - goto err_put; - } - full_tag = combine_atag_with_uid(acct_tag, uid_int); - - spin_lock_bh(&sock_tag_list_lock); - spin_lock_bh(&uid_tag_data_tree_lock); - sock_tag_entry = get_sock_stat_nl(el_socket->sk); - tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry); - if (IS_ERR(tag_ref_entry)) { - res = PTR_ERR(tag_ref_entry); - spin_unlock_bh(&uid_tag_data_tree_lock); - spin_unlock_bh(&sock_tag_list_lock); - goto err_put; - } - tag_ref_entry->num_sock_tags++; - if (sock_tag_entry) { - struct tag_ref *prev_tag_ref_entry; - - CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p " - "st@%p ...->sk_refcnt=%d\n", - input, el_socket->sk, sock_tag_entry, - atomic_read(&el_socket->sk->sk_refcnt)); - prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, - &uid_tag_data_entry); - BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry)); - BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0); - prev_tag_ref_entry->num_sock_tags--; - sock_tag_entry->tag = full_tag; - } else { - CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n", - input, el_socket->sk); - sock_tag_entry = kzalloc(sizeof(*sock_tag_entry), - GFP_ATOMIC); - if (!sock_tag_entry) { - pr_err("qtaguid: ctrl_tag(%s): " - "socket tag alloc failed\n", - input); - BUG_ON(tag_ref_entry->num_sock_tags <= 0); - tag_ref_entry->num_sock_tags--; - free_tag_ref_from_utd_entry(tag_ref_entry, - uid_tag_data_entry); - spin_unlock_bh(&uid_tag_data_tree_lock); - spin_unlock_bh(&sock_tag_list_lock); - res = -ENOMEM; - goto err_put; - } - /* - * Hold the sk refcount here to make sure the sk pointer cannot - * be freed and reused - */ - sock_hold(el_socket->sk); - sock_tag_entry->sk = el_socket->sk; - sock_tag_entry->pid = current->tgid; - sock_tag_entry->tag = combine_atag_with_uid(acct_tag, uid_int); - pqd_entry = proc_qtu_data_tree_search( - &proc_qtu_data_tree, current->tgid); - /* - * TODO: remove if, and start failing. - * At first, we want to catch user-space code that is not - * opening the /dev/xt_qtaguid. - */ - if (IS_ERR_OR_NULL(pqd_entry)) - pr_warn_once( - "qtaguid: %s(): " - "User space forgot to open /dev/xt_qtaguid? " - "pid=%u tgid=%u uid=%u\n", __func__, - current->pid, current->tgid, - from_kuid(&init_user_ns, current_fsuid())); - else - list_add(&sock_tag_entry->list, - &pqd_entry->sock_tag_list); - - sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree); - atomic64_inc(&qtu_events.sockets_tagged); - } - spin_unlock_bh(&uid_tag_data_tree_lock); - spin_unlock_bh(&sock_tag_list_lock); - /* We keep the ref to the sk until it is untagged */ - CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->sk_refcnt=%d\n", - input, sock_tag_entry, - atomic_read(&el_socket->sk->sk_refcnt)); - sockfd_put(el_socket); - return 0; - -err_put: - CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->sk_refcnt=%d\n", - input, atomic_read(&el_socket->sk->sk_refcnt) - 1); - /* Release the sock_fd that was grabbed by sockfd_lookup(). */ - sockfd_put(el_socket); - return res; - -err: - CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input); - return res; -} - -static int ctrl_cmd_untag(const char *input) -{ - char cmd; - int sock_fd = 0; - struct socket *el_socket; - int res, argc; - - argc = sscanf(input, "%c %d", &cmd, &sock_fd); - CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n", - input, argc, cmd, sock_fd); - if (argc < 2) { - res = -EINVAL; - return res; - } - el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */ - if (!el_socket) { - pr_info("qtaguid: ctrl_untag(%s): failed to lookup" - " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n", - input, sock_fd, res, current->pid, current->tgid, - from_kuid(&init_user_ns, current_fsuid())); - return res; - } - CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n", - input, atomic_long_read(&el_socket->file->f_count), - el_socket->sk); - res = qtaguid_untag(el_socket, false); - sockfd_put(el_socket); - return res; -} - -int qtaguid_untag(struct socket *el_socket, bool kernel) -{ - int res; - pid_t pid; - struct sock_tag *sock_tag_entry; - struct tag_ref *tag_ref_entry; - struct uid_tag_data *utd_entry; - struct proc_qtu_data *pqd_entry; - - spin_lock_bh(&sock_tag_list_lock); - sock_tag_entry = get_sock_stat_nl(el_socket->sk); - if (!sock_tag_entry) { - spin_unlock_bh(&sock_tag_list_lock); - res = -EINVAL; - return res; - } - /* - * The socket already belongs to the current process - * so it can do whatever it wants to it. - */ - rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree); - - tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry); - BUG_ON(!tag_ref_entry); - BUG_ON(tag_ref_entry->num_sock_tags <= 0); - spin_lock_bh(&uid_tag_data_tree_lock); - if (kernel) - pid = sock_tag_entry->pid; - else - pid = current->tgid; - pqd_entry = proc_qtu_data_tree_search( - &proc_qtu_data_tree, pid); - /* - * TODO: remove if, and start failing. - * At first, we want to catch user-space code that is not - * opening the /dev/xt_qtaguid. - */ - if (IS_ERR_OR_NULL(pqd_entry) || !sock_tag_entry->list.next) { - pr_warn_once("qtaguid: %s(): " - "User space forgot to open /dev/xt_qtaguid? " - "pid=%u tgid=%u sk_pid=%u, uid=%u\n", __func__, - current->pid, current->tgid, sock_tag_entry->pid, - from_kuid(&init_user_ns, current_fsuid())); - } else { - list_del(&sock_tag_entry->list); - } - spin_unlock_bh(&uid_tag_data_tree_lock); - /* - * We don't free tag_ref from the utd_entry here, - * only during a cmd_delete(). - */ - tag_ref_entry->num_sock_tags--; - spin_unlock_bh(&sock_tag_list_lock); - /* - * Release the sock_fd that was grabbed at tag time. - */ - sock_put(sock_tag_entry->sk); - CT_DEBUG("qtaguid: done. st@%p ...->sk_refcnt=%d\n", - sock_tag_entry, - atomic_read(&el_socket->sk->sk_refcnt)); - - kfree(sock_tag_entry); - atomic64_inc(&qtu_events.sockets_untagged); - - return 0; -} - -static ssize_t qtaguid_ctrl_parse(const char *input, size_t count) -{ - char cmd; - ssize_t res; - - CT_DEBUG("qtaguid: ctrl(%s): pid=%u tgid=%u uid=%u\n", - input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - - cmd = input[0]; - /* Collect params for commands */ - switch (cmd) { - case 'd': - res = ctrl_cmd_delete(input); - break; - - case 's': - res = ctrl_cmd_counter_set(input); - break; - - case 't': - res = ctrl_cmd_tag(input); - break; - - case 'u': - res = ctrl_cmd_untag(input); - break; - - default: - res = -EINVAL; - goto err; - } - if (!res) - res = count; -err: - CT_DEBUG("qtaguid: ctrl(%s): res=%zd\n", input, res); - return res; -} - -#define MAX_QTAGUID_CTRL_INPUT_LEN 255 -static ssize_t qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer, - size_t count, loff_t *offp) -{ - char input_buf[MAX_QTAGUID_CTRL_INPUT_LEN]; - - if (unlikely(module_passive)) - return count; - - if (count >= MAX_QTAGUID_CTRL_INPUT_LEN) - return -EINVAL; - - if (copy_from_user(input_buf, buffer, count)) - return -EFAULT; - - input_buf[count] = '\0'; - return qtaguid_ctrl_parse(input_buf, count); -} - -struct proc_print_info { - struct iface_stat *iface_entry; - int item_index; - tag_t tag; /* tag found by reading to tag_pos */ - off_t tag_pos; - int tag_item_index; -}; - -static void pp_stats_header(struct seq_file *m) -{ - seq_puts(m, - "idx iface acct_tag_hex uid_tag_int cnt_set " - "rx_bytes rx_packets " - "tx_bytes tx_packets " - "rx_tcp_bytes rx_tcp_packets " - "rx_udp_bytes rx_udp_packets " - "rx_other_bytes rx_other_packets " - "tx_tcp_bytes tx_tcp_packets " - "tx_udp_bytes tx_udp_packets " - "tx_other_bytes tx_other_packets\n"); -} - -static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, - int cnt_set) -{ - struct data_counters *cnts; - tag_t tag = ts_entry->tn.tag; - uid_t stat_uid = get_uid_from_tag(tag); - struct proc_print_info *ppi = m->private; - /* Detailed tags are not available to everybody */ - if (!can_read_other_uid_stats(make_kuid(&init_user_ns,stat_uid))) { - CT_DEBUG("qtaguid: stats line: " - "%s 0x%llx %u: insufficient priv " - "from pid=%u tgid=%u uid=%u stats.gid=%u\n", - ppi->iface_entry->ifname, - get_atag_from_tag(tag), stat_uid, - current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()), - from_kgid(&init_user_ns,xt_qtaguid_stats_file->gid)); - return 0; - } - ppi->item_index++; - cnts = &ts_entry->counters; - seq_printf(m, "%d %s 0x%llx %u %u " - "%llu %llu " - "%llu %llu " - "%llu %llu " - "%llu %llu " - "%llu %llu " - "%llu %llu " - "%llu %llu " - "%llu %llu\n", - ppi->item_index, - ppi->iface_entry->ifname, - get_atag_from_tag(tag), - stat_uid, - cnt_set, - dc_sum_bytes(cnts, cnt_set, IFS_RX), - dc_sum_packets(cnts, cnt_set, IFS_RX), - dc_sum_bytes(cnts, cnt_set, IFS_TX), - dc_sum_packets(cnts, cnt_set, IFS_TX), - cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes, - cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets, - cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes, - cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets, - cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes, - cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets, - cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes, - cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets, - cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes, - cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, - cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, - cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); - return seq_has_overflowed(m) ? -ENOSPC : 1; -} - -static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry) -{ - int ret; - int counter_set; - for (counter_set = 0; counter_set < IFS_MAX_COUNTER_SETS; - counter_set++) { - ret = pp_stats_line(m, ts_entry, counter_set); - if (ret < 0) - return false; - } - return true; -} - -static int qtaguid_stats_proc_iface_stat_ptr_valid(struct iface_stat *ptr) -{ - struct iface_stat *iface_entry; - - if (!ptr) - return false; - - list_for_each_entry(iface_entry, &iface_stat_list, list) - if (iface_entry == ptr) - return true; - return false; -} - -static void qtaguid_stats_proc_next_iface_entry(struct proc_print_info *ppi) -{ - spin_unlock_bh(&ppi->iface_entry->tag_stat_list_lock); - list_for_each_entry_continue(ppi->iface_entry, &iface_stat_list, list) { - spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock); - return; - } - ppi->iface_entry = NULL; -} - -static void *qtaguid_stats_proc_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct proc_print_info *ppi = m->private; - struct tag_stat *ts_entry; - struct rb_node *node; - - if (!v) { - pr_err("qtaguid: %s(): unexpected v: NULL\n", __func__); - return NULL; - } - - (*pos)++; - - if (!ppi->iface_entry || unlikely(module_passive)) - return NULL; - - if (v == SEQ_START_TOKEN) - node = rb_first(&ppi->iface_entry->tag_stat_tree); - else - node = rb_next(&((struct tag_stat *)v)->tn.node); - - while (!node) { - qtaguid_stats_proc_next_iface_entry(ppi); - if (!ppi->iface_entry) - return NULL; - node = rb_first(&ppi->iface_entry->tag_stat_tree); - } - - ts_entry = rb_entry(node, struct tag_stat, tn.node); - ppi->tag = ts_entry->tn.tag; - ppi->tag_pos = *pos; - ppi->tag_item_index = ppi->item_index; - return ts_entry; -} - -static void *qtaguid_stats_proc_start(struct seq_file *m, loff_t *pos) -{ - struct proc_print_info *ppi = m->private; - struct tag_stat *ts_entry = NULL; - - spin_lock_bh(&iface_stat_list_lock); - - if (*pos == 0) { - ppi->item_index = 1; - ppi->tag_pos = 0; - if (list_empty(&iface_stat_list)) { - ppi->iface_entry = NULL; - } else { - ppi->iface_entry = list_first_entry(&iface_stat_list, - struct iface_stat, - list); - spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock); - } - return SEQ_START_TOKEN; - } - if (!qtaguid_stats_proc_iface_stat_ptr_valid(ppi->iface_entry)) { - if (ppi->iface_entry) { - pr_err("qtaguid: %s(): iface_entry %p not found\n", - __func__, ppi->iface_entry); - ppi->iface_entry = NULL; - } - return NULL; - } - - spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock); - - if (!ppi->tag_pos) { - /* seq_read skipped first next call */ - ts_entry = SEQ_START_TOKEN; - } else { - ts_entry = tag_stat_tree_search( - &ppi->iface_entry->tag_stat_tree, ppi->tag); - if (!ts_entry) { - pr_info("qtaguid: %s(): tag_stat.tag 0x%llx not found. Abort.\n", - __func__, ppi->tag); - return NULL; - } - } - - if (*pos == ppi->tag_pos) { /* normal resume */ - ppi->item_index = ppi->tag_item_index; - } else { - /* seq_read skipped a next call */ - *pos = ppi->tag_pos; - ts_entry = qtaguid_stats_proc_next(m, ts_entry, pos); - } - - return ts_entry; -} - -static void qtaguid_stats_proc_stop(struct seq_file *m, void *v) -{ - struct proc_print_info *ppi = m->private; - if (ppi->iface_entry) - spin_unlock_bh(&ppi->iface_entry->tag_stat_list_lock); - spin_unlock_bh(&iface_stat_list_lock); -} - -/* - * Procfs reader to get all tag stats using style "1)" as described in - * fs/proc/generic.c - * Groups all protocols tx/rx bytes. - */ -static int qtaguid_stats_proc_show(struct seq_file *m, void *v) -{ - struct tag_stat *ts_entry = v; - - if (v == SEQ_START_TOKEN) - pp_stats_header(m); - else - pp_sets(m, ts_entry); - - return 0; -} - -/*------------------------------------------*/ -static int qtudev_open(struct inode *inode, struct file *file) -{ - struct uid_tag_data *utd_entry; - struct proc_qtu_data *pqd_entry; - struct proc_qtu_data *new_pqd_entry; - int res; - bool utd_entry_found; - - if (unlikely(qtu_proc_handling_passive)) - return 0; - - DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n", - current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - - spin_lock_bh(&uid_tag_data_tree_lock); - - /* Look for existing uid data, or alloc one. */ - utd_entry = get_uid_data(from_kuid(&init_user_ns, current_fsuid()), &utd_entry_found); - if (IS_ERR_OR_NULL(utd_entry)) { - res = PTR_ERR(utd_entry); - goto err_unlock; - } - - /* Look for existing PID based proc_data */ - pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree, - current->tgid); - if (pqd_entry) { - pr_err("qtaguid: qtudev_open(): %u/%u %u " - "%s already opened\n", - current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()), - QTU_DEV_NAME); - res = -EBUSY; - goto err_unlock_free_utd; - } - - new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC); - if (!new_pqd_entry) { - pr_err("qtaguid: qtudev_open(): %u/%u %u: " - "proc data alloc failed\n", - current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - res = -ENOMEM; - goto err_unlock_free_utd; - } - new_pqd_entry->pid = current->tgid; - INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list); - new_pqd_entry->parent_tag_data = utd_entry; - utd_entry->num_pqd++; - - proc_qtu_data_tree_insert(new_pqd_entry, - &proc_qtu_data_tree); - - spin_unlock_bh(&uid_tag_data_tree_lock); - DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n", - from_kuid(&init_user_ns, current_fsuid()), new_pqd_entry); - file->private_data = new_pqd_entry; - return 0; - -err_unlock_free_utd: - if (!utd_entry_found) { - rb_erase(&utd_entry->node, &uid_tag_data_tree); - kfree(utd_entry); - } -err_unlock: - spin_unlock_bh(&uid_tag_data_tree_lock); - return res; -} - -static int qtudev_release(struct inode *inode, struct file *file) -{ - struct proc_qtu_data *pqd_entry = file->private_data; - struct uid_tag_data *utd_entry = pqd_entry->parent_tag_data; - struct sock_tag *st_entry; - struct rb_root st_to_free_tree = RB_ROOT; - struct list_head *entry, *next; - struct tag_ref *tr; - - if (unlikely(qtu_proc_handling_passive)) - return 0; - - /* - * Do not trust the current->pid, it might just be a kworker cleaning - * up after a dead proc. - */ - DR_DEBUG("qtaguid: qtudev_release(): " - "pid=%u tgid=%u uid=%u " - "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n", - current->pid, current->tgid, pqd_entry->parent_tag_data->uid, - pqd_entry, pqd_entry->pid, utd_entry, - utd_entry->num_active_tags); - - spin_lock_bh(&sock_tag_list_lock); - spin_lock_bh(&uid_tag_data_tree_lock); - - list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) { - st_entry = list_entry(entry, struct sock_tag, list); - DR_DEBUG("qtaguid: %s(): " - "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n", - __func__, - st_entry, st_entry->sk, - current->pid, current->tgid, - pqd_entry->parent_tag_data->uid); - - utd_entry = uid_tag_data_tree_search( - &uid_tag_data_tree, - get_uid_from_tag(st_entry->tag)); - BUG_ON(IS_ERR_OR_NULL(utd_entry)); - DR_DEBUG("qtaguid: %s(): " - "looking for tag=0x%llx in utd_entry=%p\n", __func__, - st_entry->tag, utd_entry); - tr = tag_ref_tree_search(&utd_entry->tag_ref_tree, - st_entry->tag); - BUG_ON(!tr); - BUG_ON(tr->num_sock_tags <= 0); - tr->num_sock_tags--; - free_tag_ref_from_utd_entry(tr, utd_entry); - - rb_erase(&st_entry->sock_node, &sock_tag_tree); - list_del(&st_entry->list); - /* Can't sockfd_put() within spinlock, do it later. */ - sock_tag_tree_insert(st_entry, &st_to_free_tree); - - /* - * Try to free the utd_entry if no other proc_qtu_data is - * using it (num_pqd is 0) and it doesn't have active tags - * (num_active_tags is 0). - */ - put_utd_entry(utd_entry); - } - - rb_erase(&pqd_entry->node, &proc_qtu_data_tree); - BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1); - pqd_entry->parent_tag_data->num_pqd--; - put_utd_entry(pqd_entry->parent_tag_data); - kfree(pqd_entry); - file->private_data = NULL; - - spin_unlock_bh(&uid_tag_data_tree_lock); - spin_unlock_bh(&sock_tag_list_lock); - - - sock_tag_tree_erase(&st_to_free_tree); - - spin_lock_bh(&sock_tag_list_lock); - prdebug_full_state_locked(0, "%s(): pid=%u tgid=%u", __func__, - current->pid, current->tgid); - spin_unlock_bh(&sock_tag_list_lock); - return 0; -} - -/*------------------------------------------*/ -static const struct file_operations qtudev_fops = { - .owner = THIS_MODULE, - .open = qtudev_open, - .release = qtudev_release, -}; - -static struct miscdevice qtu_device = { - .minor = MISC_DYNAMIC_MINOR, - .name = QTU_DEV_NAME, - .fops = &qtudev_fops, - /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */ -}; - -static const struct seq_operations proc_qtaguid_ctrl_seqops = { - .start = qtaguid_ctrl_proc_start, - .next = qtaguid_ctrl_proc_next, - .stop = qtaguid_ctrl_proc_stop, - .show = qtaguid_ctrl_proc_show, -}; - -static int proc_qtaguid_ctrl_open(struct inode *inode, struct file *file) -{ - return seq_open_private(file, &proc_qtaguid_ctrl_seqops, - sizeof(struct proc_ctrl_print_info)); -} - -static const struct file_operations proc_qtaguid_ctrl_fops = { - .open = proc_qtaguid_ctrl_open, - .read = seq_read, - .write = qtaguid_ctrl_proc_write, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -static const struct seq_operations proc_qtaguid_stats_seqops = { - .start = qtaguid_stats_proc_start, - .next = qtaguid_stats_proc_next, - .stop = qtaguid_stats_proc_stop, - .show = qtaguid_stats_proc_show, -}; - -static int proc_qtaguid_stats_open(struct inode *inode, struct file *file) -{ - return seq_open_private(file, &proc_qtaguid_stats_seqops, - sizeof(struct proc_print_info)); -} - -static const struct file_operations proc_qtaguid_stats_fops = { - .open = proc_qtaguid_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -/*------------------------------------------*/ -static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir) -{ - int ret; - *res_procdir = proc_mkdir(module_procdirname, init_net.proc_net); - if (!*res_procdir) { - pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n"); - ret = -ENOMEM; - goto no_dir; - } - - xt_qtaguid_ctrl_file = proc_create_data("ctrl", proc_ctrl_perms, - *res_procdir, - &proc_qtaguid_ctrl_fops, - NULL); - if (!xt_qtaguid_ctrl_file) { - pr_err("qtaguid: failed to create xt_qtaguid/ctrl " - " file\n"); - ret = -ENOMEM; - goto no_ctrl_entry; - } - - xt_qtaguid_stats_file = proc_create_data("stats", proc_stats_perms, - *res_procdir, - &proc_qtaguid_stats_fops, - NULL); - if (!xt_qtaguid_stats_file) { - pr_err("qtaguid: failed to create xt_qtaguid/stats " - "file\n"); - ret = -ENOMEM; - goto no_stats_entry; - } - /* - * TODO: add support counter hacking - * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write; - */ - return 0; - -no_stats_entry: - remove_proc_entry("ctrl", *res_procdir); -no_ctrl_entry: - remove_proc_entry("xt_qtaguid", NULL); -no_dir: - return ret; -} - -static struct xt_match qtaguid_mt_reg __read_mostly = { - /* - * This module masquerades as the "owner" module so that iptables - * tools can deal with it. - */ - .name = "owner", - .revision = 1, - .family = NFPROTO_UNSPEC, - .match = qtaguid_mt, - .matchsize = sizeof(struct xt_qtaguid_match_info), - .me = THIS_MODULE, -}; - -static int __init qtaguid_mt_init(void) -{ - if (qtaguid_proc_register(&xt_qtaguid_procdir) - || iface_stat_init(xt_qtaguid_procdir) - || xt_register_match(&qtaguid_mt_reg) - || misc_register(&qtu_device)) - return -1; - return 0; -} - -/* - * TODO: allow unloading of the module. - * For now stats are permanent. - * Kconfig forces'y/n' and never an 'm'. - */ - -module_init(qtaguid_mt_init); -MODULE_AUTHOR("jpa <jpa@google.com>"); -MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ipt_owner"); -MODULE_ALIAS("ip6t_owner"); -MODULE_ALIAS("ipt_qtaguid"); -MODULE_ALIAS("ip6t_qtaguid"); diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h deleted file mode 100644 index c7052707a6a4..000000000000 --- a/net/netfilter/xt_qtaguid_internal.h +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Kernel iptables module to track stats for packets based on user tags. - * - * (C) 2011 Google, Inc - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef __XT_QTAGUID_INTERNAL_H__ -#define __XT_QTAGUID_INTERNAL_H__ - -#include <linux/types.h> -#include <linux/rbtree.h> -#include <linux/spinlock_types.h> -#include <linux/workqueue.h> - -/* Iface handling */ -#define IDEBUG_MASK (1<<0) -/* Iptable Matching. Per packet. */ -#define MDEBUG_MASK (1<<1) -/* Red-black tree handling. Per packet. */ -#define RDEBUG_MASK (1<<2) -/* procfs ctrl/stats handling */ -#define CDEBUG_MASK (1<<3) -/* dev and resource tracking */ -#define DDEBUG_MASK (1<<4) - -/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */ -#define DEFAULT_DEBUG_MASK 0 - -/* - * (Un)Define these *DEBUG to compile out/in the pr_debug calls. - * All undef: text size ~ 0x3030; all def: ~ 0x4404. - */ -#define IDEBUG -#define MDEBUG -#define RDEBUG -#define CDEBUG -#define DDEBUG - -#define MSK_DEBUG(mask, ...) do { \ - if (unlikely(qtaguid_debug_mask & (mask))) \ - pr_debug(__VA_ARGS__); \ - } while (0) -#ifdef IDEBUG -#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__) -#else -#define IF_DEBUG(...) no_printk(__VA_ARGS__) -#endif -#ifdef MDEBUG -#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__) -#else -#define MT_DEBUG(...) no_printk(__VA_ARGS__) -#endif -#ifdef RDEBUG -#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__) -#else -#define RB_DEBUG(...) no_printk(__VA_ARGS__) -#endif -#ifdef CDEBUG -#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__) -#else -#define CT_DEBUG(...) no_printk(__VA_ARGS__) -#endif -#ifdef DDEBUG -#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__) -#else -#define DR_DEBUG(...) no_printk(__VA_ARGS__) -#endif - -extern uint qtaguid_debug_mask; - -/*---------------------------------------------------------------------------*/ -/* - * Tags: - * - * They represent what the data usage counters will be tracked against. - * By default a tag is just based on the UID. - * The UID is used as the base for policing, and can not be ignored. - * So a tag will always at least represent a UID (uid_tag). - * - * A tag can be augmented with an "accounting tag" which is associated - * with a UID. - * User space can set the acct_tag portion of the tag which is then used - * with sockets: all data belonging to that socket will be counted against the - * tag. The policing is then based on the tag's uid_tag portion, - * and stats are collected for the acct_tag portion separately. - * - * There could be - * a: {acct_tag=1, uid_tag=10003} - * b: {acct_tag=2, uid_tag=10003} - * c: {acct_tag=3, uid_tag=10003} - * d: {acct_tag=0, uid_tag=10003} - * a, b, and c represent tags associated with specific sockets. - * d is for the totals for that uid, including all untagged traffic. - * Typically d is used with policing/quota rules. - * - * We want tag_t big enough to distinguish uid_t and acct_tag. - * It might become a struct if needed. - * Nothing should be using it as an int. - */ -typedef uint64_t tag_t; /* Only used via accessors */ - -#define TAG_UID_MASK 0xFFFFFFFFULL -#define TAG_ACCT_MASK (~0xFFFFFFFFULL) - -static inline int tag_compare(tag_t t1, tag_t t2) -{ - return t1 < t2 ? -1 : t1 == t2 ? 0 : 1; -} - -static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid) -{ - return acct_tag | uid; -} -static inline tag_t make_tag_from_uid(uid_t uid) -{ - return uid; -} -static inline uid_t get_uid_from_tag(tag_t tag) -{ - return tag & TAG_UID_MASK; -} -static inline tag_t get_utag_from_tag(tag_t tag) -{ - return tag & TAG_UID_MASK; -} -static inline tag_t get_atag_from_tag(tag_t tag) -{ - return tag & TAG_ACCT_MASK; -} - -static inline bool valid_atag(tag_t tag) -{ - return !(tag & TAG_UID_MASK); -} -static inline tag_t make_atag_from_value(uint32_t value) -{ - return (uint64_t)value << 32; -} -/*---------------------------------------------------------------------------*/ - -/* - * Maximum number of socket tags that a UID is allowed to have active. - * Multiple processes belonging to the same UID contribute towards this limit. - * Special UIDs that can impersonate a UID also contribute (e.g. download - * manager, ...) - */ -#define DEFAULT_MAX_SOCK_TAGS 1024 - -/* - * For now we only track 2 sets of counters. - * The default set is 0. - * Userspace can activate another set for a given uid being tracked. - */ -#define IFS_MAX_COUNTER_SETS 2 - -enum ifs_tx_rx { - IFS_TX, - IFS_RX, - IFS_MAX_DIRECTIONS -}; - -/* For now, TCP, UDP, the rest */ -enum ifs_proto { - IFS_TCP, - IFS_UDP, - IFS_PROTO_OTHER, - IFS_MAX_PROTOS -}; - -struct byte_packet_counters { - uint64_t bytes; - uint64_t packets; -}; - -struct data_counters { - struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS]; -}; - -static inline uint64_t dc_sum_bytes(struct data_counters *counters, - int set, - enum ifs_tx_rx direction) -{ - return counters->bpc[set][direction][IFS_TCP].bytes - + counters->bpc[set][direction][IFS_UDP].bytes - + counters->bpc[set][direction][IFS_PROTO_OTHER].bytes; -} - -static inline uint64_t dc_sum_packets(struct data_counters *counters, - int set, - enum ifs_tx_rx direction) -{ - return counters->bpc[set][direction][IFS_TCP].packets - + counters->bpc[set][direction][IFS_UDP].packets - + counters->bpc[set][direction][IFS_PROTO_OTHER].packets; -} - - -/* Generic X based nodes used as a base for rb_tree ops */ -struct tag_node { - struct rb_node node; - tag_t tag; -}; - -struct tag_stat { - struct tag_node tn; - struct data_counters counters; - /* - * If this tag is acct_tag based, we need to count against the - * matching parent uid_tag. - */ - struct data_counters *parent_counters; -}; - -struct iface_stat { - struct list_head list; /* in iface_stat_list */ - char *ifname; - bool active; - /* net_dev is only valid for active iface_stat */ - struct net_device *net_dev; - - struct byte_packet_counters totals_via_dev[IFS_MAX_DIRECTIONS]; - struct data_counters totals_via_skb; - /* - * We keep the last_known, because some devices reset their counters - * just before NETDEV_UP, while some will reset just before - * NETDEV_REGISTER (which is more normal). - * So now, if the device didn't do a NETDEV_UNREGISTER and we see - * its current dev stats smaller that what was previously known, we - * assume an UNREGISTER and just use the last_known. - */ - struct byte_packet_counters last_known[IFS_MAX_DIRECTIONS]; - /* last_known is usable when last_known_valid is true */ - bool last_known_valid; - - struct proc_dir_entry *proc_ptr; - - struct rb_root tag_stat_tree; - spinlock_t tag_stat_list_lock; -}; - -/* This is needed to create proc_dir_entries from atomic context. */ -struct iface_stat_work { - struct work_struct iface_work; - struct iface_stat *iface_entry; -}; - -/* - * Track tag that this socket is transferring data for, and not necessarily - * the uid that owns the socket. - * This is the tag against which tag_stat.counters will be billed. - * These structs need to be looked up by sock and pid. - */ -struct sock_tag { - struct rb_node sock_node; - struct sock *sk; /* Only used as a number, never dereferenced */ - /* Used to associate with a given pid */ - struct list_head list; /* in proc_qtu_data.sock_tag_list */ - pid_t pid; - - tag_t tag; -}; - -struct qtaguid_event_counts { - /* Various successful events */ - atomic64_t sockets_tagged; - atomic64_t sockets_untagged; - atomic64_t counter_set_changes; - atomic64_t delete_cmds; - atomic64_t iface_events; /* Number of NETDEV_* events handled */ - - atomic64_t match_calls; /* Number of times iptables called mt */ - /* Number of times iptables called mt from pre or post routing hooks */ - atomic64_t match_calls_prepost; - /* - * match_found_sk_*: numbers related to the netfilter matching - * function finding a sock for the sk_buff. - * Total skbs processed is sum(match_found*). - */ - atomic64_t match_found_sk; /* An sk was already in the sk_buff. */ - /* The connection tracker had or didn't have the sk. */ - atomic64_t match_found_sk_in_ct; - atomic64_t match_found_no_sk_in_ct; - /* - * No sk could be found. No apparent owner. Could happen with - * unsolicited traffic. - */ - atomic64_t match_no_sk; - /* - * The file ptr in the sk_socket wasn't there and we couldn't get GID. - * This might happen for traffic while the socket is being closed. - */ - atomic64_t match_no_sk_gid; -}; - -/* Track the set active_set for the given tag. */ -struct tag_counter_set { - struct tag_node tn; - int active_set; -}; - -/*----------------------------------------------*/ -/* - * The qtu uid data is used to track resources that are created directly or - * indirectly by processes (uid tracked). - * It is shared by the processes with the same uid. - * Some of the resource will be counted to prevent further rogue allocations, - * some will need freeing once the owner process (uid) exits. - */ -struct uid_tag_data { - struct rb_node node; - uid_t uid; - - /* - * For the uid, how many accounting tags have been set. - */ - int num_active_tags; - /* Track the number of proc_qtu_data that reference it */ - int num_pqd; - struct rb_root tag_ref_tree; - /* No tag_node_tree_lock; use uid_tag_data_tree_lock */ -}; - -struct tag_ref { - struct tag_node tn; - - /* - * This tracks the number of active sockets that have a tag on them - * which matches this tag_ref.tn.tag. - * A tag ref can live on after the sockets are untagged. - * A tag ref can only be removed during a tag delete command. - */ - int num_sock_tags; -}; - -struct proc_qtu_data { - struct rb_node node; - pid_t pid; - - struct uid_tag_data *parent_tag_data; - - /* Tracks the sock_tags that need freeing upon this proc's death */ - struct list_head sock_tag_list; - /* No spinlock_t sock_tag_list_lock; use the global one. */ -}; - -/*----------------------------------------------*/ -#endif /* ifndef __XT_QTAGUID_INTERNAL_H__ */ diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c deleted file mode 100644 index 2a7190d285e6..000000000000 --- a/net/netfilter/xt_qtaguid_print.c +++ /dev/null @@ -1,566 +0,0 @@ -/* - * Pretty printing Support for iptables xt_qtaguid module. - * - * (C) 2011 Google, Inc - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* - * Most of the functions in this file just waste time if DEBUG is not defined. - * The matching xt_qtaguid_print.h will static inline empty funcs if the needed - * debug flags ore not defined. - * Those funcs that fail to allocate memory will panic as there is no need to - * hobble allong just pretending to do the requested work. - */ - -#define DEBUG - -#include <linux/fs.h> -#include <linux/gfp.h> -#include <linux/net.h> -#include <linux/rbtree.h> -#include <linux/slab.h> -#include <linux/spinlock_types.h> -#include <net/sock.h> - -#include "xt_qtaguid_internal.h" -#include "xt_qtaguid_print.h" - -#ifdef DDEBUG - -static void _bug_on_err_or_null(void *ptr) -{ - if (IS_ERR_OR_NULL(ptr)) { - pr_err("qtaguid: kmalloc failed\n"); - BUG(); - } -} - -char *pp_tag_t(tag_t *tag) -{ - char *res; - - if (!tag) - res = kasprintf(GFP_ATOMIC, "tag_t@null{}"); - else - res = kasprintf(GFP_ATOMIC, - "tag_t@%p{tag=0x%llx, uid=%u}", - tag, *tag, get_uid_from_tag(*tag)); - _bug_on_err_or_null(res); - return res; -} - -char *pp_data_counters(struct data_counters *dc, bool showValues) -{ - char *res; - - if (!dc) - res = kasprintf(GFP_ATOMIC, "data_counters@null{}"); - else if (showValues) - res = kasprintf( - GFP_ATOMIC, "data_counters@%p{" - "set0{" - "rx{" - "tcp{b=%llu, p=%llu}, " - "udp{b=%llu, p=%llu}," - "other{b=%llu, p=%llu}}, " - "tx{" - "tcp{b=%llu, p=%llu}, " - "udp{b=%llu, p=%llu}," - "other{b=%llu, p=%llu}}}, " - "set1{" - "rx{" - "tcp{b=%llu, p=%llu}, " - "udp{b=%llu, p=%llu}," - "other{b=%llu, p=%llu}}, " - "tx{" - "tcp{b=%llu, p=%llu}, " - "udp{b=%llu, p=%llu}," - "other{b=%llu, p=%llu}}}}", - dc, - dc->bpc[0][IFS_RX][IFS_TCP].bytes, - dc->bpc[0][IFS_RX][IFS_TCP].packets, - dc->bpc[0][IFS_RX][IFS_UDP].bytes, - dc->bpc[0][IFS_RX][IFS_UDP].packets, - dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].bytes, - dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].packets, - dc->bpc[0][IFS_TX][IFS_TCP].bytes, - dc->bpc[0][IFS_TX][IFS_TCP].packets, - dc->bpc[0][IFS_TX][IFS_UDP].bytes, - dc->bpc[0][IFS_TX][IFS_UDP].packets, - dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].bytes, - dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].packets, - dc->bpc[1][IFS_RX][IFS_TCP].bytes, - dc->bpc[1][IFS_RX][IFS_TCP].packets, - dc->bpc[1][IFS_RX][IFS_UDP].bytes, - dc->bpc[1][IFS_RX][IFS_UDP].packets, - dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].bytes, - dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].packets, - dc->bpc[1][IFS_TX][IFS_TCP].bytes, - dc->bpc[1][IFS_TX][IFS_TCP].packets, - dc->bpc[1][IFS_TX][IFS_UDP].bytes, - dc->bpc[1][IFS_TX][IFS_UDP].packets, - dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].bytes, - dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].packets); - else - res = kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc); - _bug_on_err_or_null(res); - return res; -} - -char *pp_tag_node(struct tag_node *tn) -{ - char *tag_str; - char *res; - - if (!tn) { - res = kasprintf(GFP_ATOMIC, "tag_node@null{}"); - _bug_on_err_or_null(res); - return res; - } - tag_str = pp_tag_t(&tn->tag); - res = kasprintf(GFP_ATOMIC, - "tag_node@%p{tag=%s}", - tn, tag_str); - _bug_on_err_or_null(res); - kfree(tag_str); - return res; -} - -char *pp_tag_ref(struct tag_ref *tr) -{ - char *tn_str; - char *res; - - if (!tr) { - res = kasprintf(GFP_ATOMIC, "tag_ref@null{}"); - _bug_on_err_or_null(res); - return res; - } - tn_str = pp_tag_node(&tr->tn); - res = kasprintf(GFP_ATOMIC, - "tag_ref@%p{%s, num_sock_tags=%d}", - tr, tn_str, tr->num_sock_tags); - _bug_on_err_or_null(res); - kfree(tn_str); - return res; -} - -char *pp_tag_stat(struct tag_stat *ts) -{ - char *tn_str; - char *counters_str; - char *parent_counters_str; - char *res; - - if (!ts) { - res = kasprintf(GFP_ATOMIC, "tag_stat@null{}"); - _bug_on_err_or_null(res); - return res; - } - tn_str = pp_tag_node(&ts->tn); - counters_str = pp_data_counters(&ts->counters, true); - parent_counters_str = pp_data_counters(ts->parent_counters, false); - res = kasprintf(GFP_ATOMIC, - "tag_stat@%p{%s, counters=%s, parent_counters=%s}", - ts, tn_str, counters_str, parent_counters_str); - _bug_on_err_or_null(res); - kfree(tn_str); - kfree(counters_str); - kfree(parent_counters_str); - return res; -} - -char *pp_iface_stat(struct iface_stat *is) -{ - char *res; - if (!is) { - res = kasprintf(GFP_ATOMIC, "iface_stat@null{}"); - } else { - struct data_counters *cnts = &is->totals_via_skb; - res = kasprintf(GFP_ATOMIC, "iface_stat@%p{" - "list=list_head{...}, " - "ifname=%s, " - "total_dev={rx={bytes=%llu, " - "packets=%llu}, " - "tx={bytes=%llu, " - "packets=%llu}}, " - "total_skb={rx={bytes=%llu, " - "packets=%llu}, " - "tx={bytes=%llu, " - "packets=%llu}}, " - "last_known_valid=%d, " - "last_known={rx={bytes=%llu, " - "packets=%llu}, " - "tx={bytes=%llu, " - "packets=%llu}}, " - "active=%d, " - "net_dev=%p, " - "proc_ptr=%p, " - "tag_stat_tree=rb_root{...}}", - is, - is->ifname, - is->totals_via_dev[IFS_RX].bytes, - is->totals_via_dev[IFS_RX].packets, - is->totals_via_dev[IFS_TX].bytes, - is->totals_via_dev[IFS_TX].packets, - dc_sum_bytes(cnts, 0, IFS_RX), - dc_sum_packets(cnts, 0, IFS_RX), - dc_sum_bytes(cnts, 0, IFS_TX), - dc_sum_packets(cnts, 0, IFS_TX), - is->last_known_valid, - is->last_known[IFS_RX].bytes, - is->last_known[IFS_RX].packets, - is->last_known[IFS_TX].bytes, - is->last_known[IFS_TX].packets, - is->active, - is->net_dev, - is->proc_ptr); - } - _bug_on_err_or_null(res); - return res; -} - -char *pp_sock_tag(struct sock_tag *st) -{ - char *tag_str; - char *res; - - if (!st) { - res = kasprintf(GFP_ATOMIC, "sock_tag@null{}"); - _bug_on_err_or_null(res); - return res; - } - tag_str = pp_tag_t(&st->tag); - res = kasprintf(GFP_ATOMIC, "sock_tag@%p{" - "sock_node=rb_node{...}, " - "sk=%p (f_count=%d), list=list_head{...}, " - "pid=%u, tag=%s}", - st, st->sk, atomic_read( - &st->sk->sk_refcnt), - st->pid, tag_str); - _bug_on_err_or_null(res); - kfree(tag_str); - return res; -} - -char *pp_uid_tag_data(struct uid_tag_data *utd) -{ - char *res; - - if (!utd) - res = kasprintf(GFP_ATOMIC, "uid_tag_data@null{}"); - else - res = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{" - "uid=%u, num_active_acct_tags=%d, " - "num_pqd=%d, " - "tag_node_tree=rb_root{...}, " - "proc_qtu_data_tree=rb_root{...}}", - utd, utd->uid, - utd->num_active_tags, utd->num_pqd); - _bug_on_err_or_null(res); - return res; -} - -char *pp_proc_qtu_data(struct proc_qtu_data *pqd) -{ - char *parent_tag_data_str; - char *res; - - if (!pqd) { - res = kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}"); - _bug_on_err_or_null(res); - return res; - } - parent_tag_data_str = pp_uid_tag_data(pqd->parent_tag_data); - res = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{" - "node=rb_node{...}, pid=%u, " - "parent_tag_data=%s, " - "sock_tag_list=list_head{...}}", - pqd, pqd->pid, parent_tag_data_str - ); - _bug_on_err_or_null(res); - kfree(parent_tag_data_str); - return res; -} - -/*------------------------------------------*/ -void prdebug_sock_tag_tree(int indent_level, - struct rb_root *sock_tag_tree) -{ - struct rb_node *node; - struct sock_tag *sock_tag_entry; - char *str; - - if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) - return; - - if (RB_EMPTY_ROOT(sock_tag_tree)) { - str = "sock_tag_tree=rb_root{}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - return; - } - - str = "sock_tag_tree=rb_root{"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - indent_level++; - for (node = rb_first(sock_tag_tree); - node; - node = rb_next(node)) { - sock_tag_entry = rb_entry(node, struct sock_tag, sock_node); - str = pp_sock_tag(sock_tag_entry); - pr_debug("%*d: %s,\n", indent_level*2, indent_level, str); - kfree(str); - } - indent_level--; - str = "}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); -} - -void prdebug_sock_tag_list(int indent_level, - struct list_head *sock_tag_list) -{ - struct sock_tag *sock_tag_entry; - char *str; - - if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) - return; - - if (list_empty(sock_tag_list)) { - str = "sock_tag_list=list_head{}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - return; - } - - str = "sock_tag_list=list_head{"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - indent_level++; - list_for_each_entry(sock_tag_entry, sock_tag_list, list) { - str = pp_sock_tag(sock_tag_entry); - pr_debug("%*d: %s,\n", indent_level*2, indent_level, str); - kfree(str); - } - indent_level--; - str = "}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); -} - -void prdebug_proc_qtu_data_tree(int indent_level, - struct rb_root *proc_qtu_data_tree) -{ - char *str; - struct rb_node *node; - struct proc_qtu_data *proc_qtu_data_entry; - - if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) - return; - - if (RB_EMPTY_ROOT(proc_qtu_data_tree)) { - str = "proc_qtu_data_tree=rb_root{}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - return; - } - - str = "proc_qtu_data_tree=rb_root{"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - indent_level++; - for (node = rb_first(proc_qtu_data_tree); - node; - node = rb_next(node)) { - proc_qtu_data_entry = rb_entry(node, - struct proc_qtu_data, - node); - str = pp_proc_qtu_data(proc_qtu_data_entry); - pr_debug("%*d: %s,\n", indent_level*2, indent_level, - str); - kfree(str); - indent_level++; - prdebug_sock_tag_list(indent_level, - &proc_qtu_data_entry->sock_tag_list); - indent_level--; - - } - indent_level--; - str = "}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); -} - -void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree) -{ - char *str; - struct rb_node *node; - struct tag_ref *tag_ref_entry; - - if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) - return; - - if (RB_EMPTY_ROOT(tag_ref_tree)) { - str = "tag_ref_tree{}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - return; - } - - str = "tag_ref_tree{"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - indent_level++; - for (node = rb_first(tag_ref_tree); - node; - node = rb_next(node)) { - tag_ref_entry = rb_entry(node, - struct tag_ref, - tn.node); - str = pp_tag_ref(tag_ref_entry); - pr_debug("%*d: %s,\n", indent_level*2, indent_level, - str); - kfree(str); - } - indent_level--; - str = "}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); -} - -void prdebug_uid_tag_data_tree(int indent_level, - struct rb_root *uid_tag_data_tree) -{ - char *str; - struct rb_node *node; - struct uid_tag_data *uid_tag_data_entry; - - if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) - return; - - if (RB_EMPTY_ROOT(uid_tag_data_tree)) { - str = "uid_tag_data_tree=rb_root{}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - return; - } - - str = "uid_tag_data_tree=rb_root{"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - indent_level++; - for (node = rb_first(uid_tag_data_tree); - node; - node = rb_next(node)) { - uid_tag_data_entry = rb_entry(node, struct uid_tag_data, - node); - str = pp_uid_tag_data(uid_tag_data_entry); - pr_debug("%*d: %s,\n", indent_level*2, indent_level, str); - kfree(str); - if (!RB_EMPTY_ROOT(&uid_tag_data_entry->tag_ref_tree)) { - indent_level++; - prdebug_tag_ref_tree(indent_level, - &uid_tag_data_entry->tag_ref_tree); - indent_level--; - } - } - indent_level--; - str = "}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); -} - -void prdebug_tag_stat_tree(int indent_level, - struct rb_root *tag_stat_tree) -{ - char *str; - struct rb_node *node; - struct tag_stat *ts_entry; - - if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) - return; - - if (RB_EMPTY_ROOT(tag_stat_tree)) { - str = "tag_stat_tree{}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - return; - } - - str = "tag_stat_tree{"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - indent_level++; - for (node = rb_first(tag_stat_tree); - node; - node = rb_next(node)) { - ts_entry = rb_entry(node, struct tag_stat, tn.node); - str = pp_tag_stat(ts_entry); - pr_debug("%*d: %s\n", indent_level*2, indent_level, - str); - kfree(str); - } - indent_level--; - str = "}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); -} - -void prdebug_iface_stat_list(int indent_level, - struct list_head *iface_stat_list) -{ - char *str; - struct iface_stat *iface_entry; - - if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) - return; - - if (list_empty(iface_stat_list)) { - str = "iface_stat_list=list_head{}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - return; - } - - str = "iface_stat_list=list_head{"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - indent_level++; - list_for_each_entry(iface_entry, iface_stat_list, list) { - str = pp_iface_stat(iface_entry); - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); - kfree(str); - - spin_lock_bh(&iface_entry->tag_stat_list_lock); - if (!RB_EMPTY_ROOT(&iface_entry->tag_stat_tree)) { - indent_level++; - prdebug_tag_stat_tree(indent_level, - &iface_entry->tag_stat_tree); - indent_level--; - } - spin_unlock_bh(&iface_entry->tag_stat_list_lock); - } - indent_level--; - str = "}"; - pr_debug("%*d: %s\n", indent_level*2, indent_level, str); -} - -#endif /* ifdef DDEBUG */ -/*------------------------------------------*/ -static const char * const netdev_event_strings[] = { - "netdev_unknown", - "NETDEV_UP", - "NETDEV_DOWN", - "NETDEV_REBOOT", - "NETDEV_CHANGE", - "NETDEV_REGISTER", - "NETDEV_UNREGISTER", - "NETDEV_CHANGEMTU", - "NETDEV_CHANGEADDR", - "NETDEV_GOING_DOWN", - "NETDEV_CHANGENAME", - "NETDEV_FEAT_CHANGE", - "NETDEV_BONDING_FAILOVER", - "NETDEV_PRE_UP", - "NETDEV_PRE_TYPE_CHANGE", - "NETDEV_POST_TYPE_CHANGE", - "NETDEV_POST_INIT", - "NETDEV_UNREGISTER_BATCH", - "NETDEV_RELEASE", - "NETDEV_NOTIFY_PEERS", - "NETDEV_JOIN", -}; - -const char *netdev_evt_str(int netdev_event) -{ - if (netdev_event < 0 - || netdev_event >= ARRAY_SIZE(netdev_event_strings)) - return "bad event num"; - return netdev_event_strings[netdev_event]; -} diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h deleted file mode 100644 index b63871a0be5a..000000000000 --- a/net/netfilter/xt_qtaguid_print.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Pretty printing Support for iptables xt_qtaguid module. - * - * (C) 2011 Google, Inc - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef __XT_QTAGUID_PRINT_H__ -#define __XT_QTAGUID_PRINT_H__ - -#include "xt_qtaguid_internal.h" - -#ifdef DDEBUG - -char *pp_tag_t(tag_t *tag); -char *pp_data_counters(struct data_counters *dc, bool showValues); -char *pp_tag_node(struct tag_node *tn); -char *pp_tag_ref(struct tag_ref *tr); -char *pp_tag_stat(struct tag_stat *ts); -char *pp_iface_stat(struct iface_stat *is); -char *pp_sock_tag(struct sock_tag *st); -char *pp_uid_tag_data(struct uid_tag_data *qtd); -char *pp_proc_qtu_data(struct proc_qtu_data *pqd); - -/*------------------------------------------*/ -void prdebug_sock_tag_list(int indent_level, - struct list_head *sock_tag_list); -void prdebug_sock_tag_tree(int indent_level, - struct rb_root *sock_tag_tree); -void prdebug_proc_qtu_data_tree(int indent_level, - struct rb_root *proc_qtu_data_tree); -void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree); -void prdebug_uid_tag_data_tree(int indent_level, - struct rb_root *uid_tag_data_tree); -void prdebug_tag_stat_tree(int indent_level, - struct rb_root *tag_stat_tree); -void prdebug_iface_stat_list(int indent_level, - struct list_head *iface_stat_list); - -#else - -/*------------------------------------------*/ -static inline char *pp_tag_t(tag_t *tag) -{ - return NULL; -} -static inline char *pp_data_counters(struct data_counters *dc, bool showValues) -{ - return NULL; -} -static inline char *pp_tag_node(struct tag_node *tn) -{ - return NULL; -} -static inline char *pp_tag_ref(struct tag_ref *tr) -{ - return NULL; -} -static inline char *pp_tag_stat(struct tag_stat *ts) -{ - return NULL; -} -static inline char *pp_iface_stat(struct iface_stat *is) -{ - return NULL; -} -static inline char *pp_sock_tag(struct sock_tag *st) -{ - return NULL; -} -static inline char *pp_uid_tag_data(struct uid_tag_data *qtd) -{ - return NULL; -} -static inline char *pp_proc_qtu_data(struct proc_qtu_data *pqd) -{ - return NULL; -} - -/*------------------------------------------*/ -static inline -void prdebug_sock_tag_list(int indent_level, - struct list_head *sock_tag_list) -{ -} -static inline -void prdebug_sock_tag_tree(int indent_level, - struct rb_root *sock_tag_tree) -{ -} -static inline -void prdebug_proc_qtu_data_tree(int indent_level, - struct rb_root *proc_qtu_data_tree) -{ -} -static inline -void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree) -{ -} -static inline -void prdebug_uid_tag_data_tree(int indent_level, - struct rb_root *uid_tag_data_tree) -{ -} -static inline -void prdebug_tag_stat_tree(int indent_level, - struct rb_root *tag_stat_tree) -{ -} -static inline -void prdebug_iface_stat_list(int indent_level, - struct list_head *iface_stat_list) -{ -} -#endif -/*------------------------------------------*/ -const char *netdev_evt_str(int netdev_event); -#endif /* ifndef __XT_QTAGUID_PRINT_H__ */ diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 44c8eb4c9d66..10d61a6eed71 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -73,6 +73,7 @@ static struct xt_match quota_mt_reg __read_mostly = { .checkentry = quota_mt_check, .destroy = quota_mt_destroy, .matchsize = sizeof(struct xt_quota_info), + .usersize = offsetof(struct xt_quota_info, master), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c index 8c481e359d94..cb7b53f6dade 100644 --- a/net/netfilter/xt_quota2.c +++ b/net/netfilter/xt_quota2.c @@ -136,6 +136,8 @@ static ssize_t quota_proc_write(struct file *file, const char __user *input, if (copy_from_user(buf, input, size) != 0) return -EFAULT; buf[sizeof(buf)-1] = '\0'; + if (size < sizeof(buf)) + buf[size] = '\0'; spin_lock_bh(&e->lock); e->quota = simple_strtoull(buf, NULL, 0); @@ -283,6 +285,8 @@ quota_mt2(const struct sk_buff *skb, struct xt_action_param *par) { struct xt_quota_mtinfo2 *q = (void *)par->matchinfo; struct xt_quota_counter *e = q->master; + int charge = (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len; + bool no_change = q->flags & XT_QUOTA_NO_CHANGE; bool ret = q->flags & XT_QUOTA_INVERT; spin_lock_bh(&e->lock); @@ -291,20 +295,17 @@ quota_mt2(const struct sk_buff *skb, struct xt_action_param *par) * While no_change is pointless in "grow" mode, we will * implement it here simply to have a consistent behavior. */ - if (!(q->flags & XT_QUOTA_NO_CHANGE)) { - e->quota += (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len; - } - ret = true; + if (!no_change) + e->quota += charge; + ret = true; /* note: does not respect inversion (bug??) */ } else { - if (e->quota >= skb->len) { - if (!(q->flags & XT_QUOTA_NO_CHANGE)) - e->quota -= (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len; + if (e->quota > charge) { + if (!no_change) + e->quota -= charge; ret = !ret; - } else { + } else if (e->quota) { /* We are transitioning, log that fact. */ - if (e->quota) { - quota2_log(par->in, par->out, e, q->name); - } + quota2_log(par->in, par->out, e, q->name); /* we do not allow even small packets from now on */ e->quota = 0; } @@ -322,6 +323,7 @@ static struct xt_match quota_mt2_reg[] __read_mostly = { .match = quota_mt2, .destroy = quota_mt2_destroy, .matchsize = sizeof(struct xt_quota_mtinfo2), + .usersize = offsetof(struct xt_quota_mtinfo2, master), .me = THIS_MODULE, }, { @@ -332,6 +334,7 @@ static struct xt_match quota_mt2_reg[] __read_mostly = { .match = quota_mt2, .destroy = quota_mt2_destroy, .matchsize = sizeof(struct xt_quota_mtinfo2), + .usersize = offsetof(struct xt_quota_mtinfo2, master), .me = THIS_MODULE, }, }; diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 7720b036d76a..d46dc9ff591f 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -135,6 +135,7 @@ static struct xt_match xt_rateest_mt_reg __read_mostly = { .checkentry = xt_rateest_mt_checkentry, .destroy = xt_rateest_mt_destroy, .matchsize = sizeof(struct xt_rateest_match_info), + .usersize = offsetof(struct xt_rateest_match_info, est1), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index cd53b861a15c..ffe673c6a248 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -156,7 +156,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) /* * Drop entries with timestamps older then 'time'. */ -static void recent_entry_reap(struct recent_table *t, unsigned long time) +static void recent_entry_reap(struct recent_table *t, unsigned long time, + struct recent_entry *working, bool update) { struct recent_entry *e; @@ -166,6 +167,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time) e = list_entry(t->lru_list.next, struct recent_entry, lru_list); /* + * Do not reap the entry which are going to be updated. + */ + if (e == working && update) + return; + + /* * The last time stamp is the most recent. */ if (time_after(time, e->stamps[e->index-1])) @@ -307,7 +314,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) /* info->seconds must be non-zero */ if (info->check_set & XT_RECENT_REAP) - recent_entry_reap(t, time); + recent_entry_reap(t, time, e, + info->check_set & XT_RECENT_UPDATE && ret); } if (info->check_set & XT_RECENT_SET || diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 939821821fcb..c5d930670cf0 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -112,14 +112,15 @@ extract_icmp4_fields(const struct sk_buff *skb, * box. */ static struct sock * -xt_socket_get_sock_v4(struct net *net, const u8 protocol, +xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return __inet_lookup(net, &tcp_hashinfo, + return __inet_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -148,6 +149,8 @@ struct sock *xt_socket_lookup_slow_v4(struct net *net, const struct net_device *indev) { const struct iphdr *iph = ip_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; struct sock *sk = skb->sk; __be32 uninitialized_var(daddr), uninitialized_var(saddr); __be16 uninitialized_var(dport), uninitialized_var(sport); @@ -173,6 +176,10 @@ struct sock *xt_socket_lookup_slow_v4(struct net *net, sport = hp->source; daddr = iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = iph->protocol == IPPROTO_TCP ? + ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) : + ip_hdrlen(skb) + sizeof(*hp); } else if (iph->protocol == IPPROTO_ICMP) { if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, @@ -205,9 +212,9 @@ struct sock *xt_socket_lookup_slow_v4(struct net *net, if (sk) atomic_inc(&sk->sk_refcnt); else - sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol, - saddr, daddr, sport, dport, - indev); + sk = xt_socket_get_sock_v4(dev_net(skb->dev), data_skb, doff, + protocol, saddr, daddr, sport, + dport, indev); return sk; } @@ -328,14 +335,15 @@ extract_icmp6_fields(const struct sk_buff *skb, } static struct sock * -xt_socket_get_sock_v6(struct net *net, const u8 protocol, +xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, &tcp_hashinfo, + return inet6_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -354,6 +362,8 @@ struct sock *xt_socket_lookup_slow_v6(struct net *net, __be16 uninitialized_var(dport), uninitialized_var(sport); const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; int thoff = 0, tproto; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); @@ -375,6 +385,10 @@ struct sock *xt_socket_lookup_slow_v6(struct net *net, sport = hp->source; daddr = &iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = tproto == IPPROTO_TCP ? + thoff + __tcp_hdrlen((struct tcphdr *)hp) : + thoff + sizeof(*hp); } else if (tproto == IPPROTO_ICMPV6) { struct ipv6hdr ipv6_var; @@ -389,8 +403,8 @@ struct sock *xt_socket_lookup_slow_v6(struct net *net, if (sk) atomic_inc(&sk->sk_refcnt); else - sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto, - saddr, daddr, sport, dport, + sk = xt_socket_get_sock_v6(dev_net(skb->dev), data_skb, doff, + tproto, saddr, daddr, sport, dport, indev); return sk; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 11de55e7a868..8710fdba2ae2 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -84,6 +84,7 @@ static struct xt_match xt_statistic_mt_reg __read_mostly = { .checkentry = statistic_mt_check, .destroy = statistic_mt_destroy, .matchsize = sizeof(struct xt_statistic_info), + .usersize = offsetof(struct xt_statistic_info, master), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 0bc3460319c8..423293ee57c2 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -77,6 +77,7 @@ static struct xt_match xt_string_mt_reg __read_mostly = { .match = string_mt, .destroy = string_mt_destroy, .matchsize = sizeof(struct xt_string_info), + .usersize = offsetof(struct xt_string_info, config), .me = THIS_MODULE, }; diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 7fd1104ba900..422fac2a4a3c 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -163,8 +163,8 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, return -ENOMEM; doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL); if (doi_def->map.std == NULL) { - ret_val = -ENOMEM; - goto add_std_failure; + kfree(doi_def); + return -ENOMEM; } doi_def->type = CIPSO_V4_MAP_TRANS; @@ -205,14 +205,14 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, } doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; } doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; @@ -279,7 +279,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, doi_def->map.std->cat.local = kcalloc( doi_def->map.std->cat.local_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; @@ -287,7 +287,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, doi_def->map.std->cat.cipso = kcalloc( doi_def->map.std->cat.cipso_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index bfa2b6d5b5cf..25ab12e25e05 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -605,6 +605,12 @@ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, if ((off & (BITS_PER_LONG - 1)) != 0) return -EINVAL; + /* a null catmap is equivalent to an empty one */ + if (!catmap) { + *offset = (u32)-1; + return 0; + } + if (off < catmap->startbit) { off = catmap->startbit; *offset = off; diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 13f777f20995..5f1218dc9162 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -92,6 +92,7 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { static int netlbl_mgmt_add_common(struct genl_info *info, struct netlbl_audit *audit_info) { + void *pmap = NULL; int ret_val = -EINVAL; struct netlbl_domaddr_map *addrmap = NULL; struct cipso_v4_doi *cipsov4 = NULL; @@ -165,6 +166,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_addrmap; } + pmap = map; map->list.addr = addr->s_addr & mask->s_addr; map->list.mask = mask->s_addr; map->list.valid = 1; @@ -173,10 +175,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->def.cipso = cipsov4; ret_val = netlbl_af4list_add(&map->list, &addrmap->list4); - if (ret_val != 0) { - kfree(map); - goto add_free_addrmap; - } + if (ret_val != 0) + goto add_free_map; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; entry->def.addrsel = addrmap; @@ -212,6 +212,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_addrmap; } + pmap = map; map->list.addr = *addr; map->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; map->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; @@ -222,10 +223,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->def.type = entry->def.type; ret_val = netlbl_af6list_add(&map->list, &addrmap->list6); - if (ret_val != 0) { - kfree(map); - goto add_free_addrmap; - } + if (ret_val != 0) + goto add_free_map; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; entry->def.addrsel = addrmap; @@ -234,10 +233,12 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) - goto add_free_addrmap; + goto add_free_map; return 0; +add_free_map: + kfree(pmap); add_free_addrmap: kfree(addrmap); add_doi_put_def: diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 9f4ec16abfcf..5210f5546e3e 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1185,12 +1185,13 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; - u32 iter_bkt; - u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; + u32 skip_addr4 = cb->args[2]; + u32 iter_bkt, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; struct list_head *iter_list; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) + u32 skip_addr6 = cb->args[3]; struct netlbl_af6list *addr6; #endif @@ -1201,7 +1202,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, rcu_read_lock(); for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; - iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { + iter_bkt++) { iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || @@ -1209,7 +1210,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { - if (iter_addr4++ < cb->args[2]) + if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1222,10 +1223,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr4 = 0; + skip_addr4 = 0; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { - if (iter_addr6++ < cb->args[3]) + if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1238,8 +1241,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr6 = 0; + skip_addr6 = 0; #endif /* IPv6 */ } + iter_chain = 0; + skip_chain = 0; } unlabel_staticlist_return: diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6e5c14309fb8..98d8453eee27 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -419,11 +419,13 @@ void netlink_table_ungrab(void) static inline void netlink_lock_table(void) { + unsigned long flags; + /* read_lock() synchronizes us to netlink_table_grab */ - read_lock(&nl_table_lock); + read_lock_irqsave(&nl_table_lock, flags); atomic_inc(&nl_table_users); - read_unlock(&nl_table_lock); + read_unlock_irqrestore(&nl_table_lock, flags); } static inline void @@ -556,7 +558,10 @@ static int netlink_insert(struct sock *sk, u32 portid) /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); - nlk_sk(sk)->bound = portid; + /* Paired with lockless reads from netlink_bind(), + * netlink_connect() and netlink_sendmsg(). + */ + WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); @@ -971,7 +976,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, else if (nlk->ngroups < 8*sizeof(groups)) groups &= (1UL << nlk->ngroups) - 1; - bound = nlk->bound; + /* Paired with WRITE_ONCE() in netlink_insert() */ + bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); @@ -983,7 +989,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nlk->netlink_bind && groups) { int group; - for (group = 0; group < nlk->ngroups; group++) { + /* nl_groups is a u32, so cap the maximum groups we can bind */ + for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); @@ -1002,7 +1009,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { - netlink_undo_bind(nlk->ngroups, groups, sk); + netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); return err; } } @@ -1050,8 +1057,9 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, /* No need for barriers here as we return to user-space without * using any of the bound attributes. + * Paired with WRITE_ONCE() in netlink_insert(). */ - if (!nlk->bound) + if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { @@ -1776,6 +1784,11 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; + if (len == 0) { + pr_warn_once("Zero length message leads to an empty skb\n"); + return -ENODATA; + } + err = scm_send(sock, msg, &scm, true); if (err < 0) return err; @@ -1798,7 +1811,8 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->bound) { + /* Paired with WRITE_ONCE() in netlink_insert() */ + if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; @@ -2382,13 +2396,15 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); + if (err == -ESRCH) + err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); - if (!err || err == -ESRCH) + if (!err) err = err2; } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 8336c9607a70..3fc00c320a9f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1007,63 +1007,11 @@ static struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; -static int genl_bind(struct net *net, int group) -{ - int i, err = 0; - - down_read(&cb_lock); - for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { - struct genl_family *f; - - list_for_each_entry(f, genl_family_chain(i), family_list) { - if (group >= f->mcgrp_offset && - group < f->mcgrp_offset + f->n_mcgrps) { - int fam_grp = group - f->mcgrp_offset; - - if (!f->netnsok && net != &init_net) - err = -ENOENT; - else if (f->mcast_bind) - err = f->mcast_bind(net, fam_grp); - else - err = 0; - break; - } - } - } - up_read(&cb_lock); - - return err; -} - -static void genl_unbind(struct net *net, int group) -{ - int i; - - down_read(&cb_lock); - for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { - struct genl_family *f; - - list_for_each_entry(f, genl_family_chain(i), family_list) { - if (group >= f->mcgrp_offset && - group < f->mcgrp_offset + f->n_mcgrps) { - int fam_grp = group - f->mcgrp_offset; - - if (f->mcast_unbind) - f->mcast_unbind(net, fam_grp); - break; - } - } - } - up_read(&cb_lock); -} - static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, - .bind = genl_bind, - .unbind = genl_unbind, }; /* we'll bump the group number right afterwards */ diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index d72a4f1558f2..ef6a3d586591 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -199,6 +199,7 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, /* refcount initialized at 1 */ spin_unlock_bh(&nr_node_list_lock); + nr_neigh_put(nr_neigh); return 0; } nr_node_lock(nr_node); diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index f0ecaec1ff3d..d1a0b7056743 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -125,11 +125,9 @@ static void nr_heartbeat_expiry(unsigned long param) is accepted() it isn't 'dead' so doesn't get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { - sock_hold(sk); bh_unlock_sock(sk); nr_destroy_socket(sk); - sock_put(sk); - return; + goto out; } break; @@ -150,6 +148,8 @@ static void nr_heartbeat_expiry(unsigned long param) nr_start_heartbeat(sk); bh_unlock_sock(sk); +out: + sock_put(sk); } static void nr_t2timer_expiry(unsigned long param) @@ -163,6 +163,7 @@ static void nr_t2timer_expiry(unsigned long param) nr_enquiry_response(sk); } bh_unlock_sock(sk); + sock_put(sk); } static void nr_t4timer_expiry(unsigned long param) @@ -172,6 +173,7 @@ static void nr_t4timer_expiry(unsigned long param) bh_lock_sock(sk); nr_sk(sk)->condition &= ~NR_COND_PEER_RX_BUSY; bh_unlock_sock(sk); + sock_put(sk); } static void nr_idletimer_expiry(unsigned long param) @@ -200,6 +202,7 @@ static void nr_idletimer_expiry(unsigned long param) sock_set_flag(sk, SOCK_DEAD); } bh_unlock_sock(sk); + sock_put(sk); } static void nr_t1timer_expiry(unsigned long param) @@ -212,8 +215,7 @@ static void nr_t1timer_expiry(unsigned long param) case NR_STATE_1: if (nr->n2count == nr->n2) { nr_disconnect(sk, ETIMEDOUT); - bh_unlock_sock(sk); - return; + goto out; } else { nr->n2count++; nr_write_internal(sk, NR_CONNREQ); @@ -223,8 +225,7 @@ static void nr_t1timer_expiry(unsigned long param) case NR_STATE_2: if (nr->n2count == nr->n2) { nr_disconnect(sk, ETIMEDOUT); - bh_unlock_sock(sk); - return; + goto out; } else { nr->n2count++; nr_write_internal(sk, NR_DISCREQ); @@ -234,8 +235,7 @@ static void nr_t1timer_expiry(unsigned long param) case NR_STATE_3: if (nr->n2count == nr->n2) { nr_disconnect(sk, ETIMEDOUT); - bh_unlock_sock(sk); - return; + goto out; } else { nr->n2count++; nr_requeue_frames(sk); @@ -244,5 +244,7 @@ static void nr_t1timer_expiry(unsigned long param) } nr_start_t1timer(sk); +out: bh_unlock_sock(sk); + sock_put(sk); } diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 54e40fa47822..1859b8e98ded 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -72,6 +72,9 @@ int nfc_proto_register(const struct nfc_protocol *nfc_proto) proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); + if (rc) + proto_unregister(nfc_proto->proto); + return rc; } EXPORT_SYMBOL(nfc_proto_register); diff --git a/net/nfc/core.c b/net/nfc/core.c index 1471e4b0aa2c..8c7f221e1d12 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -106,13 +106,13 @@ int nfc_dev_up(struct nfc_dev *dev) device_lock(&dev->dev); - if (dev->rfkill && rfkill_blocked(dev->rfkill)) { - rc = -ERFKILL; + if (!device_is_registered(&dev->dev)) { + rc = -ENODEV; goto error; } - if (!device_is_registered(&dev->dev)) { - rc = -ENODEV; + if (dev->rfkill && rfkill_blocked(dev->rfkill)) { + rc = -ERFKILL; goto error; } @@ -1120,11 +1120,7 @@ int nfc_register_device(struct nfc_dev *dev) if (rc) pr_err("Could not register llcp device\n"); - rc = nfc_genl_device_added(dev); - if (rc) - pr_debug("The userspace won't be notified that the device %s was added\n", - dev_name(&dev->dev)); - + device_lock(&dev->dev); dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev, RFKILL_TYPE_NFC, &nfc_rfkill_ops, dev); if (dev->rfkill) { @@ -1133,6 +1129,12 @@ int nfc_register_device(struct nfc_dev *dev) dev->rfkill = NULL; } } + device_unlock(&dev->dev); + + rc = nfc_genl_device_added(dev); + if (rc) + pr_debug("The userspace won't be notified that the device %s was added\n", + dev_name(&dev->dev)); return 0; } @@ -1149,10 +1151,17 @@ void nfc_unregister_device(struct nfc_dev *dev) pr_debug("dev_name=%s\n", dev_name(&dev->dev)); + rc = nfc_genl_device_removed(dev); + if (rc) + pr_debug("The userspace won't be notified that the device %s " + "was removed\n", dev_name(&dev->dev)); + + device_lock(&dev->dev); if (dev->rfkill) { rfkill_unregister(dev->rfkill); rfkill_destroy(dev->rfkill); } + device_unlock(&dev->dev); if (dev->ops->check_presence) { device_lock(&dev->dev); @@ -1162,11 +1171,6 @@ void nfc_unregister_device(struct nfc_dev *dev) cancel_work_sync(&dev->check_pres_work); } - rc = nfc_genl_device_removed(dev); - if (rc) - pr_debug("The userspace won't be notified that the device %s " - "was removed\n", dev_name(&dev->dev)); - nfc_llcp_unregister_device(dev); mutex_lock(&nfc_devlist_mutex); diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 23c2a118ac9f..28c60e291c7e 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -280,6 +280,7 @@ int digital_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param) static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) { struct digital_tg_mdaa_params *params; + int rc; params = kzalloc(sizeof(struct digital_tg_mdaa_params), GFP_KERNEL); if (!params) @@ -294,8 +295,12 @@ static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) get_random_bytes(params->nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2); params->sc = DIGITAL_SENSF_FELICA_SC; - return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, - 500, digital_tg_recv_atr_req, NULL); + rc = digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, + 500, digital_tg_recv_atr_req, NULL); + if (rc) + kfree(params); + + return rc; } static int digital_tg_listen_md(struct nfc_digital_dev *ddev, u8 rf_tech) diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index f72be7433df3..46375ff214c0 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -1187,6 +1187,8 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, } rc = nfc_tm_data_received(ddev->nfc_dev, resp); + if (rc) + resp = NULL; exit: kfree_skb(ddev->chaining_skb); diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index fb58ed2dd41d..082dd95f6ef3 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -473,8 +473,12 @@ static int digital_in_send_sdd_req(struct nfc_digital_dev *ddev, *skb_put(skb, sizeof(u8)) = sel_cmd; *skb_put(skb, sizeof(u8)) = DIGITAL_SDD_REQ_SEL_PAR; - return digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, - target); + rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, + target); + if (rc) + kfree_skb(skb); + + return rc; } static void digital_in_recv_sens_res(struct nfc_digital_dev *ddev, void *arg, diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 5a58f9f38095..291f24fef19a 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -193,13 +193,20 @@ exit: void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, struct sk_buff *skb) { - u8 gate = hdev->pipes[pipe].gate; u8 status = NFC_HCI_ANY_OK; struct hci_create_pipe_resp *create_info; struct hci_delete_pipe_noti *delete_info; struct hci_all_pipe_cleared_noti *cleared_info; + u8 gate; - pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd); + pr_debug("from pipe %x cmd %x\n", pipe, cmd); + + if (pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + + gate = hdev->pipes[pipe].gate; switch (cmd) { case NFC_HCI_ADM_NOTIFY_PIPE_CREATED: @@ -387,8 +394,14 @@ void nfc_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe, u8 event, struct sk_buff *skb) { int r = 0; - u8 gate = hdev->pipes[pipe].gate; + u8 gate; + + if (pipe >= NFC_HCI_MAX_PIPES) { + pr_err("Discarded event %x to invalid pipe %x\n", event, pipe); + goto exit; + } + gate = hdev->pipes[pipe].gate; if (gate == NFC_HCI_INVALID_GATE) { pr_err("Discarded event %x to unopened pipe %x\n", event, pipe); goto exit; diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 44d6b8355eab..1d61a08eafaf 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -119,13 +119,19 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { + nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; + llcp_sock->dev = NULL; ret = -ENOMEM; goto put_dev; } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { + nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; + llcp_sock->dev = NULL; ret = -EADDRINUSE; goto put_dev; } @@ -677,6 +683,10 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, ret = -EISCONN; goto error; } + if (sk->sk_state == LLCP_CONNECTING) { + ret = -EINPROGRESS; + goto error; + } dev = nfc_get_device(addr->dev_idx); if (dev == NULL) { @@ -708,6 +718,8 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, llcp_sock->local = nfc_llcp_local_get(local); llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { + nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; ret = -ENOMEM; goto put_dev; } @@ -745,8 +757,12 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, sock_unlink: nfc_llcp_put_ssap(local, llcp_sock->ssap); + nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; nfc_llcp_sock_unlink(&local->connecting_sockets, sk); + kfree(llcp_sock->service_name); + llcp_sock->service_name = NULL; put_dev: nfc_put_device(dev); @@ -774,6 +790,11 @@ static int llcp_sock_sendmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); + if (!llcp_sock->local) { + release_sock(sk); + return -ENODEV; + } + if (sk->sk_type == SOCK_DGRAM) { DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, addr, msg->msg_name); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 67583ad7f610..d5d215776980 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -149,12 +149,15 @@ inline int nci_request(struct nci_dev *ndev, { int rc; - if (!test_bit(NCI_UP, &ndev->flags)) - return -ENETDOWN; - /* Serialize all requests */ mutex_lock(&ndev->req_lock); - rc = __nci_request(ndev, req, opt, timeout); + /* check the state after obtaing the lock against any races + * from nci_close_device when the device gets removed. + */ + if (test_bit(NCI_UP, &ndev->flags)) + rc = __nci_request(ndev, req, opt, timeout); + else + rc = -ENETDOWN; mutex_unlock(&ndev->req_lock); return rc; @@ -398,6 +401,11 @@ static int nci_open_device(struct nci_dev *ndev) mutex_lock(&ndev->req_lock); + if (test_bit(NCI_UNREG, &ndev->flags)) { + rc = -ENODEV; + goto done; + } + if (test_bit(NCI_UP, &ndev->flags)) { rc = -EALREADY; goto done; @@ -461,6 +469,10 @@ done: static int nci_close_device(struct nci_dev *ndev) { nci_req_cancel(ndev, ENODEV); + + /* This mutex needs to be held as a barrier for + * caller nci_unregister_device + */ mutex_lock(&ndev->req_lock); if (!test_and_clear_bit(NCI_UP, &ndev->flags)) { @@ -498,8 +510,8 @@ static int nci_close_device(struct nci_dev *ndev) /* Flush cmd wq */ flush_workqueue(ndev->cmd_wq); - /* Clear flags */ - ndev->flags = 0; + /* Clear flags except NCI_UNREG */ + ndev->flags &= BIT(NCI_UNREG); mutex_unlock(&ndev->req_lock); @@ -610,14 +622,14 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, struct nci_core_conn_create_cmd *cmd; struct core_conn_create_data data; + if (!number_destination_params) + return -EINVAL; + data.length = params_len + sizeof(struct nci_core_conn_create_cmd); cmd = kzalloc(data.length, GFP_KERNEL); if (!cmd) return -ENOMEM; - if (!number_destination_params) - return -EINVAL; - cmd->destination_type = destination_type; cmd->number_destination_params = number_destination_params; memcpy(cmd->params, params, params_len); @@ -1099,6 +1111,7 @@ EXPORT_SYMBOL(nci_allocate_device); void nci_free_device(struct nci_dev *ndev) { nfc_free_device(ndev->nfc_dev); + nci_hci_deallocate(ndev); kfree(ndev); } EXPORT_SYMBOL(nci_free_device); @@ -1178,6 +1191,12 @@ void nci_unregister_device(struct nci_dev *ndev) { struct nci_conn_info *conn_info, *n; + /* This set_bit is not protected with specialized barrier, + * However, it is fine because the mutex_lock(&ndev->req_lock); + * in nci_close_device() will help to emit one. + */ + set_bit(NCI_UNREG, &ndev->flags); + nci_close_device(ndev); destroy_workqueue(ndev->cmd_wq); diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 2aedac15cb59..309e8cebed55 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -798,3 +798,8 @@ struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev) return hdev; } + +void nci_hci_deallocate(struct nci_dev *ndev) +{ + kfree(ndev->hci_dev); +} diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index 9b6eb913d801..74e4d5e8c275 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -274,6 +274,8 @@ static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_id); if (conn_info) { list_del(&conn_info->list); + if (conn_info == ndev->rf_conn_info) + ndev->rf_conn_info = NULL; devm_kfree(&ndev->nfc_dev->dev, conn_info); } } diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 21d8875673a4..c3f2faa0210e 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -355,7 +355,7 @@ static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data, nu->rx_packet_len = -1; nu->rx_skb = nci_skb_alloc(nu->ndev, NCI_MAX_PACKET_SIZE, - GFP_KERNEL); + GFP_ATOMIC); if (!nu->rx_skb) return -ENOMEM; } diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 04d4c388a7a8..4286b900a306 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -62,7 +62,10 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, + [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, + [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 }, + [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, }; @@ -629,8 +632,10 @@ static int nfc_genl_dump_devices_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; - nfc_device_iter_exit(iter); - kfree(iter); + if (iter) { + nfc_device_iter_exit(iter); + kfree(iter); + } return 0; } @@ -847,6 +852,7 @@ static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info) if (!dev->polling) { device_unlock(&dev->dev); + nfc_put_device(dev); return -EINVAL; } @@ -1187,7 +1193,7 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) u32 idx; char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); @@ -1362,8 +1368,10 @@ static int nfc_genl_dump_ses_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; - nfc_device_iter_exit(iter); - kfree(iter); + if (iter) { + nfc_device_iter_exit(iter); + kfree(iter); + } return 0; } diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index e386e6c90b17..2fba626a0125 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -117,7 +117,7 @@ static int rawsock_connect(struct socket *sock, struct sockaddr *_addr, if (addr->target_idx > dev->target_next_idx - 1 || addr->target_idx < dev->target_next_idx - dev->n_targets) { rc = -EINVAL; - goto error; + goto put_dev; } rc = nfc_activate_target(dev, addr->target_idx, addr->nfc_protocol); @@ -344,10 +344,13 @@ static int rawsock_create(struct net *net, struct socket *sock, if ((sock->type != SOCK_SEQPACKET) && (sock->type != SOCK_RAW)) return -ESOCKTNOSUPPORT; - if (sock->type == SOCK_RAW) + if (sock->type == SOCK_RAW) { + if (!ns_capable(net->user_ns, CAP_NET_RAW)) + return -EPERM; sock->ops = &rawsock_raw_ops; - else + } else { sock->ops = &rawsock_ops; + } sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 7cb8184ac165..b8f2ee52e64e 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -137,11 +137,22 @@ static bool is_flow_key_valid(const struct sw_flow_key *key) return !!key->eth.type; } +static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, + __be16 ethertype) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be16 diff[] = { ~(hdr->h_proto), ethertype }; + + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); + } + + hdr->h_proto = ethertype; +} + static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_mpls *mpls) { __be32 *new_mpls_lse; - struct ethhdr *hdr; /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ if (skb->encapsulation) @@ -160,9 +171,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); - hdr = eth_hdr(skb); - hdr->h_proto = mpls->mpls_ethertype; - + update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); if (!skb->inner_protocol) skb_set_inner_protocol(skb, skb->protocol); skb->protocol = mpls->mpls_ethertype; @@ -193,7 +202,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, * field correctly in the presence of VLAN tags. */ hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); - hdr->h_proto = ethertype; + update_ethertype(skb, hdr, ethertype); if (eth_p_mpls(skb->protocol)) skb->protocol = ethertype; @@ -217,8 +226,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (skb->ip_summed == CHECKSUM_COMPLETE) { __be32 diff[] = { ~(*stack), lse }; - skb->csum = ~csum_partial((char *)diff, sizeof(diff), - ~skb->csum); + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); } *stack = lse; @@ -686,16 +694,16 @@ static void ovs_fragment(struct net *net, struct vport *vport, } if (ethertype == htons(ETH_P_IP)) { - struct dst_entry ovs_dst; + struct rtable ovs_rt = { 0 }; unsigned long orig_dst; prepare_frag(vport, skb); - dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOCOUNT); - ovs_dst.dev = vport->dev; + ovs_rt.dst.dev = vport->dev; orig_dst = skb->_skb_refdst; - skb_dst_set_noref(skb, &ovs_dst); + skb_dst_set_noref(skb, &ovs_rt.dst); IPCB(skb)->frag_max_size = mru; ip_do_fragment(net, skb->sk, skb, ovs_vport_output); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index caa23ee913f0..82ca7fe7a163 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -725,9 +725,13 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, { size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); - /* OVS_FLOW_ATTR_UFID */ + /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback + * see ovs_nla_put_identifier() + */ if (sfid && ovs_identifier_is_ufid(sfid)) len += nla_total_size(sfid->ufid_len); + else + len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_KEY */ if (!sfid || should_fill_key(sfid, ufid_flags)) @@ -900,7 +904,10 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, info->snd_portid, info->snd_seq, 0, cmd, ufid_flags); - BUG_ON(retval < 0); + if (WARN_ON_ONCE(retval < 0)) { + kfree_skb(skb); + skb = ERR_PTR(retval); + } return skb; } @@ -1318,7 +1325,10 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) OVS_FLOW_CMD_DEL, ufid_flags); rcu_read_unlock(); - BUG_ON(err < 0); + if (WARN_ON_ONCE(err < 0)) { + kfree_skb(reply); + goto out_free; + } ovs_notify(&dp_flow_genl_family, reply, info); } else { @@ -1326,6 +1336,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) } } +out_free: ovs_flow_free(flow, true); return 0; unlock: diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 1d055c559eaf..03378e75a67c 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -55,7 +55,7 @@ struct ovs_tunnel_info { FIELD_SIZEOF(struct sw_flow_key, recirc_id)) struct sw_flow_key { - u8 tun_opts[255]; + u8 tun_opts[IP_TUNNEL_OPTS_MAX]; u8 tun_opts_len; struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0dd9fc3f57e8..46bf6f84e0e6 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -587,7 +587,8 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, msec = 1; div = speed / 1000; } - } + } else + return DEFAULT_PRB_RETIRE_TOV; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); @@ -1331,15 +1332,21 @@ static void packet_sock_destruct(struct sock *sk) static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { - u32 rxhash; + u32 *history = po->rollover->history; + u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) - if (po->rollover->history[i] == rxhash) + if (READ_ONCE(history[i]) == rxhash) count++; - po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + victim = prandom_u32() % ROLLOVER_HLEN; + + /* Avoid dirtying the cache line if possible */ + if (READ_ONCE(history[victim]) != rxhash) + WRITE_ONCE(history[victim], rxhash); + return count > (ROLLOVER_HLEN >> 1); } @@ -1598,13 +1605,9 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, if (copy_from_user(&fd, data, len)) return -EFAULT; - new = bpf_prog_get(fd); + new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); - if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(new); - return -EINVAL; - } __fanout_set_data_bpf(po->fanout, new); return 0; @@ -1702,6 +1705,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; + match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; list_add(&match->list, &fanout_list); } @@ -3160,6 +3164,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; + po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; @@ -3308,20 +3313,29 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { + int copy_len; + /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); + copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); + copy_len = msg->msg_namelen; + if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { + memset(msg->msg_name + + offsetof(struct sockaddr_ll, sll_addr), + 0, sizeof(sll->sll_addr)); + msg->msg_namelen = sizeof(struct sockaddr_ll); + } } - memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, - msg->msg_namelen); + memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (pkt_sk(sk)->auxdata) { diff --git a/net/phonet/pep.c b/net/phonet/pep.c index f6aa532bcbf6..a734d47c5eb1 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -878,6 +878,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) err = pep_accept_conn(newsk, skb); if (err) { + __sock_put(sk); sock_put(newsk); newsk = NULL; goto drop; @@ -956,6 +957,8 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) ret = -EBUSY; else if (sk->sk_state == TCP_ESTABLISHED) ret = -EISCONN; + else if (!pn->pn_sk.sobject) + ret = -EADDRNOTAVAIL; else ret = pep_sock_enable(sk, NULL, 0); release_sock(sk); diff --git a/net/phonet/socket.c b/net/phonet/socket.c index d575ef4e9aa6..ffd5f2297584 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -140,13 +140,15 @@ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb) rcu_read_unlock(); } -void pn_sock_hash(struct sock *sk) +int pn_sock_hash(struct sock *sk) { struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject); mutex_lock(&pnsocks.lock); sk_add_node_rcu(sk, hlist); mutex_unlock(&pnsocks.lock); + + return 0; } EXPORT_SYMBOL(pn_sock_hash); @@ -200,7 +202,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) pn->resource = spn->spn_resource; /* Enable RX on the socket */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); out_port: mutex_unlock(&port_mutex); out: diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index d77e04473056..a88460058185 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -42,7 +42,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", - "s_ib_evt_handler_call", + "ib_evt_handler_call", "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", diff --git a/net/rds/recv.c b/net/rds/recv.c index 6275de19689c..1ff4bc3237f0 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -301,12 +301,13 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) { struct rds_notifier *notifier; - struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */ + struct rds_rdma_notify cmsg; unsigned int count = 0, max_messages = ~0U; unsigned long flags; LIST_HEAD(copy); int err = 0; + memset(&cmsg, 0, sizeof(cmsg)); /* fill holes with zero */ /* put_cmsg copies to user space and thus may sleep. We can't do this * with rs_lock held, so first grab as many notifications as we can stuff @@ -481,7 +482,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (rds_cmsg_recv(inc, msg)) { ret = -EFAULT; - goto out; + break; } rds_stats_inc(s_recv_delivered); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index c10622a9321c..465756fe7958 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -110,7 +110,7 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* * This is the only path that sets tc->t_sock. Send and receive trust that - * it is set. The RDS_CONN_CONNECTED bit protects those paths from being + * it is set. The RDS_CONN_UP bit protects those paths from being * called while it isn't set. */ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index e353e3255206..5213cd781c24 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -115,24 +115,32 @@ int rds_tcp_accept_one(struct socket *sock) * rds_tcp_state_change() will do that cleanup */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; - if (rs_tcp->t_sock && - ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { - struct sock *nsk = new_sock->sk; - - nsk->sk_user_data = NULL; - nsk->sk_prot->disconnect(nsk, 0); - tcp_done(nsk); - new_sock = NULL; - ret = 0; - goto out; - } else if (rs_tcp->t_sock) { - rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); - conn->c_outgoing = 0; - } - rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); + if (rs_tcp->t_sock) { + /* Need to resolve a duelling SYN between peers. + * We have an outstanding SYN to this peer, which may + * potentially have transitioned to the RDS_CONN_UP state, + * so we must quiesce any send threads before resetting + * c_transport_data. + */ + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); + if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { + struct sock *nsk = new_sock->sk; + + nsk->sk_user_data = NULL; + nsk->sk_prot->disconnect(nsk, 0); + tcp_done(nsk); + new_sock = NULL; + ret = 0; + goto out; + } else if (rs_tcp->t_sock) { + rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); + conn->c_outgoing = 0; + } + } rds_tcp_set_callbacks(new_sock, conn); - rds_connect_complete(conn); + rds_connect_complete(conn); /* marks RDS_CONN_UP */ new_sock = NULL; ret = 0; diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 71e1f0def5a5..8662a976f666 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -936,10 +936,13 @@ static void rfkill_sync_work(struct work_struct *work) int __must_check rfkill_register(struct rfkill *rfkill) { static unsigned long rfkill_no; - struct device *dev = &rfkill->dev; + struct device *dev; int error; - BUG_ON(!rfkill); + if (!rfkill) + return -EINVAL; + + dev = &rfkill->dev; mutex_lock(&rfkill_global_mutex); diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 344456206b70..0f371e50d9c4 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -99,10 +99,19 @@ static void rose_loopback_timer(unsigned long param) } if (frametype == ROSE_CALL_REQUEST) { - if ((dev = rose_dev_get(dest)) != NULL) { - if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) - kfree_skb(skb); - } else { + if (!rose_loopback_neigh->dev) { + kfree_skb(skb); + continue; + } + + dev = rose_dev_get(dest); + if (!dev) { + kfree_skb(skb); + continue; + } + + if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) { + dev_put(dev); kfree_skb(skb); } } else { diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 91d43ab3a961..f4ad63d6e540 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -897,7 +897,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) _enter(""); - if (optlen <= 0 || optlen > PAGE_SIZE - 1) + if (optlen <= 0 || optlen > PAGE_SIZE - 1 || rx->securities) return -EINVAL; description = kmalloc(optlen + 1, GFP_KERNEL); @@ -1114,8 +1114,9 @@ static long rxrpc_read(const struct key *key, break; default: /* we have a ticket we can't encode */ - BUG(); - continue; + pr_err("Unsupported key token type (%u)\n", + token->security_index); + return -ENOPKG; } _debug("token[%u]: toksize=%u", ntoks, toksize); @@ -1149,6 +1150,14 @@ static long rxrpc_read(const struct key *key, goto fault; \ xdr += (_l + 3) >> 2; \ } while(0) +#define ENCODE_BYTES(l, s) \ + do { \ + u32 _l = (l); \ + memcpy(xdr, (s), _l); \ + if (_l & 3) \ + memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ + xdr += (_l + 3) >> 2; \ + } while(0) #define ENCODE64(x) \ do { \ __be64 y = cpu_to_be64(x); \ @@ -1177,7 +1186,7 @@ static long rxrpc_read(const struct key *key, case RXRPC_SECURITY_RXKAD: ENCODE(token->kad->vice_id); ENCODE(token->kad->kvno); - ENCODE_DATA(8, token->kad->session_key); + ENCODE_BYTES(8, token->kad->session_key); ENCODE(token->kad->start); ENCODE(token->kad->expiry); ENCODE(token->kad->primary_flag); @@ -1227,8 +1236,9 @@ static long rxrpc_read(const struct key *key, break; default: - BUG(); - break; + pr_err("Unsupported key token type (%u)\n", + token->security_index); + return -ENOPKG; } ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==, diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 14c4e12c47b0..a1997f9a447a 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -533,7 +533,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, /* this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + if (sk->sk_shutdown & SEND_SHUTDOWN) return -EPIPE; more = msg->msg_flags & MSG_MORE; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index b92beded7459..cb9affdb1e87 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -78,7 +78,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, release_sock(&rx->sk); if (continue_call) rxrpc_put_call(continue_call); - return -ENODATA; + return -EAGAIN; } } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index bd155e59be1c..143093b95bc3 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -51,9 +51,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); @@ -220,15 +222,10 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); - fp = bpf_prog_get(bpf_fd); + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT); if (IS_ERR(fp)) return PTR_ERR(fp); - if (fp->type != BPF_PROG_TYPE_SCHED_ACT) { - bpf_prog_put(fp); - return -EINVAL; - } - if (tb[TCA_ACT_BPF_NAME]) { name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]), nla_len(tb[TCA_ACT_BPF_NAME]), diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 3eef0215e53f..f674d330ebd1 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -100,15 +100,18 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } if (prog->exts_integrated) { - res->class = prog->res.class; - res->classid = qdisc_skb_cb(skb)->tc_classid; + res->class = 0; + res->classid = TC_H_MAJ(prog->res.classid) | + qdisc_skb_cb(skb)->tc_classid; ret = cls_bpf_exec_opcode(filter_res); if (ret == TC_ACT_UNSPEC) @@ -118,10 +121,12 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (filter_res == 0) continue; - - *res = prog->res; - if (filter_res != -1) + if (filter_res != -1) { + res->class = 0; res->classid = filter_res; + } else { + *res = prog->res; + } ret = tcf_exts_exec(skb, &prog->exts, res); if (ret < 0) @@ -267,15 +272,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); - fp = bpf_prog_get(bpf_fd); + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); if (IS_ERR(fp)) return PTR_ERR(fp); - if (fp->type != BPF_PROG_TYPE_SCHED_CLS) { - bpf_prog_put(fp); - return -EINVAL; - } - if (tb[TCA_BPF_NAME]) { name = kmemdup(nla_data(tb[TCA_BPF_NAME]), nla_len(tb[TCA_BPF_NAME]), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e5a58c82728a..a97096a7f801 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -127,6 +127,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; + flow_dissector_init_keys(&skb_key.control, &skb_key.basic); fl_clear_masked_range(&skb_key, &head->mask); skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, @@ -350,12 +351,10 @@ static int fl_init_hashtable(struct cls_fl_head *head, #define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) #define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) -#define FL_KEY_MEMBER_END_OFFSET(member) \ - (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member)) -#define FL_KEY_IN_RANGE(mask, member) \ - (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \ - FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start) +#define FL_KEY_IS_MASKED(mask, member) \ + memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \ + 0, FL_KEY_MEMBER_SIZE(member)) \ #define FL_KEY_SET(keys, cnt, id, member) \ do { \ @@ -364,9 +363,9 @@ static int fl_init_hashtable(struct cls_fl_head *head, cnt++; \ } while(0); -#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \ +#define FL_KEY_SET_IF_MASKED(mask, keys, cnt, id, member) \ do { \ - if (FL_KEY_IN_RANGE(mask, member)) \ + if (FL_KEY_IS_MASKED(mask, member)) \ FL_KEY_SET(keys, cnt, id, member); \ } while(0); @@ -378,14 +377,14 @@ static void fl_init_dissector(struct cls_fl_head *head, FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); skb_flow_dissector_init(&head->dissector, keys, cnt); } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 08a3b0a6f5ab..4f13c771f36d 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -540,8 +540,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, fp = &b->ht[h]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) { - if (pfp == f) { - *fp = f->next; + if (pfp == fold) { + rcu_assign_pointer(*fp, fold->next); break; } } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 9992dfac6938..7317a64fdb79 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -455,10 +455,8 @@ static u32 gen_tunnel(struct rsvp_head *data) static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { [TCA_RSVP_CLASSID] = { .type = NLA_U32 }, - [TCA_RSVP_DST] = { .type = NLA_BINARY, - .len = RSVP_DST_LEN * sizeof(u32) }, - [TCA_RSVP_SRC] = { .type = NLA_BINARY, - .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) }, [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, }; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 040d853f48b9..3d891b11c077 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -267,6 +267,29 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->fall_through = p->fall_through; cp->tp = tp; + if (tb[TCA_TCINDEX_HASH]) + cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); + + if (tb[TCA_TCINDEX_MASK]) + cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); + + if (tb[TCA_TCINDEX_SHIFT]) { + cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); + if (cp->shift > 16) { + err = -EINVAL; + goto errout; + } + } + if (!cp->hash) { + /* Hash not specified, use perfect hash if the upper limit + * of the hashing index is below the threshold. + */ + if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) + cp->hash = (cp->mask >> cp->shift) + 1; + else + cp->hash = DEFAULT_HASH_SIZE; + } + if (p->perfect) { int i; @@ -274,7 +297,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, sizeof(*r) * cp->hash, GFP_KERNEL); if (!cp->perfect) goto errout; - for (i = 0; i < cp->hash; i++) + cp->alloc_hash = cp->hash; + for (i = 0; i < min(cp->hash, p->hash); i++) tcf_exts_init(&cp->perfect[i].exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); balloc = 1; @@ -286,15 +310,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (old_r) cr.res = r->res; - if (tb[TCA_TCINDEX_HASH]) - cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - - if (tb[TCA_TCINDEX_MASK]) - cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - - if (tb[TCA_TCINDEX_SHIFT]) - cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - err = -EBUSY; /* Hash already allocated, make sure that we still meet the @@ -312,16 +327,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tb[TCA_TCINDEX_FALL_THROUGH]) cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - if (!cp->hash) { - /* Hash not specified, use perfect hash if the upper limit - * of the hashing index is below the threshold. - */ - if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) - cp->hash = (cp->mask >> cp->shift) + 1; - else - cp->hash = DEFAULT_HASH_SIZE; - } - if (!cp->perfect && !cp->h) cp->alloc_hash = cp->hash; diff --git a/net/sched/ematch.c b/net/sched/ematch.c index fbb7ebfc58c6..d4d6f9c91e8c 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -242,6 +242,9 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { + err = -EINVAL; + if (em_hdr->flags & TCF_EM_SIMPLE) + goto errout; err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; @@ -267,12 +270,12 @@ static int tcf_em_validate(struct tcf_proto *tp, } em->data = (unsigned long) v; } + em->datalen = data_len; } } em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; - em->datalen = data_len; em->net = net; err = 0; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a4b492bb7fe5..4ee2e9a3d12e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -391,7 +391,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *ta { struct qdisc_rate_table *rtab; - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + if (tab == NULL || r->rate == 0 || + r->cell_log == 0 || r->cell_log >= 32 || nla_len(tab) != TC_RTAB_SIZE) return NULL; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index e8dcf94a23c8..fee59e25929c 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -396,7 +396,8 @@ static void choke_reset(struct Qdisc *sch) qdisc_drop(skb, sch); } - memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); + if (q->tab) + memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); q->head = q->tail = 0; red_restart(&q->vars); } @@ -422,6 +423,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) struct sk_buff **old = NULL; unsigned int mask; u32 max_P; + u8 *stab; if (opt == NULL) return -EINVAL; @@ -437,8 +439,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; ctl = nla_data(tb[TCA_CHOKE_PARMS]); - - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + stab = nla_data(tb[TCA_CHOKE_STAB]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab)) return -EINVAL; if (ctl->limit > CHOKE_MAX_QUEUE) @@ -491,7 +493,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, - nla_data(tb[TCA_CHOKE_STAB]), + stab, max_P); red_set_vars(&q->vars); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index d6e3ad43cecb..06e42727590a 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -375,6 +375,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl->deficit = cl->quantum; } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return err; } @@ -405,6 +406,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } @@ -426,6 +428,7 @@ static unsigned int drr_drop(struct Qdisc *sch) if (cl->qdisc->ops->drop) { len = cl->qdisc->ops->drop(cl->qdisc); if (len > 0) { + sch->qstats.backlog -= len; sch->q.qlen--; if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); @@ -461,6 +464,7 @@ static void drr_reset_qdisc(struct Qdisc *sch) qdisc_reset(cl->qdisc); } } + sch->qstats.backlog = 0; sch->q.qlen = 0; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 5f8f6d94336c..f5d2c32dae24 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -404,7 +404,8 @@ static void dsmark_reset(struct Qdisc *sch) struct dsmark_qdisc_data *p = qdisc_priv(sch); pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - qdisc_reset(p->q); + if (p->q) + qdisc_reset(p->q); sch->qstats.backlog = 0; sch->q.qlen = 0; } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 2e4bd2c0a50c..6c99b833f665 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -151,6 +151,9 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit) if (strncmp(q->ops->id + 1, "fifo", 4) != 0) return 0; + if (!q->ops->change) + return 0; + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (nla) { nla->nla_type = RTM_NEWQDISC; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 117ed90c5f21..eb814ffc0902 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -668,6 +668,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, + [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt) @@ -706,7 +707,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_QUANTUM]) { u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); - if (quantum > 0) + if (quantum > 0 && quantum <= (1 << 20)) q->quantum = quantum; else err = -EINVAL; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 1800f7977595..70e0dfd21f04 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -588,7 +588,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; } - if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0) + if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; if (idx < q->flows_cnt) return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index eec6dc2d3152..357e52455be6 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -49,6 +49,7 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { q->gso_skb = skb; q->qstats.requeues++; + qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; /* it's still part of the queue */ __netif_schedule(q); @@ -92,6 +93,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; + qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } else skb = NULL; @@ -301,7 +303,12 @@ static void dev_watchdog(unsigned long arg) } } - if (some_queue_timedout) { + /* The noise is pissing off our CI and upstream doesn't + * move on the bug report: + * + * https://bugzilla.kernel.org/show_bug.cgi?id=196399 + */ + if (some_queue_timedout && 0) { WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", dev->name, netdev_drivername(dev), i); dev->netdev_ops->ndo_tx_timeout(dev); @@ -327,6 +334,7 @@ void __netdev_watchdog_up(struct net_device *dev) dev_hold(dev); } } +EXPORT_SYMBOL_GPL(__netdev_watchdog_up); static void dev_watchdog_up(struct net_device *dev) { @@ -624,18 +632,19 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, struct Qdisc *sch; if (!try_module_get(ops->owner)) - goto errout; + return NULL; sch = qdisc_alloc(dev_queue, ops); - if (IS_ERR(sch)) - goto errout; + if (IS_ERR(sch)) { + module_put(ops->owner); + return NULL; + } sch->parent = parentid; if (!ops->init || ops->init(sch, NULL) == 0) return sch; qdisc_destroy(sch); -errout: return NULL; } EXPORT_SYMBOL(qdisc_create_dflt); @@ -962,6 +971,7 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, { memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; + r->mpu = conf->mpu; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); r->mult = 1; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index a5745cb2d014..2f73232031c6 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -389,7 +389,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab)) return -EINVAL; if (!q) { diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d783d7cc3348..1ac9f9f03fe3 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1529,6 +1529,7 @@ hfsc_reset_qdisc(struct Qdisc *sch) q->eligible = RB_ROOT; INIT_LIST_HEAD(&q->droplist); qdisc_watchdog_cancel(&q->watchdog); + sch->qstats.backlog = 0; sch->q.qlen = 0; } @@ -1559,14 +1560,6 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) struct hfsc_sched *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; - struct hfsc_class *cl; - unsigned int i; - - sch->qstats.backlog = 0; - for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) - sch->qstats.backlog += cl->qdisc->qstats.backlog; - } qopt.defcls = q->defcls; if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) @@ -1604,6 +1597,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl->qdisc->q.qlen == 1) set_active(cl, qdisc_pkt_len(skb)); + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -1672,6 +1666,7 @@ hfsc_dequeue(struct Qdisc *sch) qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; @@ -1695,6 +1690,7 @@ hfsc_drop(struct Qdisc *sch) } cl->qstats.drops++; qdisc_qstats_drop(sch); + sch->qstats.backlog -= len; sch->q.qlen--; return len; } diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index d9c84328e7eb..b34992808879 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -195,7 +195,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 66bccc5ff4ea..03c19adb2e61 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -355,7 +355,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, sch->cpu_bstats, + &sch->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index cef36ad691dd..be3d78357fbb 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -351,7 +351,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 0d4630b155fe..81ea3a87bc04 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -87,6 +87,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, qdisc); if (ret == NET_XMIT_SUCCESS) { + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -125,6 +126,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch) struct sk_buff *skb = qdisc_dequeue_peeked(qdisc); if (skb) { qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } @@ -143,6 +145,7 @@ static unsigned int prio_drop(struct Qdisc *sch) for (prio = q->bands-1; prio >= 0; prio--) { qdisc = q->queues[prio]; if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) { + sch->qstats.backlog -= len; sch->q.qlen--; return len; } @@ -159,6 +162,7 @@ prio_reset(struct Qdisc *sch) for (prio = 0; prio < q->bands; prio++) qdisc_reset(q->queues[prio]); + sch->qstats.backlog = 0; sch->q.qlen = 0; q->enable_flow = 1; } @@ -341,7 +345,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8d2d8d953432..d466fab84261 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1150,6 +1150,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch) if (!skb) return NULL; + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; qdisc_bstats_update(sch, skb); @@ -1250,6 +1251,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) } bstats_update(&cl->bstats, skb); + qdisc_qstats_backlog_inc(sch, skb); ++sch->q.qlen; agg = cl->agg; @@ -1477,10 +1479,8 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) if (err < 0) return err; - if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES) - max_classes = QFQ_MAX_AGG_CLASSES; - else - max_classes = qdisc_dev(sch)->tx_queue_len + 1; + max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + 1, + QFQ_MAX_AGG_CLASSES); /* max_cl_shift = floor(log_2(max_classes)) */ max_cl_shift = __fls(max_classes); q->max_agg_classes = 1<<max_cl_shift; @@ -1516,6 +1516,7 @@ static void qfq_reset_qdisc(struct Qdisc *sch) qdisc_reset(cl->qdisc); } } + sch->qstats.backlog = 0; sch->q.qlen = 0; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 4bf2b599ef98..ac85792038c4 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -188,6 +188,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) struct Qdisc *child = NULL; int err; u32 max_P; + u8 *stab; if (opt == NULL) return -EINVAL; @@ -203,7 +204,9 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + stab = nla_data(tb[TCA_RED_STAB]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Scell_log, stab)) return -EINVAL; if (ctl->limit > 0) { @@ -225,7 +228,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, - nla_data(tb[TCA_RED_STAB]), + stab, max_P); red_set_vars(&q->vars); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 10c0b184cdbe..624b5e6fa52f 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -400,6 +400,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) enqueue: ret = qdisc_enqueue(skb, child); if (likely(ret == NET_XMIT_SUCCESS)) { + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; increment_qlen(skb, q); } else if (net_xmit_drop_count(ret)) { @@ -428,6 +429,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch) if (skb) { qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; decrement_qlen(skb, q); } @@ -450,6 +452,7 @@ static void sfb_reset(struct Qdisc *sch) struct sfb_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); + sch->qstats.backlog = 0; sch->q.qlen = 0; q->slot = 0; q->double_buffering = false; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index e2e4ebc0c4c3..0b27487fd07d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -635,8 +635,17 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; + + /* slot->allot is a short, make sure quantum is not too big. */ + if (ctl->quantum) { + unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum); + + if (scaled <= 0 || scaled > SHRT_MAX) + return -EINVAL; + } + if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, - ctl_v1->Wlog)) + ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 05c7a66f64da..87dee4deb66e 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -197,6 +197,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -207,6 +208,7 @@ static unsigned int tbf_drop(struct Qdisc *sch) unsigned int len = 0; if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->qstats.backlog -= len; sch->q.qlen--; qdisc_qstats_drop(sch); } @@ -253,6 +255,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) q->t_c = now; q->tokens = toks; q->ptokens = ptoks; + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); @@ -284,6 +287,7 @@ static void tbf_reset(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); + sch->qstats.backlog = 0; sch->q.qlen = 0; q->t_c = ktime_get_ns(); q->tokens = q->buffer; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index e02687185a59..a7ecf626e998 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -138,6 +138,9 @@ teql_destroy(struct Qdisc *sch) struct teql_sched_data *dat = qdisc_priv(sch); struct teql_master *master = dat->m; + if (!master) + return; + prev = master->slaves; if (prev) { do { diff --git a/net/sctp/associola.c b/net/sctp/associola.c index f085b01b6603..aa38578fdfcd 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1290,7 +1290,7 @@ static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr, if (score_curr > score_best) return curr; else if (score_curr == score_best) - return sctp_trans_elect_tie(curr, best); + return sctp_trans_elect_tie(best, curr); else return best; } @@ -1575,12 +1575,15 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, sctp_scope_t scope, gfp_t gfp) { + struct sock *sk = asoc->base.sk; int flags; /* Use scoping rules to determine the subset of addresses from * the endpoint. */ - flags = (PF_INET6 == asoc->base.sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0; + flags = (PF_INET6 == sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0; + if (!inet_v6_ipv6only(sk)) + flags |= SCTP_ADDR4_ALLOWED; if (asoc->peer.ipv4_address) flags |= SCTP_ADDR4_PEERSUPP; if (asoc->peer.ipv6_address) diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 1543e39f47c3..04cd87d26ed1 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -496,6 +496,7 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) out_err: /* Clean up any successful allocations */ sctp_auth_destroy_hmacs(ep->auth_hmacs); + ep->auth_hmacs = NULL; return -ENOMEM; } diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 871cdf9567e6..40fd399a1035 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -284,19 +284,15 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, rawaddr = (union sctp_addr_param *)raw_addr_list; af = sctp_get_af_specific(param_type2af(param->type)); - if (unlikely(!af)) { + if (unlikely(!af) || + !af->from_addr_param(&addr, rawaddr, htons(port), 0)) { retval = -EINVAL; - sctp_bind_addr_clean(bp); - break; + goto out_err; } - af->from_addr_param(&addr, rawaddr, htons(port), 0); retval = sctp_add_bind_addr(bp, &addr, SCTP_ADDR_SRC, gfp); - if (retval) { - /* Can't finish building the list, clean up. */ - sctp_bind_addr_clean(bp); - break; - } + if (retval) + goto out_err; len = ntohs(param->length); addrs_len -= len; @@ -304,6 +300,12 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, } return retval; + +out_err: + if (retval) + sctp_bind_addr_clean(bp); + + return retval; } /******************************************************************** @@ -449,6 +451,7 @@ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, * well as the remote peer. */ if ((((AF_INET == addr->sa.sa_family) && + (flags & SCTP_ADDR4_ALLOWED) && (flags & SCTP_ADDR4_PEERSUPP))) || (((AF_INET6 == addr->sa.sa_family) && (flags & SCTP_ADDR6_ALLOWED) && diff --git a/net/sctp/input.c b/net/sctp/input.c index 71c2ef84c5b0..3f0b8aafc21a 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -448,7 +448,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk, else { if (!mod_timer(&t->proto_unreach_timer, jiffies + (HZ/20))) - sctp_association_hold(asoc); + sctp_transport_hold(t); } } else { struct net *net = sock_net(sk); @@ -457,7 +457,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk, "encountered!\n", __func__); if (del_timer(&t->proto_unreach_timer)) - sctp_association_put(asoc); + sctp_transport_put(t); sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), @@ -972,7 +972,8 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, if (!af) continue; - af->from_addr_param(paddr, params.addr, sh->source, 0); + if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) + continue; asoc = __sctp_lookup_association(net, laddr, paddr, &transport); if (asoc) @@ -1008,6 +1009,9 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( union sctp_addr_param *param; union sctp_addr paddr; + if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) + return NULL; + /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); @@ -1015,7 +1019,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( if (unlikely(!af)) return NULL; - af->from_addr_param(&paddr, param, peer_port, 0); + if (af->from_addr_param(&paddr, param, peer_port, 0)) + return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp); } @@ -1086,7 +1091,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, ch = (sctp_chunkhdr_t *) ch_end; chunk_num++; - } while (ch_end < skb_tail_pointer(skb)); + } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ae619cffc3a9..62c729402a04 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -234,7 +234,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, { struct sctp_association *asoc = t->asoc; struct dst_entry *dst = NULL; - struct flowi6 *fl6 = &fl->u.ip6; + struct flowi _fl; + struct flowi6 *fl6 = &_fl.u.ip6; struct sctp_bind_addr *bp; struct ipv6_pinfo *np = inet6_sk(sk); struct sctp_sockaddr_entry *laddr; @@ -244,7 +245,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, __u8 matchlen = 0; sctp_scope_t scope; - memset(fl6, 0, sizeof(struct flowi6)); + memset(&_fl, 0, sizeof(_fl)); fl6->daddr = daddr->v6.sin6_addr; fl6->fl6_dport = daddr->v6.sin6_port; fl6->flowi6_proto = IPPROTO_SCTP; @@ -267,9 +268,12 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, fl6, final_p); - if (!asoc || saddr) + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + if (!asoc || saddr) { + t->dst = dst; + memcpy(fl, &_fl, sizeof(_fl)); goto out; + } bp = &asoc->base.bind_addr; scope = sctp_scope(daddr); @@ -292,6 +296,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if ((laddr->a.sa.sa_family == AF_INET6) && (sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) { rcu_read_unlock(); + t->dst = dst; + memcpy(fl, &_fl, sizeof(_fl)); goto out; } } @@ -320,7 +326,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->saddr = laddr->a.v6.sin6_addr; fl6->fl6_sport = laddr->a.v6.sin6_port; final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); - bdst = ip6_dst_lookup_flow(sk, fl6, final_p); + bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(bdst)) continue; @@ -330,6 +336,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (!IS_ERR_OR_NULL(dst)) dst_release(dst); dst = bdst; + t->dst = dst; + memcpy(fl, &_fl, sizeof(_fl)); break; } @@ -343,6 +351,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, dst_release(dst); dst = bdst; matchlen = bmatchlen; + t->dst = dst; + memcpy(fl, &_fl, sizeof(_fl)); } rcu_read_unlock(); @@ -351,14 +361,12 @@ out: struct rt6_info *rt; rt = (struct rt6_info *)dst; - t->dst = dst; t->dst_cookie = rt6_get_cookie(rt); pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", &rt->rt6i_dst.addr, rt->rt6i_dst.plen, - &fl6->saddr); + &fl->u.ip6.saddr); } else { t->dst = NULL; - pr_debug("no route\n"); } } @@ -480,15 +488,20 @@ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) } /* Initialize a sctp_addr from an address parameter. */ -static void sctp_v6_from_addr_param(union sctp_addr *addr, +static bool sctp_v6_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { + if (ntohs(param->v6.param_hdr.length) < sizeof(struct sctp_ipv6addr_param)) + return false; + addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; /* BUG */ addr->v6.sin6_addr = param->v6.addr; addr->v6.sin6_scope_id = iif; + + return true; } /* Initialize an address parameter from a sctp_addr and return the length diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 8816e49fd88b..510b805aab2d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -210,6 +210,7 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, * sock as well as the remote peer. */ if ((((AF_INET == addr->a.sa.sa_family) && + (copy_flags & SCTP_ADDR4_ALLOWED) && (copy_flags & SCTP_ADDR4_PEERSUPP))) || (((AF_INET6 == addr->a.sa.sa_family) && (copy_flags & SCTP_ADDR6_ALLOWED) && @@ -255,6 +256,7 @@ static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk) addr->v4.sin_family = AF_INET; addr->v4.sin_port = 0; addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr; + memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ @@ -270,13 +272,19 @@ static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk) } /* Initialize a sctp_addr from an address parameter. */ -static void sctp_v4_from_addr_param(union sctp_addr *addr, +static bool sctp_v4_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { + if (ntohs(param->v4.param_hdr.length) < sizeof(struct sctp_ipv4addr_param)) + return false; + addr->v4.sin_family = AF_INET; addr->v4.sin_port = port; addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; + memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); + + return true; } /* Initialize an address parameter from a sctp_addr and return the length @@ -301,6 +309,7 @@ static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4, saddr->v4.sin_family = AF_INET; saddr->v4.sin_port = port; saddr->v4.sin_addr.s_addr = fl4->saddr; + memset(saddr->v4.sin_zero, 0, sizeof(saddr->v4.sin_zero)); } /* Compare two addresses exactly. */ @@ -323,6 +332,7 @@ static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port) addr->v4.sin_family = AF_INET; addr->v4.sin_addr.s_addr = htonl(INADDR_ANY); addr->v4.sin_port = port; + memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Is this a wildcard address? */ @@ -406,7 +416,8 @@ static sctp_scope_t sctp_v4_scope(union sctp_addr *addr) retval = SCTP_SCOPE_LINK; } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) || ipv4_is_private_172(addr->v4.sin_addr.s_addr) || - ipv4_is_private_192(addr->v4.sin_addr.s_addr)) { + ipv4_is_private_192(addr->v4.sin_addr.s_addr) || + ipv4_is_test_198(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_PRIVATE; } else { retval = SCTP_SCOPE_GLOBAL; @@ -424,14 +435,15 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, { struct sctp_association *asoc = t->asoc; struct rtable *rt; - struct flowi4 *fl4 = &fl->u.ip4; + struct flowi _fl; + struct flowi4 *fl4 = &_fl.u.ip4; struct sctp_bind_addr *bp; struct sctp_sockaddr_entry *laddr; struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; - memset(fl4, 0x0, sizeof(struct flowi4)); + memset(&_fl, 0x0, sizeof(_fl)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; @@ -449,8 +461,11 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, &fl4->saddr); rt = ip_route_output_key(sock_net(sk), fl4); - if (!IS_ERR(rt)) + if (!IS_ERR(rt)) { dst = &rt->dst; + t->dst = dst; + memcpy(fl, &_fl, sizeof(_fl)); + } /* If there is no association or if a source address is passed, no * more validation is required. @@ -513,27 +528,33 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); if (!odev || odev->ifindex != fl4->flowi4_oif) { - if (!dst) + if (!dst) { dst = &rt->dst; - else + t->dst = dst; + memcpy(fl, &_fl, sizeof(_fl)); + } else { dst_release(&rt->dst); + } continue; } dst_release(dst); dst = &rt->dst; + t->dst = dst; + memcpy(fl, &_fl, sizeof(_fl)); break; } out_unlock: rcu_read_unlock(); out: - t->dst = dst; - if (dst) + if (dst) { pr_debug("rt_dst:%pI4, rt_src:%pI4\n", - &fl4->daddr, &fl4->saddr); - else + &fl->u.ip4.daddr, &fl->u.ip4.saddr); + } else { + t->dst = NULL; pr_debug("no route\n"); + } } /* For v4, the source address is cached in the route entry(dst). So no need diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 509e9426a056..d31e0d6c641b 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -857,7 +857,11 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, sctp_shutdownhdr_t shut; __u32 ctsn; - ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); + if (chunk && chunk->asoc) + ctsn = sctp_tsnmap_get_ctsn(&chunk->asoc->peer.tsn_map); + else + ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); + shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, @@ -2142,9 +2146,16 @@ static sctp_ierror_t sctp_verify_param(struct net *net, break; case SCTP_PARAM_SET_PRIMARY: - if (net->sctp.addip_enable) - break; - goto fallthrough; + if (!net->sctp.addip_enable) + goto fallthrough; + + if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + + sizeof(struct sctp_paramhdr)) { + sctp_process_inv_paramlength(asoc, param.p, + chunk, err_chunk); + retval = SCTP_IERROR_ABORT; + } + break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ @@ -2322,11 +2333,13 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, /* Process the initialization parameters. */ sctp_walk_params(param, peer_init, init_hdr.params) { - if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || - param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { + if (!src_match && + (param.p->type == SCTP_PARAM_IPV4_ADDRESS || + param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { af = sctp_get_af_specific(param_type2af(param.p->type)); - af->from_addr_param(&addr, param.addr, - chunk->sctp_hdr->source, 0); + if (!af->from_addr_param(&addr, param.addr, + chunk->sctp_hdr->source, 0)) + continue; if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) src_match = 1; } @@ -2520,7 +2533,8 @@ static int sctp_process_param(struct sctp_association *asoc, break; do_addr_param: af = sctp_get_af_specific(param_type2af(param.p->type)); - af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0)) + break; scope = sctp_scope(peer_addr); if (sctp_in_scope(net, &addr, scope)) if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) @@ -2613,15 +2627,13 @@ do_addr_param: addr_param = param.v + sizeof(sctp_addip_param_t); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); - if (af == NULL) + if (!af) break; - af->from_addr_param(&addr, addr_param, - htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, addr_param, + htons(asoc->peer.port), 0)) + break; - /* if the address is invalid, we can't process it. - * XXX: see spec for what to do. - */ if (!af->addr_valid(&addr, NULL, NULL)) break; @@ -3031,7 +3043,8 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, if (unlikely(!af)) return SCTP_ERROR_DNS_FAILED; - af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) + return SCTP_ERROR_DNS_FAILED; /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast * or multicast address. @@ -3115,7 +3128,7 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, * primary. */ if (af->is_any(&addr)) - memcpy(&addr.v4, sctp_source(asconf), sizeof(addr)); + memcpy(&addr, sctp_source(asconf), sizeof(addr)); peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) @@ -3297,7 +3310,8 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); - af->from_addr_param(&addr, addr_param, htons(bp->port), 0); + if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0)) + return; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 7c220e905168..fc3d8ca21f6e 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -416,7 +416,7 @@ void sctp_generate_proto_unreach_event(unsigned long data) /* Try again later. */ if (!mod_timer(&transport->proto_unreach_timer, jiffies + (HZ/20))) - sctp_association_hold(asoc); + sctp_transport_hold(transport); goto out_unlock; } @@ -432,7 +432,7 @@ void sctp_generate_proto_unreach_event(unsigned long data) out_unlock: bh_unlock_sock(sk); - sctp_association_put(asoc); + sctp_transport_put(transport); } @@ -1333,8 +1333,10 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, /* Generate an INIT ACK chunk. */ new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC, 0); - if (!new_obj) - goto nomem; + if (!new_obj) { + error = -ENOMEM; + break; + } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); @@ -1356,7 +1358,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, if (!new_obj) { if (cmd->obj.chunk) sctp_chunk_free(cmd->obj.chunk); - goto nomem; + error = -ENOMEM; + break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); @@ -1403,8 +1406,10 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, /* Generate a SHUTDOWN chunk. */ new_obj = sctp_make_shutdown(asoc, chunk); - if (!new_obj) - goto nomem; + if (!new_obj) { + error = -ENOMEM; + break; + } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; @@ -1733,11 +1738,17 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, break; } - if (error) + if (error) { + cmd = sctp_next_cmd(commands); + while (cmd) { + if (cmd->verb == SCTP_CMD_REPLY) + sctp_chunk_free(cmd->obj.chunk); + cmd = sctp_next_cmd(commands); + } break; + } } -out: /* If this is in response to a received chunk, wait until * we are done with the packet to open the queue so that we don't * send multiple packets in response to a single request. @@ -1748,8 +1759,5 @@ out: } else if (local_cork) error = sctp_outq_uncork(&asoc->outqueue); return error; -nomem: - error = -ENOMEM; - goto out; } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index df9ac3746c1b..53bb631ec490 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -177,6 +177,16 @@ sctp_chunk_length_valid(struct sctp_chunk *chunk, return 1; } +/* Check for format error in an ABORT chunk */ +static inline bool sctp_err_chunk_valid(struct sctp_chunk *chunk) +{ + struct sctp_errhdr *err; + + sctp_walk_errors(err, chunk->chunk_hdr); + + return (void *)err == (void *)chunk->chunk_end; +} + /********************************************************** * These are the state functions for handling chunk events. **********************************************************/ @@ -1783,12 +1793,13 @@ static sctp_disposition_t sctp_sf_do_dupcook_a(struct net *net, /* Update the content of current association. */ sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); - if (sctp_state(asoc, SHUTDOWN_PENDING) && + if ((sctp_state(asoc, SHUTDOWN_PENDING) || + sctp_state(asoc, SHUTDOWN_SENT)) && (sctp_sstate(asoc->base.sk, CLOSING) || sock_flag(asoc->base.sk, SOCK_DEAD))) { - /* if were currently in SHUTDOWN_PENDING, but the socket - * has been closed by user, don't transition to ESTABLISHED. - * Instead trigger SHUTDOWN bundled with COOKIE_ACK. + /* If the socket has been closed by user, don't + * transition to ESTABLISHED. Instead trigger SHUTDOWN + * bundled with COOKIE_ACK. */ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, @@ -1840,7 +1851,8 @@ static sctp_disposition_t sctp_sf_do_dupcook_b(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_ESTABLISHED)); - SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB); + if (asoc->state < SCTP_STATE_ESTABLISHED) + SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB); sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); repl = sctp_make_cookie_ack(new_asoc, chunk); @@ -2159,6 +2171,9 @@ sctp_disposition_t sctp_sf_shutdown_pending_abort( sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + if (!sctp_err_chunk_valid(chunk)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands); } @@ -2201,6 +2216,9 @@ sctp_disposition_t sctp_sf_shutdown_sent_abort(struct net *net, sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + if (!sctp_err_chunk_valid(chunk)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Stop the T2-shutdown timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); @@ -2466,6 +2484,9 @@ sctp_disposition_t sctp_sf_do_9_1_abort(struct net *net, sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + if (!sctp_err_chunk_valid(chunk)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands); } @@ -2482,15 +2503,9 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, /* See if we have an error cause code in the chunk. */ len = ntohs(chunk->chunk_hdr->length); - if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) { - - sctp_errhdr_t *err; - sctp_walk_errors(err, chunk->chunk_hdr); - if ((void *)err != (void *)chunk->chunk_end) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) error = ((sctp_errhdr_t *)chunk->skb->data)->cause; - } sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET)); /* ASSOC_FAILED will DELETE_TCB. */ @@ -4322,6 +4337,9 @@ sctp_disposition_t sctp_sf_violation(struct net *net, { struct sctp_chunk *chunk = arg; + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the chunk has a valid length. */ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, @@ -6012,6 +6030,7 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, * yet. */ switch (chunk->chunk_hdr->type) { + case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: { sctp_initack_chunk_t *initack; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 2b6c88b9a038..4d60f1e42f42 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -185,13 +185,13 @@ static void sctp_for_each_tx_datachunk(struct sctp_association *asoc, list_for_each_entry(chunk, &t->transmitted, transmitted_list) cb(chunk); - list_for_each_entry(chunk, &q->retransmit, list) + list_for_each_entry(chunk, &q->retransmit, transmitted_list) cb(chunk); - list_for_each_entry(chunk, &q->sacked, list) + list_for_each_entry(chunk, &q->sacked, transmitted_list) cb(chunk); - list_for_each_entry(chunk, &q->abandoned, list) + list_for_each_entry(chunk, &q->abandoned, transmitted_list) cb(chunk); list_for_each_entry(chunk, &q->out_chunk_list, list) @@ -352,6 +352,18 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, return af; } +static void sctp_auto_asconf_init(struct sctp_sock *sp) +{ + struct net *net = sock_net(&sp->inet.sk); + + if (net->sctp.default_auto_asconf) { + spin_lock(&net->sctp.addr_wq_lock); + list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist); + spin_unlock(&net->sctp.addr_wq_lock); + sp->do_auto_asconf = 1; + } +} + /* Bind a local address either to an endpoint or to an association. */ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) { @@ -414,8 +426,10 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) } /* Refresh ephemeral port. */ - if (!bp->port) + if (!bp->port) { bp->port = inet_sk(sk)->inet_num; + sctp_auto_asconf_init(sp); + } /* Add the address to the bind address list. * Use GFP_ATOMIC since BHs will be disabled. @@ -4161,19 +4175,6 @@ static int sctp_init_sock(struct sock *sk) sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); - /* Nothing can fail after this block, otherwise - * sctp_destroy_sock() will be called without addr_wq_lock held - */ - if (net->sctp.default_auto_asconf) { - spin_lock(&sock_net(sk)->sctp.addr_wq_lock); - list_add_tail(&sp->auto_asconf_list, - &net->sctp.auto_asconf_splist); - sp->do_auto_asconf = 1; - spin_unlock(&sock_net(sk)->sctp.addr_wq_lock); - } else { - sp->do_auto_asconf = 0; - } - local_bh_enable(); return 0; @@ -6170,9 +6171,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, return retval; } -static void sctp_hash(struct sock *sk) +static int sctp_hash(struct sock *sk) { /* STUB */ + return 0; } static void sctp_unhash(struct sock *sk) @@ -6206,8 +6208,6 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) pr_debug("%s: begins, snum:%d\n", __func__, snum); - local_bh_disable(); - if (snum == 0) { /* Search for an available port. */ int low, high, remaining, index; @@ -6226,20 +6226,21 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) continue; index = sctp_phashfn(sock_net(sk), rover); head = &sctp_port_hashtable[index]; - spin_lock(&head->lock); + spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) if ((pp->port == rover) && net_eq(sock_net(sk), pp->net)) goto next; break; next: - spin_unlock(&head->lock); + spin_unlock_bh(&head->lock); + cond_resched(); } while (--remaining > 0); /* Exhausted local port range during search? */ ret = 1; if (remaining <= 0) - goto fail; + return ret; /* OK, here is the one we will use. HEAD (the port * hash table list entry) is non-NULL and we hold it's @@ -6254,7 +6255,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) * port iterator, pp being NULL. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), snum)]; - spin_lock(&head->lock); + spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) { if ((pp->port == snum) && net_eq(pp->net, sock_net(sk))) goto pp_found; @@ -6338,10 +6339,7 @@ success: ret = 0; fail_unlock: - spin_unlock(&head->lock); - -fail: - local_bh_enable(); + spin_unlock_bh(&head->lock); return ret; } @@ -7341,6 +7339,8 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_bind_addr_dup(&newsp->ep->base.bind_addr, &oldsp->ep->base.bind_addr, GFP_KERNEL); + sctp_auto_asconf_init(newsp); + /* Move any messages in the old socket's receive queue that are for the * peeled off association to the new socket's receive queue. */ diff --git a/net/sctp/transport.c b/net/sctp/transport.c index aab9e3f29755..f8041fb03290 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -72,7 +72,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net, */ peer->rto = msecs_to_jiffies(net->sctp.rto_initial); - peer->last_time_heard = ktime_get(); + peer->last_time_heard = ktime_set(0, 0); peer->last_time_ecne_reduced = jiffies; peer->param_flags = SPP_HB_DISABLE | @@ -148,7 +148,7 @@ void sctp_transport_free(struct sctp_transport *transport) /* Delete the ICMP proto unreachable timer if it's active. */ if (del_timer(&transport->proto_unreach_timer)) - sctp_association_put(transport->asoc); + sctp_transport_put(transport); sctp_transport_put(transport); } diff --git a/net/socket.c b/net/socket.c index 690b4de0377e..07b1143d03dd 100644 --- a/net/socket.c +++ b/net/socket.c @@ -469,7 +469,7 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) if (f.file) { sock = sock_from_file(f.file, err); if (likely(sock)) { - *fput_needed = f.flags; + *fput_needed = f.flags & FDPUT_FPUT; return sock; } fdput(f); @@ -2597,15 +2597,6 @@ out_fs: core_initcall(sock_init); /* early initcall */ -static int __init jit_init(void) -{ -#ifdef CONFIG_BPF_JIT_ALWAYS_ON - bpf_jit_enable = 1; -#endif - return 0; -} -pure_initcall(jit_init); - #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { @@ -3216,6 +3207,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCSARP: case SIOCGARP: case SIOCDARP: + case SIOCOUTQNSD: case SIOCATMARK: return sock_do_ioctl(net, sock, cmd, arg); } @@ -3388,3 +3380,49 @@ int sockev_unregister_notify(struct notifier_block *nb) return blocking_notifier_chain_unregister(&sockev_notifier_list, nb); } EXPORT_SYMBOL(sockev_unregister_notify); + +/* This routine returns the IP overhead imposed by a socket i.e. + * the length of the underlying IP header, depending on whether + * this is an IPv4 or IPv6 socket and the length from IP options turned + * on at the socket. Assumes that the caller has a lock on the socket. + */ +u32 kernel_sock_ip_overhead(struct sock *sk) +{ + struct inet_sock *inet; + struct ip_options_rcu *opt; + u32 overhead = 0; + bool owned_by_user; +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6_pinfo *np; + struct ipv6_txoptions *optv6 = NULL; +#endif /* IS_ENABLED(CONFIG_IPV6) */ + + if (!sk) + return overhead; + + owned_by_user = sock_owned_by_user(sk); + switch (sk->sk_family) { + case AF_INET: + inet = inet_sk(sk); + overhead += sizeof(struct iphdr); + opt = rcu_dereference_protected(inet->inet_opt, + owned_by_user); + if (opt) + overhead += opt->opt.optlen; + return overhead; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + np = inet6_sk(sk); + overhead += sizeof(struct ipv6hdr); + if (np) + optv6 = rcu_dereference_protected(np->opt, + owned_by_user); + if (optv6) + overhead += (optv6->opt_flen + optv6->opt_nflen); + return overhead; +#endif /* IS_ENABLED(CONFIG_IPV6) */ + default: /* Returns 0 overhead if the socket is not ipv4 or ipv6 */ + return overhead; + } +} +EXPORT_SYMBOL(kernel_sock_ip_overhead); diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 2e0a6f92e563..7404f02702a1 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -81,11 +81,11 @@ static size_t rpc_ntop6(const struct sockaddr *sap, rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u", IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id); - if (unlikely((size_t)rc > sizeof(scopebuf))) + if (unlikely((size_t)rc >= sizeof(scopebuf))) return 0; len += rc; - if (unlikely(len > buflen)) + if (unlikely(len >= buflen)) return 0; strcat(buf, scopebuf); @@ -184,7 +184,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, scope_id = dev->ifindex; dev_put(dev); } else { - if (kstrtou32(p, 10, &scope_id) == 0) { + if (kstrtou32(p, 10, &scope_id) != 0) { kfree(p); return 0; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 62fca77bf3c7..7bde2976307e 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -53,6 +53,7 @@ #include <asm/uaccess.h> #include <linux/hashtable.h> +#include "auth_gss_internal.h" #include "../netns.h" static const struct rpc_authops authgss_ops; @@ -147,35 +148,6 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } -static const void * -simple_get_bytes(const void *p, const void *end, void *res, size_t len) -{ - const void *q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - memcpy(res, p, len); - return q; -} - -static inline const void * -simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) -{ - const void *q; - unsigned int len; - - p = simple_get_bytes(p, end, &len, sizeof(len)); - if (IS_ERR(p)) - return p; - q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - dest->data = kmemdup(p, len, GFP_NOFS); - if (unlikely(dest->data == NULL)) - return ERR_PTR(-ENOMEM); - dest->len = len; - return q; -} - static struct gss_cl_ctx * gss_cred_get_ctx(struct rpc_cred *cred) { diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h new file mode 100644 index 000000000000..f6d9631bd9d0 --- /dev/null +++ b/net/sunrpc/auth_gss/auth_gss_internal.h @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * linux/net/sunrpc/auth_gss/auth_gss_internal.h + * + * Internal definitions for RPCSEC_GSS client authentication + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + */ +#include <linux/err.h> +#include <linux/string.h> +#include <linux/sunrpc/xdr.h> + +static inline const void * +simple_get_bytes(const void *p, const void *end, void *res, size_t len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static inline const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) +{ + const void *q; + unsigned int len; + + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + if (len) { + dest->data = kmemdup(p, len, GFP_NOFS); + if (unlikely(dest->data == NULL)) + return ERR_PTR(-ENOMEM); + } else + dest->data = NULL; + dest->len = len; + return q; +} diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 28db442a0034..89e616da161f 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -45,6 +45,8 @@ #include <linux/crypto.h> #include <linux/sunrpc/gss_krb5_enctypes.h> +#include "auth_gss_internal.h" + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif @@ -186,35 +188,6 @@ get_gss_krb5_enctype(int etype) return NULL; } -static const void * -simple_get_bytes(const void *p, const void *end, void *res, int len) -{ - const void *q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - memcpy(res, p, len); - return q; -} - -static const void * -simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) -{ - const void *q; - unsigned int len; - - p = simple_get_bytes(p, end, &len, sizeof(len)); - if (IS_ERR(p)) - return p; - q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - res->data = kmemdup(p, len, GFP_NOFS); - if (unlikely(res->data == NULL)) - return ERR_PTR(-ENOMEM); - res->len = len; - return q; -} - static inline const void * get_key(const void *p, const void *end, struct krb5_ctx *ctx, struct crypto_blkcipher **res) diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 7063d856a598..e2fd931ddb22 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -61,6 +61,8 @@ gss_mech_free(struct gss_api_mech *gm) for (i = 0; i < gm->gm_pf_num; i++) { pf = &gm->gm_pfs[i]; + if (pf->domain) + auth_domain_put(pf->domain); kfree(pf->auth_domain_name); pf->auth_domain_name = NULL; } @@ -83,6 +85,7 @@ make_auth_domain_name(char *name) static int gss_mech_svc_setup(struct gss_api_mech *gm) { + struct auth_domain *dom; struct pf_desc *pf; int i, status; @@ -92,10 +95,13 @@ gss_mech_svc_setup(struct gss_api_mech *gm) status = -ENOMEM; if (pf->auth_domain_name == NULL) goto out; - status = svcauth_gss_register_pseudoflavor(pf->pseudoflavor, - pf->auth_domain_name); - if (status) + dom = svcauth_gss_register_pseudoflavor( + pf->pseudoflavor, pf->auth_domain_name); + if (IS_ERR(dom)) { + status = PTR_ERR(dom); goto out; + } + pf->domain = dom; } return 0; out: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index b5291ea54a3d..daf0c1ea3917 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -772,7 +772,7 @@ u32 svcauth_gss_flavor(struct auth_domain *dom) EXPORT_SYMBOL_GPL(svcauth_gss_flavor); -int +struct auth_domain * svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) { struct gss_domain *new; @@ -789,21 +789,23 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) new->h.flavour = &svcauthops_gss; new->pseudoflavor = pseudoflavor; - stat = 0; test = auth_domain_lookup(name, &new->h); - if (test != &new->h) { /* Duplicate registration */ + if (test != &new->h) { + pr_warn("svc: duplicate registration of gss pseudo flavour %s.\n", + name); + stat = -EADDRINUSE; auth_domain_put(test); - kfree(new->h.name); - goto out_free_dom; + goto out_free_name; } - return 0; + return test; +out_free_name: + kfree(new->h.name); out_free_dom: kfree(new); out: - return stat; + return ERR_PTR(stat); } - EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); static inline int @@ -1173,6 +1175,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, dprintk("RPC: No creds found!\n"); goto out; } else { + struct timespec64 boot; /* steal creds */ rsci.cred = ud->creds; @@ -1193,6 +1196,9 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, &expiry, GFP_KERNEL); if (status) goto out; + + getboottime64(&boot); + expiry -= boot.tv_sec; } rsci.h.expiry_time = expiry; @@ -1691,11 +1697,14 @@ static int svcauth_gss_release(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; - struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct rpc_gss_wire_cred *gc; struct xdr_buf *resbuf = &rqstp->rq_res; int stat = -EINVAL; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); + if (!gsd) + goto out; + gc = &gsd->clcred; if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; /* Release can be called twice, but we only wrap once. */ @@ -1736,10 +1745,10 @@ out_err: if (rqstp->rq_cred.cr_group_info) put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; - if (gsd->rsci) + if (gsd && gsd->rsci) { cache_put(&gsd->rsci->h, sn->rsc_cache); - gsd->rsci = NULL; - + gsd->rsci = NULL; + } return stat; } @@ -1836,7 +1845,7 @@ gss_svc_init_net(struct net *net) goto out2; return 0; out2: - destroy_use_gss_proxy_proc_entry(net); + rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index a8ab98b53a3a..570832949f91 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -54,9 +54,6 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail) h->last_refresh = now; } -static inline int cache_is_valid(struct cache_head *h); -static void cache_fresh_locked(struct cache_head *head, time_t expiry, - struct cache_detail *detail); static void cache_fresh_unlocked(struct cache_head *head, struct cache_detail *detail); @@ -101,9 +98,6 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, if (cache_is_expired(detail, tmp)) { hlist_del_init(&tmp->cache_list); detail->entries --; - if (cache_is_valid(tmp) == -EAGAIN) - set_bit(CACHE_NEGATIVE, &tmp->flags); - cache_fresh_locked(tmp, 0, detail); freeme = tmp; break; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f28aeb2cfd32..cc308f7a2c02 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1826,6 +1826,14 @@ call_connect_status(struct rpc_task *task) task->tk_status = 0; switch (status) { case -ECONNREFUSED: + /* A positive refusal suggests a rebind is needed. */ + if (RPC_IS_SOFTCONN(task)) + break; + if (clnt->cl_autobind) { + rpc_force_rebind(clnt); + task->tk_action = call_bind; + return; + } case -ECONNRESET: case -ECONNABORTED: case -ENETUNREACH: diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 9103dd15511c..831a8cf231a1 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1347,6 +1347,7 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) q.len = strlen(gssd_dummy_clnt_dir[0].name); clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); if (!clnt_dentry) { + __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); pipe_dentry = ERR_PTR(-ENOENT); goto out; } @@ -1387,7 +1388,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct dentry *root, *gssd_dentry; - struct net *net = data; + struct net *net = get_net(sb->s_fs_info); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1420,7 +1421,6 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb); if (err) goto err_depopulate; - sb->s_fs_info = get_net(net); mutex_unlock(&sn->pipefs_sb_lock); return 0; @@ -1449,7 +1449,8 @@ static struct dentry * rpc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_ns(fs_type, flags, current->nsproxy->net_ns, rpc_fill_super); + struct net *net = current->nsproxy->net_ns; + return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super); } static void rpc_kill_sb(struct super_block *sb) @@ -1469,9 +1470,9 @@ static void rpc_kill_sb(struct super_block *sb) RPC_PIPEFS_UMOUNT, sb); mutex_unlock(&sn->pipefs_sb_lock); - put_net(net); out: kill_litter_super(sb); + put_net(net); } static struct file_system_type rpc_pipe_fs_type = { diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index c89626b2afff..696381a51634 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -977,8 +977,8 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, p = xdr_inline_decode(xdr, len); if (unlikely(p == NULL)) goto out_fail; - dprintk("RPC: %5u RPCB_%s reply: %s\n", req->rq_task->tk_pid, - req->rq_task->tk_msg.rpc_proc->p_name, (char *)p); + dprintk("RPC: %5u RPCB_%s reply: %*pE\n", req->rq_task->tk_pid, + req->rq_task->tk_msg.rpc_proc->p_name, len, (char *)p); if (rpc_uaddr2sockaddr(req->rq_xprt->xprt_net, (char *)p, len, sap, sizeof(address)) == 0) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 3eed71a2ff2b..737556204566 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -473,10 +473,20 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q struct rpc_task *task; /* + * Service the privileged queue. + */ + q = &queue->tasks[RPC_NR_PRIORITY - 1]; + if (queue->maxpriority > RPC_PRIORITY_PRIVILEGED && !list_empty(q)) { + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); + goto out; + } + + /* * Service a batch of tasks from a single owner. */ q = &queue->tasks[queue->priority]; - if (!list_empty(q) && --queue->nr) { + if (!list_empty(q) && queue->nr) { + queue->nr--; task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto out; } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 41f6e964fe91..1a7d930a867c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -50,7 +50,7 @@ EXPORT_SYMBOL_GPL(svc_pool_map); static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int -param_set_pool_mode(const char *val, struct kernel_param *kp) +param_set_pool_mode(const char *val, const struct kernel_param *kp) { int *ip = (int *)kp->arg; struct svc_pool_map *m = &svc_pool_map; @@ -80,7 +80,7 @@ out: } static int -param_get_pool_mode(char *buf, struct kernel_param *kp) +param_get_pool_mode(char *buf, const struct kernel_param *kp) { int *ip = (int *)kp->arg; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 2b8e80c721db..7629982040c4 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -97,8 +97,17 @@ void svc_unreg_xprt_class(struct svc_xprt_class *xcl) } EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); -/* - * Format the transport list for printing +/** + * svc_print_xprts - Format the transport list for printing + * @buf: target buffer for formatted address + * @maxlen: length of target buffer + * + * Fills in @buf with a string containing a list of transport names, each name + * terminated with '\n'. If the buffer is too small, some entries may be + * missing, but it is guaranteed that all lines in the output buffer are + * complete. + * + * Returns positive length of the filled-in string. */ int svc_print_xprts(char *buf, int maxlen) { @@ -111,9 +120,9 @@ int svc_print_xprts(char *buf, int maxlen) list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { int slen; - sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); - slen = strlen(tmpstr); - if (len + slen > maxlen) + slen = snprintf(tmpstr, sizeof(tmpstr), "%s %d\n", + xcl->xcl_name, xcl->xcl_max_payload); + if (slen >= sizeof(tmpstr) || len + slen >= maxlen) break; len += slen; strcat(buf, tmpstr); @@ -1002,7 +1011,7 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st struct svc_xprt *xprt; int ret = 0; - spin_lock(&serv->sv_lock); + spin_lock_bh(&serv->sv_lock); list_for_each_entry(xprt, xprt_list, xpt_list) { if (xprt->xpt_net != net) continue; @@ -1010,7 +1019,7 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); } - spin_unlock(&serv->sv_lock); + spin_unlock_bh(&serv->sv_lock); return ret; } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index ed9bbd383f7d..df7ecf9584f6 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1031,6 +1031,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, base = 0; } else { base -= buf->head[0].iov_len; + subbuf->head[0].iov_base = buf->head[0].iov_base; subbuf->head[0].iov_len = 0; } @@ -1043,6 +1044,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, base = 0; } else { base -= buf->page_len; + subbuf->pages = buf->pages; + subbuf->page_base = 0; subbuf->page_len = 0; } @@ -1054,6 +1057,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, base = 0; } else { base -= buf->tail[0].iov_len; + subbuf->tail[0].iov_base = buf->tail[0].iov_base; subbuf->tail[0].iov_len = 0; } diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2dcb44f69e53..ddd70aec4d88 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -42,8 +42,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, size_t size; req = rpcrdma_create_req(r_xprt); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); req->rl_backchannel = true; size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); @@ -84,25 +84,13 @@ out_fail: static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, unsigned int count) { - struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; - struct rpcrdma_rep *rep; - unsigned long flags; int rc = 0; while (count--) { - rep = rpcrdma_create_rep(r_xprt); - if (IS_ERR(rep)) { - pr_err("RPC: %s: reply buffer alloc failed\n", - __func__); - rc = PTR_ERR(rep); + rc = rpcrdma_create_rep(r_xprt); + if (rc) break; - } - - spin_lock_irqsave(&buffers->rb_lock, flags); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock_irqrestore(&buffers->rb_lock, flags); } - return rc; } @@ -341,6 +329,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, rqst->rq_reply_bytes_recvd = 0; rqst->rq_bytes_sent = 0; rqst->rq_xid = headerp->rm_xid; + + rqst->rq_private_buf.len = size; set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); buf = &rqst->rq_rcv_buf; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8c545f7d7525..740bddcf3488 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -576,6 +576,9 @@ xprt_rdma_free(void *buffer) rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); req = rb->rg_owner; + if (req->rl_backchannel) + return; + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index eadd1655145a..b6879a1986a7 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -911,10 +911,17 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) return req; } -struct rpcrdma_rep * -rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) +/** + * rpcrdma_create_rep - Allocate an rpcrdma_rep object + * @r_xprt: controlling transport + * + * Returns 0 on success or a negative errno on failure. + */ +int + rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_rep *rep; int rc; @@ -934,12 +941,18 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep->rr_device = ia->ri_device; rep->rr_rxprt = r_xprt; INIT_WORK(&rep->rr_work, rpcrdma_receive_worker); - return rep; + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_list, &buf->rb_recv_bufs); + spin_unlock(&buf->rb_lock); + return 0; out_free: kfree(rep); out: - return ERR_PTR(rc); + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, rc); + return rc; } int @@ -975,17 +988,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + 2; i++) { - struct rpcrdma_rep *rep; - - rep = rpcrdma_create_rep(r_xprt); - if (IS_ERR(rep)) { - dprintk("RPC: %s: reply buffer %d alloc failed\n", - __func__, i); - rc = PTR_ERR(rep); + for (i = 0; i <= buf->rb_max_requests; i++) { + rc = rpcrdma_create_rep(r_xprt); + if (rc) goto out; - } - list_add(&rep->rr_list, &buf->rb_recv_bufs); } return 0; @@ -1337,15 +1343,14 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_rep *rep; - unsigned long flags; int rc; while (count--) { - spin_lock_irqsave(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_recv_bufs)) goto out_reqbuf; rep = rpcrdma_buffer_get_rep_locked(buffers); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); rc = rpcrdma_ep_post_recv(ia, ep, rep); if (rc) @@ -1355,7 +1360,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) return 0; out_reqbuf: - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); pr_warn("%s: no extra receive buffers\n", __func__); return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ac7f8d4f632a..36ec6a602665 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -431,8 +431,8 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); -struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *); +int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); diff --git a/net/tipc/core.c b/net/tipc/core.c index 35b376f58f21..758e59a20a6c 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -88,6 +88,11 @@ out_sk_rht: static void __net_exit tipc_exit_net(struct net *net) { tipc_net_stop(net); + + /* Make sure the tipc_net_finalize_work stopped + * before releasing the resources. + */ + flush_scheduled_work(); tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); @@ -117,14 +122,6 @@ static int __init tipc_init(void) TIPC_CRITICAL_IMPORTANCE; sysctl_tipc_rmem[2] = TIPC_CONN_OVERLOAD_LIMIT; - err = tipc_netlink_start(); - if (err) - goto out_netlink; - - err = tipc_netlink_compat_start(); - if (err) - goto out_netlink_compat; - err = tipc_register_sysctl(); if (err) goto out_sysctl; @@ -145,8 +142,21 @@ static int __init tipc_init(void) if (err) goto out_bearer; + err = tipc_netlink_start(); + if (err) + goto out_netlink; + + err = tipc_netlink_compat_start(); + if (err) + goto out_netlink_compat; + pr_info("Started in single node mode\n"); return 0; + +out_netlink_compat: + tipc_netlink_stop(); +out_netlink: + tipc_bearer_cleanup(); out_bearer: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: @@ -156,22 +166,18 @@ out_socket: out_pernet: tipc_unregister_sysctl(); out_sysctl: - tipc_netlink_compat_stop(); -out_netlink_compat: - tipc_netlink_stop(); -out_netlink: pr_err("Unable to start in single node mode\n"); return err; } static void __exit tipc_exit(void) { + tipc_netlink_compat_stop(); + tipc_netlink_stop(); tipc_bearer_cleanup(); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); - tipc_netlink_stop(); - tipc_netlink_compat_stop(); tipc_unregister_sysctl(); pr_info("Deactivated\n"); diff --git a/net/tipc/link.c b/net/tipc/link.c index 736fffb28ab6..0080699b7cd1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -883,7 +883,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, default: pr_warn("Dropping received illegal msg type\n"); kfree_skb(skb); - return false; + return true; }; } diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 67bddcb2ff46..6bac0e6e4643 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -138,18 +138,14 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; - if (unlikely(skb_unclone(frag, GFP_ATOMIC))) + *buf = NULL; + if (skb_has_frag_list(frag) && __skb_linearize(frag)) + goto err; + frag = skb_unshare(frag, GFP_ATOMIC); + if (unlikely(!frag)) goto err; head = *headbuf = frag; - *buf = NULL; TIPC_SKB_CB(head)->tail = NULL; - if (skb_is_nonlinear(head)) { - skb_walk_frags(head, tail) { - TIPC_SKB_CB(head)->tail = tail; - } - } else { - skb_frag_list_init(head); - } return 0; } diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index d2bf92e71150..fb1b5dcf0142 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -250,8 +250,9 @@ err_out: static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg) { - int err; + struct nlmsghdr *nlh; struct sk_buff *arg; + int err; if (msg->req_type && (!msg->req_size || !TLV_CHECK_TYPE(msg->req, msg->req_type))) @@ -280,6 +281,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, return -ENOMEM; } + nlh = nlmsg_put(arg, 0, 0, tipc_genl_family.id, 0, NLM_F_MULTI); + if (!nlh) { + kfree_skb(arg); + kfree_skb(msg->rep); + msg->rep = NULL; + return -EMSGSIZE; + } + nlmsg_end(arg, nlh); + err = __tipc_nl_compat_dumpit(cmd, msg, arg); if (err) { kfree_skb(msg->rep); @@ -516,7 +526,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, if (len <= 0) return -EINVAL; - len = min_t(int, len, TIPC_MAX_BEARER_NAME); + len = min_t(int, len, TIPC_MAX_LINK_NAME); if (!string_is_valid(name, len)) return -EINVAL; @@ -622,7 +632,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL); - link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); + link_info.dest = htonl(nla_get_flag(link[TIPC_NLA_LINK_DEST])); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME], TIPC_MAX_LINK_NAME); @@ -791,7 +801,7 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, if (len <= 0) return -EINVAL; - len = min_t(int, len, TIPC_MAX_BEARER_NAME); + len = min_t(int, len, TIPC_MAX_LINK_NAME); if (!string_is_valid(name, len)) return -EINVAL; @@ -926,6 +936,10 @@ static int tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, u32 sock) hdr = genlmsg_put(args, 0, 0, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_PUBL_GET); + if (!hdr) { + kfree_skb(args); + return -EMSGSIZE; + } nest = nla_nest_start(args, TIPC_NLA_SOCK); if (!nest) { diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 65171f8e8c45..9d380d55ea1c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -763,6 +763,9 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, spin_lock_bh(&inputq->lock); if (skb_peek(arrvq) == skb) { skb_queue_splice_tail_init(&tmpq, inputq); + /* Decrease the skb's refcnt as increasing in the + * function tipc_skb_peek + */ kfree_skb(__skb_dequeue(arrvq)); } spin_unlock_bh(&inputq->lock); @@ -1754,7 +1757,7 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, u32 dport, struct sk_buff_head *xmitq) { - unsigned long time_limit = jiffies + 2; + unsigned long time_limit = jiffies + usecs_to_jiffies(20000); struct sk_buff *skb; unsigned int lim; atomic_t *dcnt; @@ -1984,7 +1987,7 @@ static int tipc_listen(struct socket *sock, int len) static int tipc_wait_for_accept(struct socket *sock, long timeo) { struct sock *sk = sock->sk; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); int err; /* True wake-one mechanism for incoming connections: only @@ -1993,12 +1996,12 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) * anymore, the common case will execute the loop only once. */ for (;;) { - prepare_to_wait_exclusive(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { + add_wait_queue(sk_sleep(sk), &wait); release_sock(sk); - timeo = schedule_timeout(timeo); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); + remove_wait_queue(sk_sleep(sk), &wait); } err = 0; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -2013,7 +2016,6 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) if (signal_pending(current)) break; } - finish_wait(sk_sleep(sk), &wait); return err; } diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 1a779b1e8510..40f6d82083d7 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -37,6 +37,8 @@ #include <linux/sysctl.h> +static int zero; +static int one = 1; static struct ctl_table_header *tipc_ctl_hdr; static struct ctl_table tipc_table[] = { @@ -45,14 +47,16 @@ static struct ctl_table tipc_table[] = { .data = &sysctl_tipc_rmem, .maxlen = sizeof(sysctl_tipc_rmem), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "named_timeout", .data = &sysctl_tipc_named_timeout, .maxlen = sizeof(sysctl_tipc_named_timeout), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, {} }; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 78d6b78de29d..90d54ad86e4e 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -200,14 +200,17 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, .saddr = src->ipv6, .flowi6_proto = IPPROTO_UDP }; - err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, &ndst, - &fl6); - if (err) + ndst = ipv6_stub->ipv6_dst_lookup_flow(net, + ub->ubsock->sk, + &fl6, NULL); + if (IS_ERR(ndst)) { + err = PTR_ERR(ndst); goto tx_error; + } ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, ndst->dev, &src->ipv6, - &dst->ipv6, 0, ttl, src->udp_port, + &dst->ipv6, 0, ttl, 0, src->udp_port, dst->udp_port, false); #endif } @@ -405,10 +408,13 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); - if (enable_mcast(ub, remote)) + err = enable_mcast(ub, remote); + if (err) goto err; return 0; err: + if (ub->ubsock) + udp_tunnel_sock_release(ub->ubsock); kfree(ub); return err; } diff --git a/net/unix/Kconfig b/net/unix/Kconfig index 8b31ab85d050..3b9e450656a4 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -19,6 +19,11 @@ config UNIX Say Y unless you know what you are doing. +config UNIX_SCM + bool + depends on UNIX + default y + config UNIX_DIAG tristate "UNIX: socket monitoring interface" depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index b663c607b1c6..dc686c6757fb 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -9,3 +9,5 @@ unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o + +obj-$(CONFIG_UNIX_SCM) += scm.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4f547eebfd36..2c09eadfa90c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,6 +118,8 @@ #include <linux/security.h> #include <linux/freezer.h> +#include "scm.h" + struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); DEFINE_SPINLOCK(unix_table_lock); @@ -191,11 +193,17 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk) return unix_peer(osk) == NULL || unix_our_peer(sk, osk); } -static inline int unix_recvq_full(struct sock const *sk) +static inline int unix_recvq_full(const struct sock *sk) { return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } +static inline int unix_recvq_full_lockless(const struct sock *sk) +{ + return skb_queue_len_lockless(&sk->sk_receive_queue) > + READ_ONCE(sk->sk_max_ack_backlog); +} + struct sock *unix_peer_get(struct sock *s) { struct sock *peer; @@ -528,12 +536,14 @@ static void unix_release_sock(struct sock *sk, int embrion) u->path.mnt = NULL; state = sk->sk_state; sk->sk_state = TCP_CLOSE; + + skpair = unix_peer(sk); + unix_peer(sk) = NULL; + unix_state_unlock(sk); wake_up_interruptible_all(&u->peer_wait); - skpair = unix_peer(sk); - if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); @@ -548,7 +558,6 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ - unix_peer(sk) = NULL; } /* Try to flush out this socket. Throw out buffers at least */ @@ -585,20 +594,42 @@ static void unix_release_sock(struct sock *sk, int embrion) static void init_peercred(struct sock *sk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + if (sk < peersk) { + spin_lock(&sk->sk_peer_lock); + spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&peersk->sk_peer_lock); + spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); + + spin_unlock(&sk->sk_peer_lock); + spin_unlock(&peersk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog) @@ -1497,78 +1528,51 @@ out: return err; } -static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - - scm->fp = UNIXCB(skb).fp; - UNIXCB(skb).fp = NULL; - - for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); -} - -static void unix_destruct_scm(struct sk_buff *skb) -{ - struct scm_cookie scm; - memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; - if (UNIXCB(skb).fp) - unix_detach_fds(&scm, skb); - - /* Alas, it calls VFS */ - /* So fscking what? fput() had been SMP-safe since the last Summer */ - scm_destroy(&scm); - sock_wfree(skb); -} - -/* - * The "user->unix_inflight" variable is protected by the garbage - * collection lock, and we just read it locklessly here. If you go - * over the limit, there might be a tiny race in actually noticing - * it across threads. Tough. - */ -static inline bool too_many_unix_fds(struct task_struct *p) -{ - struct user_struct *user = current_user(); - - if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE))) - return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); - return false; -} - -#define MAX_RECURSION_LEVEL 4 - -static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - unsigned char max_level = 0; - - if (too_many_unix_fds(current)) - return -ETOOMANYREFS; - - for (i = scm->fp->count - 1; i >= 0; i--) { - struct sock *sk = unix_get_socket(scm->fp->fp[i]); - - if (sk) - max_level = max(max_level, - unix_sk(sk)->recursion_level); - } - if (unlikely(max_level > MAX_RECURSION_LEVEL)) - return -ETOOMANYREFS; + scm->fp = scm_fp_dup(UNIXCB(skb).fp); /* - * Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. + * Garbage collection of unix sockets starts by selecting a set of + * candidate sockets which have reference only from being in flight + * (total_refs == inflight_refs). This condition is checked once during + * the candidate collection phase, and candidates are marked as such, so + * that non-candidates can later be ignored. While inflight_refs is + * protected by unix_gc_lock, total_refs (file count) is not, hence this + * is an instantaneous decision. + * + * Once a candidate, however, the socket must not be reinstalled into a + * file descriptor while the garbage collection is in progress. + * + * If the above conditions are met, then the directed graph of + * candidates (*) does not change while unix_gc_lock is held. + * + * Any operations that changes the file count through file descriptors + * (dup, close, sendmsg) does not change the graph since candidates are + * not installed in fds. + * + * Dequeing a candidate via recvmsg would install it into an fd, but + * that takes unix_gc_lock to decrement the inflight count, so it's + * serialized with garbage collection. + * + * MSG_PEEK is special in that it does not change the inflight count, + * yet does install the socket into an fd. The following lock/unlock + * pair is to ensure serialization with garbage collection. It must be + * done between incrementing the file count and installing the file into + * an fd. + * + * If garbage collection starts after the barrier provided by the + * lock/unlock, then it will see the elevated refcount and not mark this + * as a candidate. If a garbage collection is already in progress + * before the file count was incremented, then the lock/unlock pair will + * ensure that garbage collection is finished before progressing to + * installing the fd. + * + * (*) A -> B where B is on the queue of A or B is on the queue of C + * which is on the queue of listening socket A. */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - return max_level; + spin_lock(&unix_gc_lock); + spin_unlock(&unix_gc_lock); } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1792,7 +1796,8 @@ restart_locked: * - unix_peer(sk) == sk by time of get but disconnected before lock */ if (other != sk && - unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + unlikely(unix_peer(other) != sk && + unix_recvq_full_lockless(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); @@ -2195,7 +2200,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, sk_peek_offset_fwd(sk, size); if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb); } err = (flags & MSG_TRUNC) ? skb->len - skip : size; @@ -2440,7 +2445,7 @@ unlock: /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb); sk_peek_offset_fwd(sk, chunk); @@ -2717,7 +2722,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, other = unix_peer(sk); if (other && unix_peer(other) != sk && - unix_recvq_full(other) && + unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index c36757e72844..4d283e26d816 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -86,77 +86,13 @@ #include <net/scm.h> #include <net/tcp_states.h> +#include "scm.h" + /* Internal data structures and random procedures: */ -static LIST_HEAD(gc_inflight_list); static LIST_HEAD(gc_candidates); -static DEFINE_SPINLOCK(unix_gc_lock); static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); -unsigned int unix_tot_inflight; - -struct sock *unix_get_socket(struct file *filp) -{ - struct sock *u_sock = NULL; - struct inode *inode = file_inode(filp); - - /* Socket ? */ - if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { - struct socket *sock = SOCKET_I(inode); - struct sock *s = sock->sk; - - /* PF_UNIX ? */ - if (s && sock->ops && sock->ops->family == PF_UNIX) - u_sock = s; - } - return u_sock; -} - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ - -void unix_inflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - if (atomic_long_inc_return(&u->inflight) == 1) { - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - BUG_ON(list_empty(&u->link)); - } - unix_tot_inflight++; - } - user->unix_inflight++; - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - BUG_ON(!atomic_long_read(&u->inflight)); - BUG_ON(list_empty(&u->link)); - - if (atomic_long_dec_and_test(&u->inflight)) - list_del_init(&u->link); - unix_tot_inflight--; - } - user->unix_inflight--; - spin_unlock(&unix_gc_lock); -} - static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { @@ -261,8 +197,11 @@ void wait_for_unix_gc(void) { /* If number of inflight sockets is insane, * force a garbage collect right now. + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight() and gc_in_progress(). */ - if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) unix_gc(); wait_event(unix_gc_wait, gc_in_progress == false); } @@ -282,7 +221,9 @@ void unix_gc(void) if (gc_in_progress) goto out; - gc_in_progress = true; + /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + WRITE_ONCE(gc_in_progress, true); + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. @@ -368,7 +309,10 @@ void unix_gc(void) /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); - gc_in_progress = false; + + /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + WRITE_ONCE(gc_in_progress, false); + wake_up(&unix_gc_wait); out: diff --git a/net/unix/scm.c b/net/unix/scm.c new file mode 100644 index 000000000000..bf1a8fa8c4f1 --- /dev/null +++ b/net/unix/scm.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <linux/fs.h> +#include <net/af_unix.h> +#include <net/scm.h> +#include <linux/init.h> + +#include "scm.h" + +unsigned int unix_tot_inflight; +EXPORT_SYMBOL(unix_tot_inflight); + +LIST_HEAD(gc_inflight_list); +EXPORT_SYMBOL(gc_inflight_list); + +DEFINE_SPINLOCK(unix_gc_lock); +EXPORT_SYMBOL(unix_gc_lock); + +struct sock *unix_get_socket(struct file *filp) +{ + struct sock *u_sock = NULL; + struct inode *inode = file_inode(filp); + + /* Socket ? */ + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { + struct socket *sock = SOCKET_I(inode); + struct sock *s = sock->sk; + + /* PF_UNIX ? */ + if (s && sock->ops && sock->ops->family == PF_UNIX) + u_sock = s; + } + return u_sock; +} +EXPORT_SYMBOL(unix_get_socket); + +/* Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. + */ +void unix_inflight(struct user_struct *user, struct file *fp) +{ + struct sock *s = unix_get_socket(fp); + + spin_lock(&unix_gc_lock); + + if (s) { + struct unix_sock *u = unix_sk(s); + + if (atomic_long_inc_return(&u->inflight) == 1) { + BUG_ON(!list_empty(&u->link)); + list_add_tail(&u->link, &gc_inflight_list); + } else { + BUG_ON(list_empty(&u->link)); + } + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); + } + user->unix_inflight++; + spin_unlock(&unix_gc_lock); +} + +void unix_notinflight(struct user_struct *user, struct file *fp) +{ + struct sock *s = unix_get_socket(fp); + + spin_lock(&unix_gc_lock); + + if (s) { + struct unix_sock *u = unix_sk(s); + + BUG_ON(!atomic_long_read(&u->inflight)); + BUG_ON(list_empty(&u->link)); + + if (atomic_long_dec_and_test(&u->inflight)) + list_del_init(&u->link); + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); + } + user->unix_inflight--; + spin_unlock(&unix_gc_lock); +} + +/* + * The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + +#define MAX_RECURSION_LEVEL 4 + +int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + unsigned char max_level = 0; + + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + + for (i = scm->fp->count - 1; i >= 0; i--) { + struct sock *sk = unix_get_socket(scm->fp->fp[i]); + + if (sk) + max_level = max(max_level, + unix_sk(sk)->recursion_level); + } + if (unlikely(max_level > MAX_RECURSION_LEVEL)) + return -ETOOMANYREFS; + + /* + * Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->user, scm->fp->fp[i]); + return max_level; +} +EXPORT_SYMBOL(unix_attach_fds); + +void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + for (i = scm->fp->count-1; i >= 0; i--) + unix_notinflight(scm->fp->user, scm->fp->fp[i]); +} +EXPORT_SYMBOL(unix_detach_fds); + +void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); +} +EXPORT_SYMBOL(unix_destruct_scm); diff --git a/net/unix/scm.h b/net/unix/scm.h new file mode 100644 index 000000000000..5a255a477f16 --- /dev/null +++ b/net/unix/scm.h @@ -0,0 +1,10 @@ +#ifndef NET_UNIX_SCM_H +#define NET_UNIX_SCM_H + +extern struct list_head gc_inflight_list; +extern spinlock_t unix_gc_lock; + +int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); +void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); + +#endif diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7566395e526d..baab5f65fbeb 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -97,6 +97,7 @@ #include <linux/mutex.h> #include <linux/net.h> #include <linux/poll.h> +#include <linux/random.h> #include <linux/skbuff.h> #include <linux/smp.h> #include <linux/socket.h> @@ -501,9 +502,13 @@ out: static int __vsock_bind_stream(struct vsock_sock *vsk, struct sockaddr_vm *addr) { - static u32 port = LAST_RESERVED_PORT + 1; + static u32 port = 0; struct sockaddr_vm new_addr; + if (!port) + port = LAST_RESERVED_PORT + 1 + + prandom_u32_max(U32_MAX - LAST_RESERVED_PORT); + vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); if (addr->svm_port == VMADDR_PORT_ANY) { @@ -645,8 +650,9 @@ struct sock *__vsock_create(struct net *net, vsk->trusted = psk->trusted; vsk->owner = get_cred(psk->owner); vsk->connect_timeout = psk->connect_timeout; + security_sk_clone(parent, sk); } else { - vsk->trusted = capable(CAP_NET_ADMIN); + vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; } @@ -825,10 +831,12 @@ static int vsock_shutdown(struct socket *sock, int mode) */ sk = sock->sk; + + lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; if (sk->sk_type == SOCK_STREAM) - return err; + goto out; } else { sock->state = SS_DISCONNECTING; err = 0; @@ -837,10 +845,8 @@ static int vsock_shutdown(struct socket *sock, int mode) /* Receive and send shutdowns are treated alike. */ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); if (mode) { - lock_sock(sk); sk->sk_shutdown |= mode; sk->sk_state_change(sk); - release_sock(sk); if (sk->sk_type == SOCK_STREAM) { sock_reset_flag(sk, SOCK_DONE); @@ -848,6 +854,8 @@ static int vsock_shutdown(struct socket *sock, int mode) } } +out: + release_sock(sk); return err; } @@ -1168,6 +1176,8 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, * non-blocking call. */ err = -EALREADY; + if (flags & O_NONBLOCK) + goto out; break; default: if ((sk->sk_state == VSOCK_SS_LISTEN) || @@ -1291,7 +1301,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) /* Wait for children sockets to appear; these are the new sockets * created upon connection establishment. */ - timeout = sock_sndtimeo(listener, flags & O_NONBLOCK); + timeout = sock_rcvtimeo(listener, flags & O_NONBLOCK); prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); while ((connected = vsock_dequeue_accept(listener)) == NULL && diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 9c07c76c504d..ddcae46ae408 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -25,6 +25,10 @@ /* How long to wait for graceful shutdown of a connection */ #define VSOCK_CLOSE_TIMEOUT (8 * HZ) +uint virtio_transport_max_vsock_pkt_buf_size = 64 * 1024; +module_param(virtio_transport_max_vsock_pkt_buf_size, uint, 0444); +EXPORT_SYMBOL_GPL(virtio_transport_max_vsock_pkt_buf_size); + static const struct virtio_transport *virtio_transport_get_ops(void) { const struct vsock_transport *t = vsock_core_get_transport(); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 102bf9194662..c09efcdf72d2 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -593,8 +593,7 @@ vmci_transport_queue_pair_alloc(struct vmci_qp **qpair, peer, flags, VMCI_NO_PRIVILEGE_FLAGS); out: if (err < 0) { - pr_err("Could not attach to queue pair with %d\n", - err); + pr_err_once("Could not attach to queue pair with %d\n", err); err = vmci_transport_error_to_vsock_error(err); } diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 6b5467ea99db..1efc3f14224c 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -877,7 +877,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, if (chan == other_chan) return true; - if (chan->band != IEEE80211_BAND_5GHZ) + if (chan->band != NL80211_BAND_5GHZ) continue; r1 = cfg80211_get_unii(chan->center_freq); diff --git a/net/wireless/core.c b/net/wireless/core.c index eafc34b27013..32870deb8464 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -559,7 +559,7 @@ int wiphy_register(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); int res; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_supported_band *sband; bool have_band = false; int i; @@ -649,7 +649,7 @@ int wiphy_register(struct wiphy *wiphy) return res; /* sanity check supported bands/channels */ - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; @@ -661,7 +661,7 @@ int wiphy_register(struct wiphy *wiphy) * on 60GHz band, there are no legacy rates, so * n_bitrates is 0 */ - if (WARN_ON(band != IEEE80211_BAND_60GHZ && + if (WARN_ON(band != NL80211_BAND_60GHZ && !sband->n_bitrates)) return -EINVAL; @@ -671,7 +671,7 @@ int wiphy_register(struct wiphy *wiphy) * global structure for that. */ if (cfg80211_disable_40mhz_24ghz && - band == IEEE80211_BAND_2GHZ && + band == NL80211_BAND_2GHZ && sband->ht_cap.ht_supported) { sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40; diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index 454157717efa..5d453916a417 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -69,7 +69,7 @@ static ssize_t ht40allow_map_read(struct file *file, struct wiphy *wiphy = file->private_data; char *buf; unsigned int offset = 0, buf_size = PAGE_SIZE, i, r; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_supported_band *sband; buf = kzalloc(buf_size, GFP_KERNEL); @@ -78,7 +78,7 @@ static ssize_t ht40allow_map_read(struct file *file, rtnl_lock(); - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index e9e91298c70d..3cedf2c2b60b 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -6,9 +6,13 @@ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct device *pdev = wiphy_dev(wdev->wiphy); - strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name, - sizeof(info->driver)); + if (pdev->driver) + strlcpy(info->driver, pdev->driver->name, + sizeof(info->driver)); + else + strlcpy(info->driver, "N/A", sizeof(info->driver)); strlcpy(info->version, init_utsname()->release, sizeof(info->version)); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 7da0cd9c5e73..398fa066d249 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -104,7 +104,7 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct ieee80211_supported_band *sband = rdev->wiphy.bands[params->chandef.chan->band]; int j; - u32 flag = params->chandef.chan->band == IEEE80211_BAND_5GHZ ? + u32 flag = params->chandef.chan->band == NL80211_BAND_5GHZ ? IEEE80211_RATE_MANDATORY_A : IEEE80211_RATE_MANDATORY_B; @@ -114,6 +114,9 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, } } + if (WARN_ON(connkeys && connkeys->def < 0)) + return -EINVAL; + if (WARN_ON(wdev->connect_keys)) kzfree(wdev->connect_keys); wdev->connect_keys = connkeys; @@ -237,7 +240,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct cfg80211_cached_keys *ck = NULL; - enum ieee80211_band band; + enum nl80211_band band; int i, err; ASSERT_WDEV_LOCK(wdev); @@ -249,7 +252,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, if (!wdev->wext.ibss.chandef.chan) { struct ieee80211_channel *new_chan = NULL; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; @@ -292,7 +295,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, wdev->wext.ibss.privacy = wdev->wext.default_key != -1; - if (wdev->wext.keys) { + if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index dda90a39be40..60501476fcd0 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -128,9 +128,9 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, if (!setup->chandef.chan) { /* if we don't have that either, use the first usable channel */ - enum ieee80211_band band; + enum nl80211_band band; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; int i; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 22536248bf67..7dd4f49b1cc0 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -774,7 +774,7 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) wiphy = &rdev->wiphy; rtnl_lock(); - for (bandid = 0; bandid < IEEE80211_NUM_BANDS; bandid++) { + for (bandid = 0; bandid < NUM_NL80211_BANDS; bandid++) { sband = wiphy->bands[bandid]; if (!sband) continue; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index faf2a41282f9..4b61c19a7eb0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -330,6 +330,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 }, [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, + [NL80211_ATTR_STATUS_CODE] = { .type = NLA_U16 }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, [NL80211_ATTR_PID] = { .type = NLA_U32 }, @@ -403,6 +404,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MDID] = { .type = NLA_U16 }, [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_CRIT_PROT_ID] = { .type = NLA_U16 }, + [NL80211_ATTR_MAX_CRIT_PROT_DURATION] = { .type = NLA_U16 }, [NL80211_ATTR_PEER_AID] = { .type = NLA_U16 }, [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG }, @@ -428,6 +431,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, + [NL80211_ATTR_OPER_CLASS] = { .type = NLA_U8 }, [NL80211_ATTR_MAC_MASK] = { .len = ETH_ALEN }, [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG }, [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, @@ -885,6 +889,15 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, struct nlattr *key; struct cfg80211_cached_keys *result; int rem, err, def = 0; + bool have_key = false; + + nla_for_each_nested(key, keys, rem) { + have_key = true; + break; + } + + if (!have_key) + return NULL; result = kzalloc(sizeof(*result), GFP_KERNEL); if (!result) @@ -930,6 +943,11 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, } } + if (result->def < 0) { + err = -EINVAL; + goto error; + } + return result; error: kfree(result); @@ -1364,7 +1382,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, struct nlattr *nl_bands, *nl_band; struct nlattr *nl_freqs, *nl_freq; struct nlattr *nl_cmds; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_channel *chan; int i; const struct ieee80211_txrx_stypes *mgmt_stypes = @@ -1497,7 +1515,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, goto nla_put_failure; for (band = state->band_start; - band < IEEE80211_NUM_BANDS; band++) { + band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; sband = rdev->wiphy.bands[band]; @@ -1559,7 +1577,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, } nla_nest_end(msg, nl_bands); - if (band < IEEE80211_NUM_BANDS) + if (band < NUM_NL80211_BANDS) state->band_start = band + 1; else state->band_start = 0; @@ -1745,7 +1763,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, * case we'll continue with more data in the next round, * but break unconditionally so unsplit data stops here. */ - state->split_start++; + if (state->split) + state->split_start++; + else + state->split_start = 0; break; case 9: if (rdev->wiphy.extended_capabilities && @@ -3241,6 +3262,9 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (key.idx < 0) + return -EINVAL; + if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); @@ -3509,7 +3533,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, memset(mask, 0, sizeof(*mask)); /* Default to all rates enabled */ - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { sband = rdev->wiphy.bands[i]; if (!sband) @@ -3536,10 +3560,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, */ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { - enum ieee80211_band band = nla_type(tx_rates); + enum nl80211_band band = nla_type(tx_rates); int err; - if (band < 0 || band >= IEEE80211_NUM_BANDS) + if (band < 0 || band >= NUM_NL80211_BANDS) return -EINVAL; sband = rdev->wiphy.bands[band]; if (sband == NULL) @@ -3924,10 +3948,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (err) return err; - err = validate_beacon_tx_rate( - rdev, - (enum nl80211_band)(params.chandef.chan->band), - ¶ms.beacon_rate); + err = validate_beacon_tx_rate(rdev, params.chandef.chan->band, + ¶ms.beacon_rate); if (err) return err; } @@ -3956,7 +3978,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) } params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); - if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) + if (params.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) return -EOPNOTSUPP; if (info->attrs[NL80211_ATTR_ACL_POLICY]) { @@ -5864,7 +5886,7 @@ static int nl80211_update_mesh_config(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - struct mesh_config cfg; + struct mesh_config cfg = {}; u32 mask; int err; @@ -6256,9 +6278,9 @@ static int validate_scan_freqs(struct nlattr *freqs) return n_channels; } -static bool is_band_valid(struct wiphy *wiphy, enum ieee80211_band b) +static bool is_band_valid(struct wiphy *wiphy, enum nl80211_band b) { - return b < IEEE80211_NUM_BANDS && wiphy->bands[b]; + return b < NUM_NL80211_BANDS && wiphy->bands[b]; } static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy, @@ -6309,10 +6331,7 @@ static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy, bss_select->behaviour = NL80211_BSS_SELECT_ATTR_RSSI_ADJUST; bss_select->param.adjust.band = adj_param->band; bss_select->param.adjust.delta = adj_param->delta; - if (!is_band_valid( - wiphy, - ((enum ieee80211_band)(bss_select->param.adjust.band)) - )) + if (!is_band_valid(wiphy, bss_select->param.adjust.band)) return -EINVAL; } @@ -6456,10 +6475,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) i++; } } else { - enum ieee80211_band band; + enum nl80211_band band; /* all channels */ - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; if (!wiphy->bands[band]) continue; @@ -6504,7 +6523,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) request->ie_len); } - for (i = 0; i < IEEE80211_NUM_BANDS; i++) + for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) request->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; @@ -6513,9 +6532,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SUPP_RATES], tmp) { - enum ieee80211_band band = nla_type(attr); + enum nl80211_band band = nla_type(attr); - if (band < 0 || band >= IEEE80211_NUM_BANDS) { + if (band < 0 || band >= NUM_NL80211_BANDS) { err = -EINVAL; goto out_free; } @@ -6698,7 +6717,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_sched_scan_request *request; struct nlattr *attr; int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i, n_plans = 0; - enum ieee80211_band band; + enum nl80211_band band; size_t ie_len; struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1]; s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF; @@ -6869,7 +6888,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, } } else { /* all channels */ - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; if (!wiphy->bands[band]) continue; @@ -7021,10 +7040,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]); request->rssi_adjust.band = rssi_adjust->band; request->rssi_adjust.delta = rssi_adjust->delta; - if (!is_band_valid( - wiphy, - (enum ieee80211_band)(request->rssi_adjust.band) - )) { + if (!is_band_valid(wiphy, request->rssi_adjust.band)) { err = -EINVAL; goto out_free; } @@ -8051,14 +8067,14 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) static bool nl80211_parse_mcast_rate(struct cfg80211_registered_device *rdev, - int mcast_rate[IEEE80211_NUM_BANDS], + int mcast_rate[NUM_NL80211_BANDS], int rateval) { struct wiphy *wiphy = &rdev->wiphy; bool found = false; int band, i; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; sband = wiphy->bands[band]; @@ -8240,7 +8256,7 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; - int mcast_rate[IEEE80211_NUM_BANDS]; + int mcast_rate[NUM_NL80211_BANDS]; u32 nla_rate; int err; @@ -8643,7 +8659,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); - if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) { + if (connect.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) { kzfree(connkeys); return -EOPNOTSUPP; } @@ -9549,10 +9565,8 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; - err = validate_beacon_tx_rate( - rdev, - (enum nl80211_band)(setup.chandef.chan->band), - &setup.beacon_rate); + err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band, + &setup.beacon_rate); if (err) return err; } @@ -10443,7 +10457,7 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct nlattr *tb[NUM_NL80211_REKEY_DATA]; - struct cfg80211_gtk_rekey_data rekey_data; + struct cfg80211_gtk_rekey_data rekey_data = {}; int err; if (!info->attrs[NL80211_ATTR_REKEY_DATA]) @@ -10802,13 +10816,13 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) if (!wdev->netdev && !wdev->p2p_started) return -ENETDOWN; } - - if (!vcmd->doit) - return -EOPNOTSUPP; } else { wdev = NULL; } + if (!vcmd->doit) + return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NL80211_ATTR_VENDOR_DATA]); len = nla_len(info->attrs[NL80211_ATTR_VENDOR_DATA]); @@ -11215,7 +11229,7 @@ static int nl80211_tdls_channel_switch(struct sk_buff *skb, * section 10.22.6.2.1. Disallow 5/10Mhz channels as well for now, the * specification is not defined for them. */ - if (chandef.chan->band == IEEE80211_BAND_2GHZ && + if (chandef.chan->band == NL80211_BAND_2GHZ && chandef.width != NL80211_CHAN_WIDTH_20_NOHT && chandef.width != NL80211_CHAN_WIDTH_20) return -EINVAL; @@ -12802,7 +12816,8 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, } void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, - const u8* ie, u8 ie_len, gfp_t gfp) + const u8 *ie, u8 ie_len, + int sig_dbm, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -12828,7 +12843,9 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || (ie_len && ie && - nla_put(msg, NL80211_ATTR_IE, ie_len , ie))) + nla_put(msg, NL80211_ATTR_IE, ie_len, ie)) || + (sig_dbm && + nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -13575,7 +13592,8 @@ void cfg80211_ch_switch_notify(struct net_device *dev, wdev->chandef = *chandef; wdev->preset_chandef = *chandef; - if (wdev->iftype == NL80211_IFTYPE_STATION && + if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && !WARN_ON(!wdev->current_bss)) wdev->current_bss->pub.channel = chandef->chan; diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index c20e37acdc80..a7afd4be29e8 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -537,6 +537,10 @@ static inline int rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed) { int ret; + + if (!rdev->ops->set_wiphy_params) + return -EOPNOTSUPP; + trace_rdev_set_wiphy_params(&rdev->wiphy, changed); ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); trace_rdev_return_int(&rdev->wiphy, ret); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 647591c5dadd..98a5959b52a5 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1591,12 +1591,12 @@ static void reg_process_ht_flags_band(struct wiphy *wiphy, static void reg_process_ht_flags(struct wiphy *wiphy) { - enum ieee80211_band band; + enum nl80211_band band; if (!wiphy) return; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) reg_process_ht_flags_band(wiphy, wiphy->bands[band]); } @@ -1754,7 +1754,7 @@ static void reg_check_channels(void) static void wiphy_update_regulatory(struct wiphy *wiphy, enum nl80211_reg_initiator initiator) { - enum ieee80211_band band; + enum nl80211_band band; struct regulatory_request *lr = get_last_request(); if (ignore_reg_update(wiphy, initiator)) { @@ -1773,7 +1773,7 @@ static void wiphy_update_regulatory(struct wiphy *wiphy, lr->dfs_region = get_cfg80211_regdom()->dfs_region; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) handle_band(wiphy, initiator, wiphy->bands[band]); reg_process_beacons(wiphy); @@ -1898,14 +1898,14 @@ static void handle_band_custom(struct wiphy *wiphy, void wiphy_apply_custom_regulatory(struct wiphy *wiphy, const struct ieee80211_regdomain *regd) { - enum ieee80211_band band; + enum nl80211_band band; unsigned int bands_set = 0; WARN(!(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG), "wiphy should have REGULATORY_CUSTOM_REG\n"); wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!wiphy->bands[band]) continue; handle_band_custom(wiphy, wiphy->bands[band], regd); @@ -2337,7 +2337,7 @@ static void reg_process_self_managed_hints(void) struct wiphy *wiphy; const struct ieee80211_regdomain *tmp; const struct ieee80211_regdomain *regd; - enum ieee80211_band band; + enum nl80211_band band; struct regulatory_request request = {}; list_for_each_entry(rdev, &cfg80211_rdev_list, list) { @@ -2355,7 +2355,7 @@ static void reg_process_self_managed_hints(void) rcu_assign_pointer(wiphy->regd, regd); rcu_free_regdom(tmp); - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) handle_band_custom(wiphy, wiphy->bands[band], regd); reg_process_ht_flags(wiphy); @@ -2423,6 +2423,9 @@ int regulatory_hint_user(const char *alpha2, if (WARN_ON(!alpha2)) return -EINVAL; + if (!is_world_regdom(alpha2) && !is_an_alpha2(alpha2)) + return -EINVAL; + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; @@ -2515,7 +2518,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) } EXPORT_SYMBOL(regulatory_hint); -void regulatory_hint_country_ie(struct wiphy *wiphy, enum ieee80211_band band, +void regulatory_hint_country_ie(struct wiphy *wiphy, enum nl80211_band band, const u8 *country_ie, u8 country_ie_len) { char alpha2[2]; @@ -2615,11 +2618,11 @@ static void restore_alpha2(char *alpha2, bool reset_user) static void restore_custom_reg_settings(struct wiphy *wiphy) { struct ieee80211_supported_band *sband; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_channel *chan; int i; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; @@ -2734,9 +2737,9 @@ void regulatory_hint_disconnect(void) static bool freq_is_chan_12_13_14(u16 freq) { - if (freq == ieee80211_channel_to_frequency(12, IEEE80211_BAND_2GHZ) || - freq == ieee80211_channel_to_frequency(13, IEEE80211_BAND_2GHZ) || - freq == ieee80211_channel_to_frequency(14, IEEE80211_BAND_2GHZ)) + if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) || + freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) || + freq == ieee80211_channel_to_frequency(14, NL80211_BAND_2GHZ)) return true; return false; } @@ -2761,7 +2764,7 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, if (beacon_chan->beacon_found || beacon_chan->flags & IEEE80211_CHAN_RADAR || - (beacon_chan->band == IEEE80211_BAND_2GHZ && + (beacon_chan->band == NL80211_BAND_2GHZ && !freq_is_chan_12_13_14(beacon_chan->center_freq))) return 0; @@ -2813,7 +2816,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd) power_rule = ®_rule->power_rule; if (reg_rule->flags & NL80211_RRF_AUTO_BW) - snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO", + snprintf(bw, sizeof(bw), "%d KHz, %u KHz AUTO", freq_range->max_bandwidth_khz, reg_get_max_bandwidth(rd, reg_rule)); else diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 7bbe2d138d2a..ff078f093989 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -104,7 +104,7 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, * information for a band the BSS is not present in it will be ignored. */ void regulatory_hint_country_ie(struct wiphy *wiphy, - enum ieee80211_band band, + enum nl80211_band band, const u8 *country_ie, u8 country_ie_len); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index a6451bf9a717..afe55f6ebb6e 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -582,7 +582,7 @@ static int cmp_bss(struct cfg80211_bss *a, } static bool cfg80211_bss_type_match(u16 capability, - enum ieee80211_band band, + enum nl80211_band band, enum ieee80211_bss_type bss_type) { bool ret = true; @@ -591,7 +591,7 @@ static bool cfg80211_bss_type_match(u16 capability, if (bss_type == IEEE80211_BSS_TYPE_ANY) return ret; - if (band == IEEE80211_BAND_60GHZ) { + if (band == NL80211_BAND_60GHZ) { mask = WLAN_CAPABILITY_DMG_TYPE_MASK; switch (bss_type) { case IEEE80211_BSS_TYPE_ESS: @@ -947,14 +947,14 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, * be grouped with this beacon for updates ... */ if (!cfg80211_combine_bsses(rdev, new)) { - kfree(new); + bss_ref_put(rdev, new); goto drop; } } if (rdev->bss_entries >= bss_entries_limit && !cfg80211_bss_expire_oldest(rdev)) { - kfree(new); + bss_ref_put(rdev, new); goto drop; } @@ -1075,7 +1075,7 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, if (!res) return NULL; - if (channel->band == IEEE80211_BAND_60GHZ) { + if (channel->band == NL80211_BAND_60GHZ) { bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK; if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) @@ -1158,7 +1158,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, if (!res) return NULL; - if (channel->band == IEEE80211_BAND_60GHZ) { + if (channel->band == NL80211_BAND_60GHZ) { bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK; if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) @@ -1254,7 +1254,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, struct iw_scan_req *wreq = NULL; struct cfg80211_scan_request *creq = NULL; int i, err, n_channels = 0; - enum ieee80211_band band; + enum nl80211_band band; if (!netif_running(dev)) return -ENETDOWN; @@ -1298,7 +1298,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, /* translate "Scan on frequencies" request */ i = 0; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; if (!wiphy->bands[band]) @@ -1358,7 +1358,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, creq->n_ssids = 0; } - for (i = 0; i < IEEE80211_NUM_BANDS; i++) + for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 5527bbb80f71..593258220735 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -106,7 +106,7 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) return -ENOMEM; if (wdev->conn->params.channel) { - enum ieee80211_band band = wdev->conn->params.channel->band; + enum nl80211_band band = wdev->conn->params.channel->band; struct ieee80211_supported_band *sband = wdev->wiphy->bands[band]; @@ -118,11 +118,11 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) request->rates[band] = (1 << sband->n_bitrates) - 1; } else { int i = 0, j; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_supported_band *bands; struct ieee80211_channel *channel; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { bands = wdev->wiphy->bands[band]; if (!bands) continue; @@ -290,6 +290,15 @@ void cfg80211_conn_work(struct work_struct *work) rtnl_unlock(); } +static void cfg80211_step_auth_next(struct cfg80211_conn *conn, + struct cfg80211_bss *bss) +{ + memcpy(conn->bssid, bss->bssid, ETH_ALEN); + conn->params.bssid = conn->bssid; + conn->params.channel = bss->channel; + conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; +} + /* Returned bss is reference counted and must be cleaned up appropriately. */ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) { @@ -307,10 +316,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) if (!bss) return NULL; - memcpy(wdev->conn->bssid, bss->bssid, ETH_ALEN); - wdev->conn->params.bssid = wdev->conn->bssid; - wdev->conn->params.channel = bss->channel; - wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; + cfg80211_step_auth_next(wdev->conn, bss); schedule_work(&rdev->conn_work); return bss; @@ -546,7 +552,7 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, if (wdev->current_bss) return -EALREADY; - if (WARN_ON(wdev->conn)) + if (wdev->conn) return -EINPROGRESS; wdev->conn = kzalloc(sizeof(*wdev->conn), GFP_KERNEL); @@ -584,7 +590,12 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, wdev->conn->params.ssid_len = wdev->ssid_len; /* see if we have the bss already */ - bss = cfg80211_get_conn_bss(wdev); + bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, + wdev->conn->params.bssid, + wdev->conn->params.ssid, + wdev->conn->params.ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (prev_bssid) { memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN); @@ -595,6 +606,7 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, if (bss) { enum nl80211_timeout_reason treason; + cfg80211_step_auth_next(wdev->conn, bss); err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { @@ -1116,6 +1128,18 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, connect->crypto.ciphers_pairwise[0] = cipher; } } + } else { + if (WARN_ON(connkeys)) + return -EINVAL; + + /* connect can point to wdev->wext.connect which + * can hold key data from a previous connection + */ + connect->key = NULL; + connect->key_len = 0; + connect->key_idx = 0; + connect->crypto.cipher_group = 0; + connect->crypto.n_ciphers_pairwise = 0; } wdev->connect_keys = connkeys; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 77eb73d3c83f..48a2df99e7f9 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -110,7 +110,7 @@ conf->dot11MeshHWMPconfirmationInterval; \ } while (0) -#define CHAN_ENTRY __field(enum ieee80211_band, band) \ +#define CHAN_ENTRY __field(enum nl80211_band, band) \ __field(u16, center_freq) #define CHAN_ASSIGN(chan) \ do { \ @@ -125,7 +125,7 @@ #define CHAN_PR_FMT "band: %d, freq: %u" #define CHAN_PR_ARG __entry->band, __entry->center_freq -#define CHAN_DEF_ENTRY __field(enum ieee80211_band, band) \ +#define CHAN_DEF_ENTRY __field(enum nl80211_band, band) \ __field(u32, control_freq) \ __field(u32, width) \ __field(u32, center_freq1) \ @@ -2650,7 +2650,7 @@ TRACE_EVENT(cfg80211_scan_done, TP_STRUCT__entry( __field(u32, n_channels) __dynamic_array(u8, ie, request ? request->ie_len : 0) - __array(u32, rates, IEEE80211_NUM_BANDS) + __array(u32, rates, NUM_NL80211_BANDS) __field(u32, wdev_id) MAC_ENTRY(wiphy_mac) __field(bool, no_cck) @@ -2661,7 +2661,7 @@ TRACE_EVENT(cfg80211_scan_done, memcpy(__get_dynamic_array(ie), request->ie, request->ie_len); memcpy(__entry->rates, request->rates, - IEEE80211_NUM_BANDS); + NUM_NL80211_BANDS); __entry->wdev_id = request->wdev ? request->wdev->identifier : 0; if (request->wiphy) diff --git a/net/wireless/util.c b/net/wireless/util.c index 3e7525d0d8e3..95eab2690f4f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -48,7 +48,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, if (WARN_ON(!sband)) return 1; - if (sband->band == IEEE80211_BAND_2GHZ) { + if (sband->band == NL80211_BAND_2GHZ) { if (scan_width == NL80211_BSS_CHAN_WIDTH_5 || scan_width == NL80211_BSS_CHAN_WIDTH_10) mandatory_flag = IEEE80211_RATE_MANDATORY_G; @@ -66,26 +66,26 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, } EXPORT_SYMBOL(ieee80211_mandatory_rates); -int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band) +int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ if (chan <= 0) return 0; /* not supported */ switch (band) { - case IEEE80211_BAND_2GHZ: + case NL80211_BAND_2GHZ: if (chan == 14) return 2484; else if (chan < 14) return 2407 + chan * 5; break; - case IEEE80211_BAND_5GHZ: + case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) return 4000 + chan * 5; else return 5000 + chan * 5; break; - case IEEE80211_BAND_60GHZ: + case NL80211_BAND_60GHZ: if (chan < 5) return 56160 + chan * 2160; break; @@ -117,11 +117,11 @@ EXPORT_SYMBOL(ieee80211_frequency_to_channel); struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, int freq) { - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_supported_band *sband; int i; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) @@ -138,12 +138,12 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, EXPORT_SYMBOL(__ieee80211_get_channel); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, - enum ieee80211_band band) + enum nl80211_band band) { int i, want; switch (band) { - case IEEE80211_BAND_5GHZ: + case NL80211_BAND_5GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 60 || @@ -156,7 +156,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, } WARN_ON(want); break; - case IEEE80211_BAND_2GHZ: + case NL80211_BAND_2GHZ: want = 7; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 10) { @@ -186,12 +186,12 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, } WARN_ON(want != 0 && want != 3 && want != 6); break; - case IEEE80211_BAND_60GHZ: + case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; - case IEEE80211_NUM_BANDS: + case NUM_NL80211_BANDS: WARN_ON(1); break; } @@ -199,9 +199,9 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, void ieee80211_set_bitrate_flags(struct wiphy *wiphy) { - enum ieee80211_band band; + enum nl80211_band band; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) set_mandatory_flags_band(wiphy->bands[band], band); } @@ -410,8 +410,8 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) } EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); -int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype) +static int __ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, + enum nl80211_iftype iftype, bool is_amsdu) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; u16 hdrlen, ethertype; @@ -505,7 +505,7 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, payload = skb->data + hdrlen; ethertype = (payload[6] << 8) | payload[7]; - if (likely((ether_addr_equal(payload, rfc1042_header) && + if (likely((!is_amsdu && ether_addr_equal(payload, rfc1042_header) && ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || ether_addr_equal(payload, bridge_tunnel_header))) { /* remove RFC1042 or Bridge-Tunnel encapsulation and @@ -526,6 +526,12 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, } return 0; } + +int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, + enum nl80211_iftype iftype) +{ + return __ieee80211_data_to_8023(skb, addr, iftype, false); +} EXPORT_SYMBOL(ieee80211_data_to_8023); int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, @@ -685,6 +691,9 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, /* the last MSDU has no padding */ if (subframe_len > remaining) goto purge; + /* mitigate A-MSDU aggregation injection attacks */ + if (ether_addr_equal(eth->h_dest, rfc1042_header)) + goto purge; skb_pull(skb, sizeof(struct ethhdr)); /* reuse skb for the last subframe */ @@ -950,6 +959,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, switch (otype) { case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, true); break; case NL80211_IFTYPE_ADHOC: @@ -965,6 +975,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_MESH_POINT: /* mesh should be handled? */ break; + case NL80211_IFTYPE_OCB: + cfg80211_leave_ocb(rdev, dev); + break; default: break; } @@ -1330,22 +1343,22 @@ size_t ieee80211_ie_split(const u8 *ies, size_t ielen, EXPORT_SYMBOL(ieee80211_ie_split); bool ieee80211_operating_class_to_band(u8 operating_class, - enum ieee80211_band *band) + enum nl80211_band *band) { switch (operating_class) { case 112: case 115 ... 127: case 128 ... 130: - *band = IEEE80211_BAND_5GHZ; + *band = NL80211_BAND_5GHZ; return true; case 81: case 82: case 83: case 84: - *band = IEEE80211_BAND_2GHZ; + *band = NL80211_BAND_2GHZ; return true; case 180: - *band = IEEE80211_BAND_60GHZ; + *band = NL80211_BAND_60GHZ; return true; } @@ -1818,10 +1831,10 @@ int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy) { - enum ieee80211_band band; + enum nl80211_band band; unsigned int n_channels = 0; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) n_channels += wiphy->bands[band]->n_channels; @@ -1908,3 +1921,48 @@ bool cfg80211_is_gratuitous_arp_unsolicited_na(struct sk_buff *skb) return false; } EXPORT_SYMBOL(cfg80211_is_gratuitous_arp_unsolicited_na); + +/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ +struct iapp_layer2_update { + u8 da[ETH_ALEN]; /* broadcast */ + u8 sa[ETH_ALEN]; /* STA addr */ + __be16 len; /* 6 */ + u8 dsap; /* 0 */ + u8 ssap; /* 0 */ + u8 control; + u8 xid_info[3]; +} __packed; + +void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr) +{ + struct iapp_layer2_update *msg; + struct sk_buff *skb; + + /* Send Level 2 Update Frame to update forwarding tables in layer 2 + * bridge devices */ + + skb = dev_alloc_skb(sizeof(*msg)); + if (!skb) + return; + msg = (struct iapp_layer2_update *)skb_put(skb, sizeof(*msg)); + + /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) + * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ + + eth_broadcast_addr(msg->da); + ether_addr_copy(msg->sa, addr); + msg->len = htons(6); + msg->dsap = 0; + msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ + msg->control = 0xaf; /* XID response lsb.1111F101. + * F=0 (no poll command; unsolicited frame) */ + msg->xid_info[0] = 0x81; /* XID format identifier */ + msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ + msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ + + skb->dev = dev; + skb->protocol = eth_type_trans(skb, dev); + memset(skb->cb, 0, sizeof(skb->cb)); + netif_rx_ni(skb); +} +EXPORT_SYMBOL(cfg80211_send_layer2_update); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index cd119943612b..7139f9bcaf0c 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -32,13 +32,13 @@ int cfg80211_wext_giwname(struct net_device *dev, if (!wdev) return -EOPNOTSUPP; - sband = wdev->wiphy->bands[IEEE80211_BAND_5GHZ]; + sband = wdev->wiphy->bands[NL80211_BAND_5GHZ]; if (sband) { is_a = true; is_ht |= sband->ht_cap.ht_supported; } - sband = wdev->wiphy->bands[IEEE80211_BAND_2GHZ]; + sband = wdev->wiphy->bands[NL80211_BAND_2GHZ]; if (sband) { int i; /* Check for mandatory rates */ @@ -143,7 +143,7 @@ int cfg80211_wext_giwrange(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_range *range = (struct iw_range *) extra; - enum ieee80211_band band; + enum nl80211_band band; int i, c = 0; if (!wdev) @@ -215,7 +215,7 @@ int cfg80211_wext_giwrange(struct net_device *dev, } } - for (band = 0; band < IEEE80211_NUM_BANDS; band ++) { + for (band = 0; band < NUM_NL80211_BANDS; band ++) { struct ieee80211_supported_band *sband; sband = wdev->wiphy->bands[band]; @@ -265,11 +265,11 @@ int cfg80211_wext_freq(struct iw_freq *freq) * -EINVAL for impossible things. */ if (freq->e == 0) { - enum ieee80211_band band = IEEE80211_BAND_2GHZ; + enum nl80211_band band = NL80211_BAND_2GHZ; if (freq->m < 0) return 0; if (freq->m > 14) - band = IEEE80211_BAND_5GHZ; + band = NL80211_BAND_5GHZ; return ieee80211_channel_to_frequency(freq->m, band); } else { int i, div = 1000000; @@ -1245,7 +1245,7 @@ static int cfg80211_wext_siwrate(struct net_device *dev, maxrate = rate->value / 100000; } - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wdev->wiphy->bands[band]; if (sband == NULL) continue; diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index b50ee5d622e1..9a929010ea9d 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -656,7 +656,8 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev) return NULL; } -static int iw_handler_get_iwstats(struct net_device * dev, +/* noinline to avoid a bogus warning with -O3 */ +static noinline int iw_handler_get_iwstats(struct net_device * dev, struct iw_request_info * info, union iwreq_data * wrqu, char * extra) @@ -894,8 +895,9 @@ out: int call_commit_handler(struct net_device *dev) { #ifdef CONFIG_WIRELESS_EXT - if ((netif_running(dev)) && - (dev->wireless_handlers->standard[0] != NULL)) + if (netif_running(dev) && + dev->wireless_handlers && + dev->wireless_handlers->standard[0]) /* Call the commit handler on the driver */ return dev->wireless_handlers->standard[0](dev, NULL, NULL, NULL); diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 98ff9d9e1aa9..8cc9a5f406ee 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -43,7 +43,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, if (!wdev->wext.connect.ssid_len) return 0; - if (wdev->wext.keys) { + if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; diff --git a/net/wireless/wext-spy.c b/net/wireless/wext-spy.c index 33bef22e44e9..b379a0371653 100644 --- a/net/wireless/wext-spy.c +++ b/net/wireless/wext-spy.c @@ -120,8 +120,8 @@ int iw_handler_set_thrspy(struct net_device * dev, return -EOPNOTSUPP; /* Just do it */ - memcpy(&(spydata->spy_thr_low), &(threshold->low), - 2 * sizeof(struct iw_quality)); + spydata->spy_thr_low = threshold->low; + spydata->spy_thr_high = threshold->high; /* Clear flag */ memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); @@ -147,8 +147,8 @@ int iw_handler_get_thrspy(struct net_device * dev, return -EOPNOTSUPP; /* Just do it */ - memcpy(&(threshold->low), &(spydata->spy_thr_low), - 2 * sizeof(struct iw_quality)); + threshold->low = spydata->spy_thr_low; + threshold->high = spydata->spy_thr_high; return 0; } @@ -173,10 +173,10 @@ static void iw_send_thrspy_event(struct net_device * dev, memcpy(threshold.addr.sa_data, address, ETH_ALEN); threshold.addr.sa_family = ARPHRD_ETHER; /* Copy stats */ - memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality)); + threshold.qual = *wstats; /* Copy also thresholds */ - memcpy(&(threshold.low), &(spydata->spy_thr_low), - 2 * sizeof(struct iw_quality)); + threshold.low = spydata->spy_thr_low; + threshold.high = spydata->spy_thr_high; /* Send event to user space */ wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5dca42dbc737..156639be7ed0 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -100,7 +100,7 @@ int x25_parse_address_block(struct sk_buff *skb, } len = *skb->data; - needed = 1 + (len >> 4) + (len & 0x0f); + needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims @@ -288,7 +288,7 @@ static struct sock *x25_find_listener(struct x25_address *addr, sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || - !strcmp(addr->x25_addr, + !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* @@ -550,7 +550,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol, if (protocol) goto out; - rc = -ENOBUFS; + rc = -ENOMEM; if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; @@ -679,16 +679,21 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || - addr->sx25_family != AF_X25) { + addr->sx25_family != AF_X25 || + strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } - len = strlen(addr->sx25_addr.x25_addr); - for (i = 0; i < len; i++) { - if (!isdigit(addr->sx25_addr.x25_addr[i])) { - rc = -EINVAL; - goto out; + /* check for the null_x25_address */ + if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { + + len = strlen(addr->sx25_addr.x25_addr); + for (i = 0; i < len; i++) { + if (!isdigit(addr->sx25_addr.x25_addr[i])) { + rc = -EINVAL; + goto out; + } } } @@ -760,12 +765,17 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state == TCP_ESTABLISHED) goto out; + rc = -EALREADY; /* Do nothing if call is already in progress */ + if (sk->sk_state == TCP_SYN_SENT) + goto out; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || - addr->sx25_family != AF_X25) + addr->sx25_family != AF_X25 || + strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; @@ -806,7 +816,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) - goto out_put_neigh; + goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) @@ -815,7 +825,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTED; rc = 0; out_put_neigh: - if (rc) { + if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; @@ -1039,6 +1049,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; + x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 39231237e1c3..30f71620d4e3 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -120,8 +120,10 @@ int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, goto drop; } - if (!pskb_may_pull(skb, 1)) + if (!pskb_may_pull(skb, 1)) { + x25_neigh_put(nb); return 0; + } switch (skb->data[0]) { diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 6b5af65f491f..a3163645b5bd 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -368,6 +368,12 @@ void x25_disconnect(struct sock *sk, int reason, unsigned char cause, sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } + if (x25->neighbour) { + read_lock_bh(&x25_list_lock); + x25_neigh_put(x25->neighbour); + x25->neighbour = NULL; + read_unlock_bh(&x25_list_lock); + } } /* diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index bda1a13628a8..16cdd2c9221f 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -20,6 +20,17 @@ config XFRM_USER If unsure, say Y. +config XFRM_USER_COMPAT + tristate "Compatible ABI support" + depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT && \ + HAVE_EFFICIENT_UNALIGNED_ACCESS + select WANT_COMPAT_NETLINK_MESSAGES + help + Transformation(XFRM) user configuration interface like IPsec + used by compatible Linux applications. + + If unsure, say N. + config XFRM_SUB_POLICY bool "Transformation sub policy support" depends on XFRM diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index c0e961983f17..516c78e22e2a 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -8,4 +8,5 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o +obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c new file mode 100644 index 000000000000..7158078a71f1 --- /dev/null +++ b/net/xfrm/xfrm_compat.c @@ -0,0 +1,685 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XFRM compat layer + * Author: Dmitry Safonov <dima@arista.com> + * Based on code and translator idea by: Florian Westphal <fw@strlen.de> + */ +#include <linux/compat.h> +#include <linux/vmalloc.h> +#include <linux/xfrm.h> +#include <net/xfrm.h> + +struct compat_xfrm_lifetime_cfg { + compat_u64 soft_byte_limit, hard_byte_limit; + compat_u64 soft_packet_limit, hard_packet_limit; + compat_u64 soft_add_expires_seconds, hard_add_expires_seconds; + compat_u64 soft_use_expires_seconds, hard_use_expires_seconds; +}; /* same size on 32bit, but only 4 byte alignment required */ + +struct compat_xfrm_lifetime_cur { + compat_u64 bytes, packets, add_time, use_time; +}; /* same size on 32bit, but only 4 byte alignment required */ + +struct compat_xfrm_userpolicy_info { + struct xfrm_selector sel; + struct compat_xfrm_lifetime_cfg lft; + struct compat_xfrm_lifetime_cur curlft; + __u32 priority, index; + u8 dir, action, flags, share; + /* 4 bytes additional padding on 64bit */ +}; + +struct compat_xfrm_usersa_info { + struct xfrm_selector sel; + struct xfrm_id id; + xfrm_address_t saddr; + struct compat_xfrm_lifetime_cfg lft; + struct compat_xfrm_lifetime_cur curlft; + struct xfrm_stats stats; + __u32 seq, reqid; + u16 family; + u8 mode, replay_window, flags; + /* 4 bytes additional padding on 64bit */ +}; + +struct compat_xfrm_user_acquire { + struct xfrm_id id; + xfrm_address_t saddr; + struct xfrm_selector sel; + struct compat_xfrm_userpolicy_info policy; + /* 4 bytes additional padding on 64bit */ + __u32 aalgos, ealgos, calgos, seq; +}; + +struct compat_xfrm_userspi_info { + struct compat_xfrm_usersa_info info; + /* 4 bytes additional padding on 64bit */ + __u32 min, max; +}; + +struct compat_xfrm_user_expire { + struct compat_xfrm_usersa_info state; + /* 8 bytes additional padding on 64bit */ + u8 hard; +}; + +struct compat_xfrm_user_polexpire { + struct compat_xfrm_userpolicy_info pol; + /* 8 bytes additional padding on 64bit */ + u8 hard; +}; + +#define XMSGSIZE(type) sizeof(struct type) + +static const int compat_msg_min[XFRM_NR_MSGTYPES] = { + [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_usersa_info), + [XFRM_MSG_DELSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), + [XFRM_MSG_GETSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), + [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userpolicy_info), + [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userspi_info), + [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_acquire), + [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_expire), + [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userpolicy_info), + [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_usersa_info), + [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_polexpire), + [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush), + [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0, + [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id), + [XFRM_MSG_GETAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id), + [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), + [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_NEWSADINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping) +}; + +static const struct nla_policy compat_policy[XFRMA_MAX+1] = { + [XFRMA_SA] = { .len = XMSGSIZE(compat_xfrm_usersa_info)}, + [XFRMA_POLICY] = { .len = XMSGSIZE(compat_xfrm_userpolicy_info)}, + [XFRMA_LASTUSED] = { .type = NLA_U64}, + [XFRMA_ALG_AUTH_TRUNC] = { .len = sizeof(struct xfrm_algo_auth)}, + [XFRMA_ALG_AEAD] = { .len = sizeof(struct xfrm_algo_aead) }, + [XFRMA_ALG_AUTH] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_CRYPT] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_COMP] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, + [XFRMA_TMPL] = { .len = sizeof(struct xfrm_user_tmpl) }, + [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) }, + [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, + [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, + [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, + [XFRMA_ETIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SRCADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_COADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_POLICY_TYPE] = { .len = sizeof(struct xfrm_userpolicy_type)}, + [XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) }, + [XFRMA_KMADDRESS] = { .len = sizeof(struct xfrm_user_kmaddress) }, + [XFRMA_MARK] = { .len = sizeof(struct xfrm_mark) }, + [XFRMA_TFCPAD] = { .type = NLA_U32 }, + [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) }, + [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, + [XFRMA_PROTO] = { .type = NLA_U8 }, + [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 }, +}; + +static inline void *kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) +{ +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + if (IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) + return true; +#endif + return false; +} + +static inline int nla_total_size_64bit(int payload) +{ + return NLA_ALIGN(nla_attr_size(payload)) +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + + NLA_ALIGN(nla_attr_size(0)) +#endif + ; +} + +static inline int nla_align_64bit(struct sk_buff *skb, int padattr) +{ + if (nla_need_padding_for_64bit(skb) && + !nla_reserve(skb, padattr, 0)) + return -EMSGSIZE; + + return 0; +} + +struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, + int attrlen, int padattr) +{ + nla_align_64bit(skb, padattr); + + return __nla_reserve(skb, attrtype, attrlen); +} + +static inline void __nla_put_64bit(struct sk_buff *skb, int attrtype, + int attrlen, const void *data, int padattr) +{ + struct nlattr *nla; + + nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); + memcpy(nla_data(nla), data, attrlen); +} + +static inline int nla_put_64bit(struct sk_buff *skb, int attrtype, + int attrlen, const void *data, int padattr) +{ + size_t len; + + if (nla_need_padding_for_64bit(skb)) + len = nla_total_size_64bit(attrlen); + else + len = nla_total_size(attrlen); + if (unlikely(skb_tailroom(skb) < len)) + return -EMSGSIZE; + + __nla_put_64bit(skb, attrtype, attrlen, data, padattr); + return 0; +} + +static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, + const struct nlmsghdr *nlh_src, u16 type) +{ + int payload = compat_msg_min[type]; + int src_len = xfrm_msg_min[type]; + struct nlmsghdr *nlh_dst; + + /* Compat messages are shorter or equal to native (+padding) */ + if (WARN_ON_ONCE(src_len < payload)) + return ERR_PTR(-EMSGSIZE); + + nlh_dst = nlmsg_put(skb, nlh_src->nlmsg_pid, nlh_src->nlmsg_seq, + nlh_src->nlmsg_type, payload, nlh_src->nlmsg_flags); + if (!nlh_dst) + return ERR_PTR(-EMSGSIZE); + + memset(nlmsg_data(nlh_dst), 0, payload); + + switch (nlh_src->nlmsg_type) { + /* Compat message has the same layout as native */ + case XFRM_MSG_DELSA: + case XFRM_MSG_DELPOLICY: + case XFRM_MSG_FLUSHSA: + case XFRM_MSG_FLUSHPOLICY: + case XFRM_MSG_NEWAE: + case XFRM_MSG_REPORT: + case XFRM_MSG_MIGRATE: + case XFRM_MSG_NEWSADINFO: + case XFRM_MSG_NEWSPDINFO: + case XFRM_MSG_MAPPING: + WARN_ON_ONCE(src_len != payload); + memcpy(nlmsg_data(nlh_dst), nlmsg_data(nlh_src), src_len); + break; + /* 4 byte alignment for trailing u64 on native, but not on compat */ + case XFRM_MSG_NEWSA: + case XFRM_MSG_NEWPOLICY: + case XFRM_MSG_UPDSA: + case XFRM_MSG_UPDPOLICY: + WARN_ON_ONCE(src_len != payload + 4); + memcpy(nlmsg_data(nlh_dst), nlmsg_data(nlh_src), payload); + break; + case XFRM_MSG_EXPIRE: { + const struct xfrm_user_expire *src_ue = nlmsg_data(nlh_src); + struct compat_xfrm_user_expire *dst_ue = nlmsg_data(nlh_dst); + + /* compat_xfrm_user_expire has 4-byte smaller state */ + memcpy(dst_ue, src_ue, sizeof(dst_ue->state)); + dst_ue->hard = src_ue->hard; + break; + } + case XFRM_MSG_ACQUIRE: { + const struct xfrm_user_acquire *src_ua = nlmsg_data(nlh_src); + struct compat_xfrm_user_acquire *dst_ua = nlmsg_data(nlh_dst); + + memcpy(dst_ua, src_ua, offsetof(struct compat_xfrm_user_acquire, aalgos)); + dst_ua->aalgos = src_ua->aalgos; + dst_ua->ealgos = src_ua->ealgos; + dst_ua->calgos = src_ua->calgos; + dst_ua->seq = src_ua->seq; + break; + } + case XFRM_MSG_POLEXPIRE: { + const struct xfrm_user_polexpire *src_upe = nlmsg_data(nlh_src); + struct compat_xfrm_user_polexpire *dst_upe = nlmsg_data(nlh_dst); + + /* compat_xfrm_user_polexpire has 4-byte smaller state */ + memcpy(dst_upe, src_upe, sizeof(dst_upe->pol)); + dst_upe->hard = src_upe->hard; + break; + } + case XFRM_MSG_ALLOCSPI: { + const struct xfrm_userspi_info *src_usi = nlmsg_data(nlh_src); + struct compat_xfrm_userspi_info *dst_usi = nlmsg_data(nlh_dst); + + /* compat_xfrm_user_polexpire has 4-byte smaller state */ + memcpy(dst_usi, src_usi, sizeof(src_usi->info)); + dst_usi->min = src_usi->min; + dst_usi->max = src_usi->max; + break; + } + /* Not being sent by kernel */ + case XFRM_MSG_GETSA: + case XFRM_MSG_GETPOLICY: + case XFRM_MSG_GETAE: + case XFRM_MSG_GETSADINFO: + case XFRM_MSG_GETSPDINFO: + default: + WARN_ONCE(1, "unsupported nlmsg_type %d", nlh_src->nlmsg_type); + return ERR_PTR(-EOPNOTSUPP); + } + + return nlh_dst; +} + +static int xfrm_nla_cpy(struct sk_buff *dst, const struct nlattr *src, int len) +{ + return nla_put(dst, src->nla_type, len, nla_data(src)); +} + +static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) +{ + switch (src->nla_type) { + case XFRMA_PAD: + case XFRMA_OFFLOAD_DEV: + /* Ignore */ + return 0; + case XFRMA_ALG_AUTH: + case XFRMA_ALG_CRYPT: + case XFRMA_ALG_COMP: + case XFRMA_ENCAP: + case XFRMA_TMPL: + return xfrm_nla_cpy(dst, src, nla_len(src)); + case XFRMA_SA: + return xfrm_nla_cpy(dst, src, XMSGSIZE(compat_xfrm_usersa_info)); + case XFRMA_POLICY: + return xfrm_nla_cpy(dst, src, XMSGSIZE(compat_xfrm_userpolicy_info)); + case XFRMA_SEC_CTX: + return xfrm_nla_cpy(dst, src, nla_len(src)); + case XFRMA_LTIME_VAL: + return nla_put_64bit(dst, src->nla_type, nla_len(src), + nla_data(src), XFRMA_PAD); + case XFRMA_REPLAY_VAL: + case XFRMA_REPLAY_THRESH: + case XFRMA_ETIMER_THRESH: + case XFRMA_SRCADDR: + case XFRMA_COADDR: + return xfrm_nla_cpy(dst, src, nla_len(src)); + case XFRMA_LASTUSED: + return nla_put_64bit(dst, src->nla_type, nla_len(src), + nla_data(src), XFRMA_PAD); + case XFRMA_POLICY_TYPE: + case XFRMA_MIGRATE: + case XFRMA_ALG_AEAD: + case XFRMA_KMADDRESS: + case XFRMA_ALG_AUTH_TRUNC: + case XFRMA_MARK: + case XFRMA_TFCPAD: + case XFRMA_REPLAY_ESN_VAL: + case XFRMA_SA_EXTRA_FLAGS: + case XFRMA_PROTO: + case XFRMA_ADDRESS_FILTER: + case XFRMA_OUTPUT_MARK: + return xfrm_nla_cpy(dst, src, nla_len(src)); + default: + BUILD_BUG_ON(XFRMA_MAX != XFRMA_OUTPUT_MARK); + WARN_ONCE(1, "unsupported nla_type %d", src->nla_type); + return -EOPNOTSUPP; + } +} + +/* Take kernel-built (64bit layout) and create 32bit layout for userspace */ +static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src) +{ + u16 type = nlh_src->nlmsg_type - XFRM_MSG_BASE; + const struct nlattr *nla, *attrs; + struct nlmsghdr *nlh_dst; + int len, remaining; + + nlh_dst = xfrm_nlmsg_put_compat(dst, nlh_src, type); + if (IS_ERR(nlh_dst)) + return PTR_ERR(nlh_dst); + + attrs = nlmsg_attrdata(nlh_src, xfrm_msg_min[type]); + len = nlmsg_attrlen(nlh_src, xfrm_msg_min[type]); + + nla_for_each_attr(nla, attrs, len, remaining) { + int err = xfrm_xlate64_attr(dst, nla); + + if (err) + return err; + } + + nlmsg_end(dst, nlh_dst); + + return 0; +} + +static int xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src) +{ + u16 type = nlh_src->nlmsg_type - XFRM_MSG_BASE; + struct sk_buff *new = NULL; + int err; + + if (WARN_ON_ONCE(type >= ARRAY_SIZE(xfrm_msg_min))) + return -EOPNOTSUPP; + + if (skb_shinfo(skb)->frag_list == NULL) { + new = alloc_skb(skb->len + skb_tailroom(skb), GFP_ATOMIC); + if (!new) + return -ENOMEM; + skb_shinfo(skb)->frag_list = new; + } + + err = xfrm_xlate64(skb_shinfo(skb)->frag_list, nlh_src); + if (err) { + if (new) { + kfree_skb(new); + skb_shinfo(skb)->frag_list = NULL; + } + return err; + } + + return 0; +} + +/* Calculates len of translated 64-bit message. */ +static size_t xfrm_user_rcv_calculate_len64(const struct nlmsghdr *src, + struct nlattr *attrs[XFRMA_MAX+1]) +{ + size_t len = nlmsg_len(src); + + switch (src->nlmsg_type) { + case XFRM_MSG_NEWSA: + case XFRM_MSG_NEWPOLICY: + case XFRM_MSG_ALLOCSPI: + case XFRM_MSG_ACQUIRE: + case XFRM_MSG_UPDPOLICY: + case XFRM_MSG_UPDSA: + len += 4; + break; + case XFRM_MSG_EXPIRE: + case XFRM_MSG_POLEXPIRE: + len += 8; + break; + default: + break; + } + + if (attrs[XFRMA_SA]) + len += 4; + if (attrs[XFRMA_POLICY]) + len += 4; + + /* XXX: some attrs may need to be realigned + * if !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + */ + + return len; +} + +static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src, + size_t size, int copy_len, int payload) +{ + struct nlmsghdr *nlmsg = dst; + struct nlattr *nla; + + if (WARN_ON_ONCE(copy_len > payload)) + copy_len = payload; + + if (size - *pos < nla_attr_size(payload)) + return -ENOBUFS; + + nla = dst + *pos; + + memcpy(nla, src, nla_attr_size(copy_len)); + nla->nla_len = nla_attr_size(payload); + *pos += nla_attr_size(payload); + nlmsg->nlmsg_len += nla->nla_len; + + memset(dst + *pos, 0, payload - copy_len); + *pos += payload - copy_len; + + return 0; +} + +static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, + size_t *pos, size_t size) +{ + int type = nla_type(nla); + u16 pol_len32, pol_len64; + int err; + + if (type > XFRMA_MAX) { + BUILD_BUG_ON(XFRMA_MAX != XFRMA_OUTPUT_MARK); + return -EOPNOTSUPP; + } + if (nla_len(nla) < compat_policy[type].len) { + return -EOPNOTSUPP; + } + + pol_len32 = compat_policy[type].len; + pol_len64 = xfrma_policy[type].len; + + /* XFRMA_SA and XFRMA_POLICY - need to know how-to translate */ + if (pol_len32 != pol_len64) { + if (nla_len(nla) != compat_policy[type].len) { + return -EOPNOTSUPP; + } + err = xfrm_attr_cpy32(dst, pos, nla, size, pol_len32, pol_len64); + if (err) + return err; + } + + return xfrm_attr_cpy32(dst, pos, nla, size, nla_len(nla), nla_len(nla)); +} + +static int xfrm_xlate32(struct nlmsghdr *dst, const struct nlmsghdr *src, + struct nlattr *attrs[XFRMA_MAX+1], + size_t size, u8 type) +{ + size_t pos; + int i; + + memcpy(dst, src, NLMSG_HDRLEN); + dst->nlmsg_len = NLMSG_HDRLEN + xfrm_msg_min[type]; + memset(nlmsg_data(dst), 0, xfrm_msg_min[type]); + + switch (src->nlmsg_type) { + /* Compat message has the same layout as native */ + case XFRM_MSG_DELSA: + case XFRM_MSG_GETSA: + case XFRM_MSG_DELPOLICY: + case XFRM_MSG_GETPOLICY: + case XFRM_MSG_FLUSHSA: + case XFRM_MSG_FLUSHPOLICY: + case XFRM_MSG_NEWAE: + case XFRM_MSG_GETAE: + case XFRM_MSG_REPORT: + case XFRM_MSG_MIGRATE: + case XFRM_MSG_NEWSADINFO: + case XFRM_MSG_GETSADINFO: + case XFRM_MSG_NEWSPDINFO: + case XFRM_MSG_GETSPDINFO: + case XFRM_MSG_MAPPING: + memcpy(nlmsg_data(dst), nlmsg_data(src), compat_msg_min[type]); + break; + /* 4 byte alignment for trailing u64 on native, but not on compat */ + case XFRM_MSG_NEWSA: + case XFRM_MSG_NEWPOLICY: + case XFRM_MSG_UPDSA: + case XFRM_MSG_UPDPOLICY: + memcpy(nlmsg_data(dst), nlmsg_data(src), compat_msg_min[type]); + break; + case XFRM_MSG_EXPIRE: { + const struct compat_xfrm_user_expire *src_ue = nlmsg_data(src); + struct xfrm_user_expire *dst_ue = nlmsg_data(dst); + + /* compat_xfrm_user_expire has 4-byte smaller state */ + memcpy(dst_ue, src_ue, sizeof(src_ue->state)); + dst_ue->hard = src_ue->hard; + break; + } + case XFRM_MSG_ACQUIRE: { + const struct compat_xfrm_user_acquire *src_ua = nlmsg_data(src); + struct xfrm_user_acquire *dst_ua = nlmsg_data(dst); + + memcpy(dst_ua, src_ua, offsetof(struct compat_xfrm_user_acquire, aalgos)); + dst_ua->aalgos = src_ua->aalgos; + dst_ua->ealgos = src_ua->ealgos; + dst_ua->calgos = src_ua->calgos; + dst_ua->seq = src_ua->seq; + break; + } + case XFRM_MSG_POLEXPIRE: { + const struct compat_xfrm_user_polexpire *src_upe = nlmsg_data(src); + struct xfrm_user_polexpire *dst_upe = nlmsg_data(dst); + + /* compat_xfrm_user_polexpire has 4-byte smaller state */ + memcpy(dst_upe, src_upe, sizeof(src_upe->pol)); + dst_upe->hard = src_upe->hard; + break; + } + case XFRM_MSG_ALLOCSPI: { + const struct compat_xfrm_userspi_info *src_usi = nlmsg_data(src); + struct xfrm_userspi_info *dst_usi = nlmsg_data(dst); + + /* compat_xfrm_user_polexpire has 4-byte smaller state */ + memcpy(dst_usi, src_usi, sizeof(src_usi->info)); + dst_usi->min = src_usi->min; + dst_usi->max = src_usi->max; + break; + } + default: + return -EOPNOTSUPP; + } + pos = dst->nlmsg_len; + + for (i = 1; i < XFRMA_MAX + 1; i++) { + int err; + + if (i == XFRMA_PAD || i == XFRMA_OFFLOAD_DEV) + continue; + + if (!attrs[i]) + continue; + + err = xfrm_xlate32_attr(dst, attrs[i], &pos, size); + if (err) + return err; + } + + return 0; +} + +static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, + int maxtype, const struct nla_policy *policy) +{ + /* netlink_rcv_skb() checks if a message has full (struct nlmsghdr) */ + u16 type = h32->nlmsg_type - XFRM_MSG_BASE; + struct nlattr *attrs[XFRMA_MAX+1]; + struct nlmsghdr *h64; + size_t len; + int err; + + BUILD_BUG_ON(ARRAY_SIZE(xfrm_msg_min) != ARRAY_SIZE(compat_msg_min)); + + if (type >= ARRAY_SIZE(xfrm_msg_min)) + return ERR_PTR(-EINVAL); + + /* Don't call parse: the message might have only nlmsg header */ + if ((h32->nlmsg_type == XFRM_MSG_GETSA || + h32->nlmsg_type == XFRM_MSG_GETPOLICY) && + (h32->nlmsg_flags & NLM_F_DUMP)) + return NULL; + + err = nlmsg_parse(h32, compat_msg_min[type], attrs, + maxtype ? : XFRMA_MAX, policy ? : compat_policy); + if (err < 0) + return ERR_PTR(err); + + len = xfrm_user_rcv_calculate_len64(h32, attrs); + /* The message doesn't need translation */ + if (len == nlmsg_len(h32)) + return NULL; + + len += NLMSG_HDRLEN; + h64 = kvmalloc(len, GFP_KERNEL | __GFP_ZERO); + if (!h64) + return ERR_PTR(-ENOMEM); + + err = xfrm_xlate32(h64, h32, attrs, len, type); + if (err < 0) { + kvfree(h64); + return ERR_PTR(err); + } + + return h64; +} + +static int xfrm_user_policy_compat(u8 **pdata32, int optlen) +{ + struct compat_xfrm_userpolicy_info *p = (void *)*pdata32; + u8 *src_templates, *dst_templates; + u8 *data64; + + if (optlen < sizeof(*p)) + return -EINVAL; + + data64 = kmalloc_track_caller(optlen + 4, GFP_USER | __GFP_NOWARN); + if (!data64) + return -ENOMEM; + + memcpy(data64, *pdata32, sizeof(*p)); + memset(data64 + sizeof(*p), 0, 4); + + src_templates = *pdata32 + sizeof(*p); + dst_templates = data64 + sizeof(*p) + 4; + memcpy(dst_templates, src_templates, optlen - sizeof(*p)); + + kfree(*pdata32); + *pdata32 = data64; + return 0; +} + +static struct xfrm_translator xfrm_translator = { + .owner = THIS_MODULE, + .alloc_compat = xfrm_alloc_compat, + .rcv_msg_compat = xfrm_user_rcv_msg_compat, + .xlate_user_policy_sockptr = xfrm_user_policy_compat, +}; + +static int __init xfrm_compat_init(void) +{ + return xfrm_register_translator(&xfrm_translator); +} + +static void __exit xfrm_compat_exit(void) +{ + xfrm_unregister_translator(&xfrm_translator); +} + +module_init(xfrm_compat_init); +module_exit(xfrm_compat_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dmitry Safonov"); +MODULE_DESCRIPTION("XFRM 32-bit compatibility layer"); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 1c4ad477ce93..d613bf77cc0f 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -207,15 +207,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) family = XFRM_SPI_SKB_CB(skb)->family; /* if tunnel is present override skb->mark value with tunnel i_key */ - if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) { - switch (family) { - case AF_INET: + switch (family) { + case AF_INET: + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); - break; - case AF_INET6: + break; + case AF_INET6: + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); - break; - } + break; } /* Allocate new secpath or COW existing one. */ @@ -302,7 +302,7 @@ resume: dev_put(skb->dev); spin_lock(&x->lock); - if (nexthdr <= 0) { + if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, x->type->proto); @@ -315,7 +315,7 @@ resume: /* only the first xfrm gets the encap type */ encap_type = 0; - if (async && x->repl->recheck(x, skb, seq)) { + if (x->repl->recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 16e828f2540f..6986da7e4ce8 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -240,7 +240,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu) if (skb->protocol == htons(ETH_P_IP)) proto = AF_INET; - else if (skb->protocol == htons(ETH_P_IPV6)) + else if (skb->protocol == htons(ETH_P_IPV6) && + skb->sk->sk_family == AF_INET6) proto = AF_INET6; else return; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e098ca928538..2c34636218c0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -330,7 +330,9 @@ EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_kill(struct xfrm_policy *policy) { + write_lock_bh(&policy->lock); policy->walk.dead = 1; + write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); @@ -738,12 +740,7 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, static bool xfrm_policy_mark_match(struct xfrm_policy *policy, struct xfrm_policy *pol) { - u32 mark = policy->mark.v & policy->mark.m; - - if (policy->mark.v == pol->mark.v && policy->mark.m == pol->mark.m) - return true; - - if ((mark & pol->mark.m) == pol->mark.v && + if (policy->mark.v == pol->mark.v && policy->priority == pol->priority) return true; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 7d50a371574c..27ececc3fa37 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -332,6 +332,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { tasklet_hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); + kfree(x->aead); kfree(x->aalg); kfree(x->ealg); kfree(x->calg); @@ -741,8 +742,10 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, */ if (x->km.state == XFRM_STATE_VALID) { if ((x->sel.family && - !xfrm_selector_match(&x->sel, fl, x->sel.family)) || - !security_xfrm_state_pol_flow_match(x, pol, fl)) + (x->sel.family != family || + !xfrm_selector_match(&x->sel, fl, family))) || + !security_xfrm_state_pol_flow_match(x, pol, + &fl->u.__fl_common)) return; if (!*best || @@ -754,8 +757,11 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, *acq_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { - if (xfrm_selector_match(&x->sel, fl, x->sel.family) && - security_xfrm_state_pol_flow_match(x, pol, fl)) + if ((!x->sel.family || + (x->sel.family == family && + xfrm_selector_match(&x->sel, fl, family))) && + security_xfrm_state_pol_flow_match(x, pol, + &fl->u.__fl_common)) *error = -ESRCH; } } @@ -790,7 +796,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) - xfrm_state_look_at(pol, x, fl, encap_family, + xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } if (best || acquire_in_progress) @@ -806,7 +812,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) - xfrm_state_look_at(pol, x, fl, encap_family, + xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } @@ -1206,7 +1212,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig) x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; x->replay_maxage = orig->replay_maxage; - x->curlft.add_time = orig->curlft.add_time; + memcpy(&x->curlft, &orig->curlft, sizeof(x->curlft)); x->km.state = orig->km.state; x->km.seq = orig->km.seq; x->replay = orig->replay; @@ -1559,6 +1565,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) int err = -ENOENT; __be32 minspi = htonl(low); __be32 maxspi = htonl(high); + __be32 newspi = 0; u32 mark = x->mark.v & x->mark.m; spin_lock_bh(&x->lock); @@ -1577,21 +1584,22 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) xfrm_state_put(x0); goto unlock; } - x->id.spi = minspi; + newspi = minspi; } else { u32 spi = 0; for (h = 0; h < high-low+1; h++) { spi = low + prandom_u32()%(high-low+1); x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); if (x0 == NULL) { - x->id.spi = htonl(spi); + newspi = htonl(spi); break; } xfrm_state_put(x0); } } - if (x->id.spi) { + if (newspi) { spin_lock_bh(&net->xfrm.xfrm_state_lock); + x->id.spi = newspi; h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); spin_unlock_bh(&net->xfrm.xfrm_state_lock); @@ -1850,6 +1858,66 @@ bool km_is_alive(const struct km_event *c) } EXPORT_SYMBOL(km_is_alive); +#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) +static DEFINE_SPINLOCK(xfrm_translator_lock); +static struct xfrm_translator __rcu *xfrm_translator; + +struct xfrm_translator *xfrm_get_translator(void) +{ + struct xfrm_translator *xtr; + + rcu_read_lock(); + xtr = rcu_dereference(xfrm_translator); + if (unlikely(!xtr)) + goto out; + if (!try_module_get(xtr->owner)) + xtr = NULL; +out: + rcu_read_unlock(); + return xtr; +} +EXPORT_SYMBOL_GPL(xfrm_get_translator); + +void xfrm_put_translator(struct xfrm_translator *xtr) +{ + module_put(xtr->owner); +} +EXPORT_SYMBOL_GPL(xfrm_put_translator); + +int xfrm_register_translator(struct xfrm_translator *xtr) +{ + int err = 0; + + spin_lock_bh(&xfrm_translator_lock); + if (unlikely(xfrm_translator != NULL)) + err = -EEXIST; + else + rcu_assign_pointer(xfrm_translator, xtr); + spin_unlock_bh(&xfrm_translator_lock); + + return err; +} +EXPORT_SYMBOL_GPL(xfrm_register_translator); + +int xfrm_unregister_translator(struct xfrm_translator *xtr) +{ + int err = 0; + + spin_lock_bh(&xfrm_translator_lock); + if (likely(xfrm_translator != NULL)) { + if (rcu_access_pointer(xfrm_translator) != xtr) + err = -EINVAL; + else + RCU_INIT_POINTER(xfrm_translator, NULL); + } + spin_unlock_bh(&xfrm_translator_lock); + synchronize_rcu(); + + return err; +} +EXPORT_SYMBOL_GPL(xfrm_unregister_translator); +#endif + int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen) { int err; @@ -1875,6 +1943,23 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen if (copy_from_user(data, optval, optlen)) goto out; + /* Use the 64-bit / untranslated format on Android, even for compat */ + if (!IS_ENABLED(CONFIG_ANDROID) || IS_ENABLED(CONFIG_XFRM_USER_COMPAT)) { + if (is_compat_task()) { + struct xfrm_translator *xtr = xfrm_get_translator(); + + if (!xtr) + return -EOPNOTSUPP; + + err = xtr->xlate_user_policy_sockptr(&data, optlen); + xfrm_put_translator(xtr); + if (err) { + kfree(data); + return err; + } + } + } + err = -EINVAL; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 69126af39c79..3fd866867ce9 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -109,7 +109,8 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs) return 0; uctx = nla_data(rt); - if (uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) + if (uctx->len > nla_len(rt) || + uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) return -EINVAL; return 0; @@ -565,6 +566,20 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, copy_from_user_state(x, p); + if (attrs[XFRMA_ENCAP]) { + x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), + sizeof(*x->encap), GFP_KERNEL); + if (x->encap == NULL) + goto error; + } + + if (attrs[XFRMA_COADDR]) { + x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), + sizeof(*x->coaddr), GFP_KERNEL); + if (x->coaddr == NULL) + goto error; + } + if (attrs[XFRMA_SA_EXTRA_FLAGS]) x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); @@ -585,23 +600,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, attrs[XFRMA_ALG_COMP]))) goto error; - if (attrs[XFRMA_ENCAP]) { - x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), - sizeof(*x->encap), GFP_KERNEL); - if (x->encap == NULL) - goto error; - } - if (attrs[XFRMA_TFCPAD]) x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]); - if (attrs[XFRMA_COADDR]) { - x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), - sizeof(*x->coaddr), GFP_KERNEL); - if (x->coaddr == NULL) - goto error; - } - xfrm_mark_get(attrs, &x->mark); if (attrs[XFRMA_OUTPUT_MARK]) @@ -611,9 +612,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (err) goto error; - if (attrs[XFRMA_SEC_CTX] && - security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX]))) - goto error; + if (attrs[XFRMA_SEC_CTX]) { + err = security_xfrm_state_alloc(x, + nla_data(attrs[XFRMA_SEC_CTX])); + if (err) + goto error; + } if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) @@ -904,6 +908,7 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) struct xfrm_dump_info *sp = ptr; struct sk_buff *in_skb = sp->in_skb; struct sk_buff *skb = sp->out_skb; + struct xfrm_translator *xtr; struct xfrm_usersa_info *p; struct nlmsghdr *nlh; int err; @@ -921,6 +926,18 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) return err; } nlmsg_end(skb, nlh); + + xtr = xfrm_get_translator(); + if (xtr) { + err = xtr->alloc_compat(skb, nlh); + + xfrm_put_translator(xtr); + if (err) { + nlmsg_cancel(skb, nlh); + return err; + } + } + return 0; } @@ -930,11 +947,11 @@ static int xfrm_dump_sa_done(struct netlink_callback *cb) struct sock *sk = cb->skb->sk; struct net *net = sock_net(sk); - xfrm_state_walk_done(walk, net); + if (cb->args[0]) + xfrm_state_walk_done(walk, net); return 0; } -static const struct nla_policy xfrma_policy[XFRMA_MAX+1]; static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -955,8 +972,6 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) u8 proto = 0; int err; - cb->args[0] = 1; - err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy); if (err < 0) @@ -973,6 +988,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) proto = nla_get_u8(attrs[XFRMA_PROTO]); xfrm_state_walk_init(walk, proto, filter); + cb->args[0] = 1; } (void) xfrm_state_walk(net, walk, dump_one_state, &info); @@ -1012,12 +1028,24 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, u32 pid, unsigned int group) { struct sock *nlsk = rcu_dereference(net->xfrm.nlsk); + struct xfrm_translator *xtr; if (!nlsk) { kfree_skb(skb); return -EPIPE; } + xtr = xfrm_get_translator(); + if (xtr) { + int err = xtr->alloc_compat(skb, nlmsg_hdr(skb)); + + xfrm_put_translator(xtr); + if (err) { + kfree_skb(skb); + return err; + } + } + return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); } @@ -1235,6 +1263,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct xfrm_state *x; struct xfrm_userspi_info *p; + struct xfrm_translator *xtr; struct sk_buff *resp_skb; xfrm_address_t *daddr; int family; @@ -1280,6 +1309,17 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } + xtr = xfrm_get_translator(); + if (xtr) { + err = xtr->alloc_compat(skb, nlmsg_hdr(skb)); + + xfrm_put_translator(xtr); + if (err) { + kfree_skb(resp_skb); + goto out; + } + } + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); out: @@ -1683,6 +1723,7 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr struct xfrm_userpolicy_info *p; struct sk_buff *in_skb = sp->in_skb; struct sk_buff *skb = sp->out_skb; + struct xfrm_translator *xtr; struct nlmsghdr *nlh; int err; @@ -1705,6 +1746,18 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr return err; } nlmsg_end(skb, nlh); + + xtr = xfrm_get_translator(); + if (xtr) { + err = xtr->alloc_compat(skb, nlh); + + xfrm_put_translator(xtr); + if (err) { + nlmsg_cancel(skb, nlh); + return err; + } + } + return 0; } @@ -2186,6 +2239,9 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, err = verify_newpolicy_info(&ua->policy); if (err) + goto free_state; + err = verify_sec_ctx_len(attrs); + if (err) goto bad_policy; /* build an XP */ @@ -2426,7 +2482,7 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, #define XMSGSIZE(type) sizeof(struct type) -static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { +const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info), [XFRM_MSG_DELSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), [XFRM_MSG_GETSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), @@ -2449,10 +2505,11 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), }; +EXPORT_SYMBOL_GPL(xfrm_msg_min); #undef XMSGSIZE -static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { +const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA] = { .len = sizeof(struct xfrm_usersa_info)}, [XFRMA_POLICY] = { .len = sizeof(struct xfrm_userpolicy_info)}, [XFRMA_LASTUSED] = { .type = NLA_U64}, @@ -2481,6 +2538,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 }, }; +EXPORT_SYMBOL_GPL(xfrma_policy); static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { [XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, @@ -2529,6 +2587,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct net *net = sock_net(skb->sk); struct nlattr *attrs[XFRMA_MAX+1]; const struct xfrm_link *link; + struct nlmsghdr *nlh64 = NULL; int type, err; type = nlh->nlmsg_type; @@ -2542,32 +2601,58 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; + /* Use the 64-bit / untranslated format on Android, even for compat */ + if (!IS_ENABLED(CONFIG_ANDROID) || IS_ENABLED(CONFIG_XFRM_USER_COMPAT)) { + if (is_compat_task()) { + struct xfrm_translator *xtr = xfrm_get_translator(); + + if (!xtr) + return -EOPNOTSUPP; + + nlh64 = xtr->rcv_msg_compat(nlh, link->nla_max, + link->nla_pol); + xfrm_put_translator(xtr); + if (IS_ERR(nlh64)) + return PTR_ERR(nlh64); + if (nlh64) + nlh = nlh64; + } + } + if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && (nlh->nlmsg_flags & NLM_F_DUMP)) { - if (link->dump == NULL) - return -EINVAL; + struct netlink_dump_control c = { + .start = link->start, + .dump = link->dump, + .done = link->done, + }; - { - struct netlink_dump_control c = { - .start = link->start, - .dump = link->dump, - .done = link->done, - }; - return netlink_dump_start(net->xfrm.nlsk, skb, nlh, &c); + if (link->dump == NULL) { + err = -EINVAL; + goto err; } + + err = netlink_dump_start(net->xfrm.nlsk, skb, nlh, &c); + goto err; } err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, link->nla_max ? : XFRMA_MAX, link->nla_pol ? : xfrma_policy); if (err < 0) - return err; + goto err; - if (link->doit == NULL) - return -EINVAL; + if (link->doit == NULL) { + err = -EINVAL; + goto err; + } - return link->doit(skb, nlh, attrs); + err = link->doit(skb, nlh, attrs); + +err: + kvfree(nlh64); + return err; } static void xfrm_netlink_rcv(struct sk_buff *skb) |