From 7d3d43dab4e978d8d9ad1acf8af15c9b1c4b0f0f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Apr 2012 15:33:35 +0000 Subject: net: In unregister_netdevice_notifier unregister the netdevices. We already synthesize events in register_netdevice_notifier and synthesizing events in unregister_netdevice_notifier allows to us remove the need for special case cleanup code. This change should be safe as it adds no new cases for existing callers of unregiser_netdevice_notifier to handle. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c25d453b2803..9bb8f87c4cda 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1409,14 +1409,34 @@ EXPORT_SYMBOL(register_netdevice_notifier); * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) { + struct net_device *dev; + struct net *net; int err; rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); + if (err) + goto unlock; + + for_each_net(net) { + for_each_netdev(net, dev) { + if (dev->flags & IFF_UP) { + nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); + nb->notifier_call(nb, NETDEV_DOWN, dev); + } + nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); + } + } +unlock: rtnl_unlock(); return err; } -- cgit v1.2.3 From 03478756b1b9298686ca9c8793e484ae39eb4649 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Apr 2012 15:35:39 +0000 Subject: phonet: Sort out initiailziation and cleanup code. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recently an oops was reported in phonet if there was a failure during network namespace creation. [ 163.733755] ------------[ cut here ]------------ [ 163.734501] kernel BUG at include/net/netns/generic.h:45! [ 163.734501] invalid opcode: 0000 [#1] PREEMPT SMP [ 163.734501] CPU 2 [ 163.734501] Pid: 19145, comm: trinity Tainted: G W 3.4.0-rc1-next-20120405-sasha-dirty #57 [ 163.734501] RIP: 0010:[] [] phonet_pernet+0x182/0x1a0 [ 163.734501] RSP: 0018:ffff8800674d5ca8 EFLAGS: 00010246 [ 163.734501] RAX: 000000003fffffff RBX: 0000000000000000 RCX: ffff8800678c88d8 [ 163.734501] RDX: 00000000003f4000 RSI: ffff8800678c8910 RDI: 0000000000000282 [ 163.734501] RBP: ffff8800674d5cc8 R08: 0000000000000000 R09: 0000000000000000 [ 163.734501] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880068bec920 [ 163.734501] R13: ffffffff836b90c0 R14: 0000000000000000 R15: 0000000000000000 [ 163.734501] FS: 00007f055e8de700(0000) GS:ffff88007d000000(0000) knlGS:0000000000000000 [ 163.734501] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 163.734501] CR2: 00007f055e6bb518 CR3: 0000000070c16000 CR4: 00000000000406e0 [ 163.734501] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 163.734501] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 163.734501] Process trinity (pid: 19145, threadinfo ffff8800674d4000, task ffff8800678c8000) [ 163.734501] Stack: [ 163.734501] ffffffff824d5f00 ffffffff810e2ec1 ffff880067ae0000 00000000ffffffd4 [ 163.734501] ffff8800674d5cf8 ffffffff824d667a ffff880067ae0000 00000000ffffffd4 [ 163.734501] ffffffff836b90c0 0000000000000000 ffff8800674d5d18 ffffffff824d707d [ 163.734501] Call Trace: [ 163.734501] [] ? phonet_pernet+0x20/0x1a0 [ 163.734501] [] ? get_parent_ip+0x11/0x50 [ 163.734501] [] phonet_device_destroy+0x1a/0x100 [ 163.734501] [] phonet_device_notify+0x3d/0x50 [ 163.734501] [] notifier_call_chain+0xee/0x130 [ 163.734501] [] raw_notifier_call_chain+0x11/0x20 [ 163.734501] [] call_netdevice_notifiers+0x52/0x60 [ 163.734501] [] rollback_registered_many+0x185/0x270 [ 163.734501] [] unregister_netdevice_many+0x14/0x60 [ 163.734501] [] ipip_exit_net+0x1b3/0x1d0 [ 163.734501] [] ? ipip_rcv+0x420/0x420 [ 163.734501] [] ops_exit_list+0x35/0x70 [ 163.734501] [] setup_net+0xab/0xe0 [ 163.734501] [] copy_net_ns+0x76/0x100 [ 163.734501] [] create_new_namespaces+0xfb/0x190 [ 163.734501] [] unshare_nsproxy_namespaces+0x61/0x80 [ 163.734501] [] sys_unshare+0xff/0x290 [ 163.734501] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 163.734501] [] system_call_fastpath+0x16/0x1b [ 163.734501] Code: e0 c3 fe 66 0f 1f 44 00 00 48 c7 c2 40 60 4d 82 be 01 00 00 00 48 c7 c7 80 d1 23 83 e8 48 2a c4 fe e8 73 06 c8 fe 48 85 db 75 0e <0f> 0b 0f 1f 40 00 eb fe 66 0f 1f 44 00 00 48 83 c4 10 48 89 d8 [ 163.734501] RIP [] phonet_pernet+0x182/0x1a0 [ 163.734501] RSP [ 163.861289] ---[ end trace fb5615826c548066 ]--- After investigation it turns out there were two issues. 1) Phonet was not implementing network devices but was using register_pernet_device instead of register_pernet_subsys. This was allowing there to be cases when phonenet was not initialized and the phonet net_generic was not set for a network namespace when network device events were being reported on the netdevice_notifier for a network namespace leading to the oops above. 2) phonet_exit_net was implementing a confusing and special case of handling all network devices from going away that it was hard to see was correct, and would only occur when the phonet module was removed. Now that unregister_netdevice_notifier has been modified to synthesize unregistration events for the network devices that are extant when called this confusing special case in phonet_exit_net is no longer needed. Signed-off-by: Eric W. Biederman Acked-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/pn_dev.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 9b9a85ecc4c7..bf5cf69c820a 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -331,23 +331,6 @@ static int __net_init phonet_init_net(struct net *net) static void __net_exit phonet_exit_net(struct net *net) { - struct phonet_net *pnn = phonet_pernet(net); - struct net_device *dev; - unsigned i; - - rtnl_lock(); - for_each_netdev(net, dev) - phonet_device_destroy(dev); - - for (i = 0; i < 64; i++) { - dev = pnn->routes.table[i]; - if (dev) { - rtm_phonet_notify(RTM_DELROUTE, dev, i); - dev_put(dev); - } - } - rtnl_unlock(); - proc_net_remove(net, "phonet"); } @@ -361,7 +344,7 @@ static struct pernet_operations phonet_net_ops = { /* Initialize Phonet devices list */ int __init phonet_device_init(void) { - int err = register_pernet_device(&phonet_net_ops); + int err = register_pernet_subsys(&phonet_net_ops); if (err) return err; @@ -377,7 +360,7 @@ void phonet_device_exit(void) { rtnl_unregister_all(PF_PHONET); unregister_netdevice_notifier(&phonet_device_notifier); - unregister_pernet_device(&phonet_net_ops); + unregister_pernet_subsys(&phonet_net_ops); proc_net_remove(&init_net, "pnresource"); } -- cgit v1.2.3 From 89eb06f11c314c2ab4ec59039715dc021933a7a0 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 8 Apr 2012 22:41:10 +0000 Subject: net/key/af_key.c: add missing kfree_skb At the point of this error-handling code, alloc_skb has succeded, so free the resulting skb by jumping to the err label. Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index 11dbb2255ccb..7e5d927b576f 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3480,7 +3480,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, /* Addresses to be used by KM for negotiation, if ext is available */ if (k != NULL && (set_sadb_kmaddress(skb, k) < 0)) - return -EINVAL; + goto err; /* selector src */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel); -- cgit v1.2.3 From d1f224ae186b834af647661ffaf403a817c050ce Mon Sep 17 00:00:00 2001 From: James Chapman Date: Tue, 10 Apr 2012 00:10:42 +0000 Subject: l2tp: fix refcount leak in l2tp_ip sockets The l2tp_ip socket close handler does not update the module refcount correctly which prevents module unload after the first bind() call on an L2TPv3 IP encapulation socket. Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 55670ec3cd0f..b56be1452f8e 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -232,7 +232,7 @@ static void l2tp_ip_close(struct sock *sk, long timeout) { write_lock_bh(&l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); - hlist_del_init(&sk->sk_node); + sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); sk_common_release(sk); } -- cgit v1.2.3 From c9be48dc8bb22f1f6e6ff1560b2b28e925a0b815 Mon Sep 17 00:00:00 2001 From: James Chapman Date: Tue, 10 Apr 2012 00:10:43 +0000 Subject: l2tp: don't overwrite source address in l2tp_ip_bind() Applications using L2TP/IP sockets want to be able to bind() an L2TP/IP socket to set the local tunnel id while leaving the auto-assigned source address alone. So if no source address is supplied, don't overwrite the address already stored in the socket. Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index b56be1452f8e..585d93ecee2d 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -271,7 +271,8 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; - inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr; + if (addr->l2tp_addr.s_addr) + inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); -- cgit v1.2.3 From 5c699fb7d88d360023f3b3f5291cbf5b59883a1b Mon Sep 17 00:00:00 2001 From: Tomasz Gregorek Date: Thu, 12 Apr 2012 08:18:07 +0000 Subject: caif: Fix memory leakage in the chnl_net.c. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added kfree_skb() calls in the chnk_net.c file on the error paths. Signed-off-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 20618dd3088b..d09340e1523f 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -103,6 +103,7 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) skb->protocol = htons(ETH_P_IPV6); break; default: + kfree_skb(skb); priv->netdev->stats.rx_errors++; return -EINVAL; } @@ -220,14 +221,16 @@ static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->len > priv->netdev->mtu) { pr_warn("Size of skb exceeded MTU\n"); + kfree_skb(skb); dev->stats.tx_errors++; - return -ENOSPC; + return NETDEV_TX_OK; } if (!priv->flowenabled) { pr_debug("dropping packets flow off\n"); + kfree_skb(skb); dev->stats.tx_dropped++; - return NETDEV_TX_BUSY; + return NETDEV_TX_OK; } if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) @@ -242,7 +245,7 @@ static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) result = priv->chnl.dn->transmit(priv->chnl.dn, pkt); if (result) { dev->stats.tx_dropped++; - return result; + return NETDEV_TX_OK; } /* Update statistics. */ -- cgit v1.2.3 From 1716a96101c49186bb0b8491922fd3e69030235f Mon Sep 17 00:00:00 2001 From: Gao feng Date: Fri, 6 Apr 2012 00:13:10 +0000 Subject: ipv6: fix problem with expired dst cache If the ipv6 dst cache which copy from the dst generated by ICMPV6 RA packet. this dst cache will not check expire because it has no RTF_EXPIRES flag. So this dst cache will always be used until the dst gc run. Change the struct dst_entry,add a union contains new pointer from and expires. When rt6_info.rt6i_flags has no RTF_EXPIRES flag,the dst.expires has no use. we can use this field to point to where the dst cache copy from. The dst.from is only used in IPV6. rt6_check_expired check if rt6_info.dst.from is expired. ip6_rt_copy only set dst.from when the ort has flag RTF_ADDRCONF and RTF_DEFAULT.then hold the ort. ip6_dst_destroy release the ort. Add some functions to operate the RTF_EXPIRES flag and expires(from) together. and change the code to use these new adding functions. Changes from v5: modify ip6_route_add and ndisc_router_discovery to use new adding functions. Only set dst.from when the ort has flag RTF_ADDRCONF and RTF_DEFAULT.then hold the ort. Signed-off-by: Gao feng Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 9 +++---- net/ipv6/ip6_fib.c | 9 +++---- net/ipv6/ndisc.c | 3 +-- net/ipv6/route.c | 71 +++++++++++++++++++++++++++++++++-------------------- 4 files changed, 52 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6a3bb6077e19..7d5cb975cc6f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -803,8 +803,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) ip6_del_rt(rt); rt = NULL; } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { - rt->dst.expires = expires; - rt->rt6i_flags |= RTF_EXPIRES; + rt6_set_expires(rt, expires); } } dst_release(&rt->dst); @@ -1887,11 +1886,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ - rt->dst.expires = jiffies + rt_expires; - rt->rt6i_flags |= RTF_EXPIRES; + rt6_set_expires(rt, jiffies + rt_expires); } else { - rt->rt6i_flags &= ~RTF_EXPIRES; - rt->dst.expires = 0; + rt6_clean_expires(rt); } } else if (valid_lft) { clock_t expires = 0; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5b27fbcae346..93717435013e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -673,11 +673,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, &rt->rt6i_gateway)) { if (!(iter->rt6i_flags & RTF_EXPIRES)) return -EEXIST; - iter->dst.expires = rt->dst.expires; - if (!(rt->rt6i_flags & RTF_EXPIRES)) { - iter->rt6i_flags &= ~RTF_EXPIRES; - iter->dst.expires = 0; - } + if (!(rt->rt6i_flags & RTF_EXPIRES)) + rt6_clean_expires(iter); + else + rt6_set_expires(iter, rt->dst.expires); return -EEXIST; } } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3dcdb81ec3e8..176b469322ac 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1264,8 +1264,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } if (rt) - rt->dst.expires = jiffies + (HZ * lifetime); - + rt6_set_expires(rt, jiffies + (HZ * lifetime)); if (ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; if (rt) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3992e26a6039..bc4888d902b2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -62,7 +62,7 @@ #include #endif -static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort, +static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, const struct in6_addr *dest); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); @@ -285,6 +285,10 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } + + if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from) + dst_release(dst->from); + if (peer) { rt->rt6i_peer = NULL; inet_putpeer(peer); @@ -329,8 +333,17 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static __inline__ int rt6_check_expired(const struct rt6_info *rt) { - return (rt->rt6i_flags & RTF_EXPIRES) && - time_after(jiffies, rt->dst.expires); + struct rt6_info *ort = NULL; + + if (rt->rt6i_flags & RTF_EXPIRES) { + if (time_after(jiffies, rt->dst.expires)) + return 1; + } else if (rt->dst.from) { + ort = (struct rt6_info *) rt->dst.from; + return (ort->rt6i_flags & RTF_EXPIRES) && + time_after(jiffies, ort->dst.expires); + } + return 0; } static inline int rt6_need_strict(const struct in6_addr *daddr) @@ -620,12 +633,11 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { - if (!addrconf_finite_timeout(lifetime)) { - rt->rt6i_flags &= ~RTF_EXPIRES; - } else { - rt->dst.expires = jiffies + HZ * lifetime; - rt->rt6i_flags |= RTF_EXPIRES; - } + if (!addrconf_finite_timeout(lifetime)) + rt6_clean_expires(rt); + else + rt6_set_expires(rt, jiffies + HZ * lifetime); + dst_release(&rt->dst); } return 0; @@ -730,7 +742,7 @@ int ip6_ins_rt(struct rt6_info *rt) return __ip6_ins_rt(rt, &info); } -static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort, +static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -954,10 +966,10 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); - rt->dst.expires = 0; rt->rt6i_gateway = ort->rt6i_gateway; - rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + rt->rt6i_flags = ort->rt6i_flags; + rt6_clean_expires(rt); rt->rt6i_metric = 0; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); @@ -1019,10 +1031,9 @@ static void ip6_link_failure(struct sk_buff *skb) rt = (struct rt6_info *) skb_dst(skb); if (rt) { - if (rt->rt6i_flags & RTF_CACHE) { - dst_set_expires(&rt->dst, 0); - rt->rt6i_flags |= RTF_EXPIRES; - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) + if (rt->rt6i_flags & RTF_CACHE) + rt6_update_expires(rt, 0); + else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) rt->rt6i_node->fn_sernum = -1; } } @@ -1289,9 +1300,12 @@ int ip6_route_add(struct fib6_config *cfg) } rt->dst.obsolete = -1; - rt->dst.expires = (cfg->fc_flags & RTF_EXPIRES) ? - jiffies + clock_t_to_jiffies(cfg->fc_expires) : - 0; + + if (cfg->fc_flags & RTF_EXPIRES) + rt6_set_expires(rt, jiffies + + clock_t_to_jiffies(cfg->fc_expires)); + else + rt6_clean_expires(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; @@ -1736,8 +1750,8 @@ again: features |= RTAX_FEATURE_ALLFRAG; dst_metric_set(&rt->dst, RTAX_FEATURES, features); } - dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires); - rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; + rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); + rt->rt6i_flags |= RTF_MODIFIED; goto out; } @@ -1765,9 +1779,8 @@ again: * which is 10 mins. After 10 mins the decreased pmtu is expired * and detecting PMTU increase will be automatically happened. */ - dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires); - nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; - + rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires); + nrt->rt6i_flags |= RTF_DYNAMIC; ip6_ins_rt(nrt); } out: @@ -1799,7 +1812,7 @@ void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *sad * Misc support functions */ -static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort, +static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, const struct in6_addr *dest) { struct net *net = dev_net(ort->dst.dev); @@ -1819,10 +1832,14 @@ static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort, if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->dst.lastuse = jiffies; - rt->dst.expires = 0; rt->rt6i_gateway = ort->rt6i_gateway; - rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + rt->rt6i_flags = ort->rt6i_flags; + if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) == + (RTF_DEFAULT | RTF_ADDRCONF)) + rt6_set_from(rt, ort); + else + rt6_clean_expires(rt); rt->rt6i_metric = 0; #ifdef CONFIG_IPV6_SUBTREES -- cgit v1.2.3 From e55a4046dab28c440c96890bdddcf02dc8981f2d Mon Sep 17 00:00:00 2001 From: Lukasz Kucharczyk Date: Wed, 11 Apr 2012 14:55:10 +0200 Subject: cfg80211: fix interface combinations check. Signed-off-by: Lukasz Kucharczyk Cc: stable@vger.kernel.org Signed-off-by: John W. Linville --- net/wireless/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 1b7a08df933c..957f25621617 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -989,7 +989,7 @@ int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, if (rdev->wiphy.software_iftypes & BIT(iftype)) continue; for (j = 0; j < c->n_limits; j++) { - if (!(limits[j].types & iftype)) + if (!(limits[j].types & BIT(iftype))) continue; if (limits[j].max < num[iftype]) goto cont; -- cgit v1.2.3 From 745c0ce35f904aeff8e1ea325c259a14a00ff1b7 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Fri, 13 Apr 2012 17:43:22 +0530 Subject: Bluetooth: hci_persistent_key should return bool This patch changes the return type of function hci_persistent_key from int to bool because it makes more sense to return information whether a key is persistent or not as a bool. Signed-off-by: Vishal Agarwal Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 21 +++++++++++---------- net/bluetooth/mgmt.c | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2054c1321c87..c2251e4c3b72 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1216,40 +1216,40 @@ struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) return NULL; } -static int hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, +static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, u8 key_type, u8 old_key_type) { /* Legacy key */ if (key_type < 0x03) - return 1; + return true; /* Debug keys are insecure so don't store them persistently */ if (key_type == HCI_LK_DEBUG_COMBINATION) - return 0; + return false; /* Changed combination key and there's no previous one */ if (key_type == HCI_LK_CHANGED_COMBINATION && old_key_type == 0xff) - return 0; + return false; /* Security mode 3 case */ if (!conn) - return 1; + return true; /* Neither local nor remote side had no-bonding as requirement */ if (conn->auth_type > 0x01 && conn->remote_auth > 0x01) - return 1; + return true; /* Local side had dedicated bonding as requirement */ if (conn->auth_type == 0x02 || conn->auth_type == 0x03) - return 1; + return true; /* Remote side had dedicated bonding as requirement */ if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03) - return 1; + return true; /* If none of the above criteria match, then don't store the key * persistently */ - return 0; + return false; } struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, __le16 ediv, u8 rand[8]) @@ -1286,7 +1286,8 @@ int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key, bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len) { struct link_key *key, *old_key; - u8 old_key_type, persistent; + u8 old_key_type; + bool persistent; old_key = hci_find_link_key(hdev, bdaddr); if (old_key) { diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 4ef275c69675..4bb03b111122 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2884,7 +2884,7 @@ int mgmt_write_scan_failed(struct hci_dev *hdev, u8 scan, u8 status) return 0; } -int mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, u8 persistent) +int mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, bool persistent) { struct mgmt_ev_new_link_key ev; -- cgit v1.2.3 From 6ec5bcadc21e13ceba8c144e4731eccac01d04f7 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Mon, 16 Apr 2012 14:44:44 +0530 Subject: Bluetooth: Temporary keys should be retained during connection If a key is non persistent then it should not be used in future connections but it should be kept for current connection. And it should be removed when connecion is removed. Signed-off-by: Vishal Agarwal Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 6 ++---- net/bluetooth/hci_event.c | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index c2251e4c3b72..a7607e4be347 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1330,10 +1330,8 @@ int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key, mgmt_new_link_key(hdev, key, persistent); - if (!persistent) { - list_del(&key->list); - kfree(key); - } + if (conn) + conn->flush_key = !persistent; return 0; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index badb7851d116..6a72eaea70ee 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1902,6 +1902,8 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff } if (ev->status == 0) { + if (conn->type == ACL_LINK && conn->flush_key) + hci_remove_link_key(hdev, &conn->dst); hci_proto_disconn_cfm(conn, ev->reason); hci_conn_del(conn); } -- cgit v1.2.3 From 244b65dbfede788f2fa3fe2463c44d0809e97c6b Mon Sep 17 00:00:00 2001 From: David Ward Date: Sun, 15 Apr 2012 12:31:45 +0000 Subject: net_sched: gred: Fix oops in gred_dump() in WRED mode A parameter set exists for WRED mode, called wred_set, to hold the same values for qavg and qidlestart across all VQs. The WRED mode values had been previously held in the VQ for the default DP. After these values were moved to wred_set, the VQ for the default DP was no longer created automatically (so that it could be omitted on purpose, to have packets in the default DP enqueued directly to the device without using RED). However, gred_dump() was overlooked during that change; in WRED mode it still reads qavg/qidlestart from the VQ for the default DP, which might not even exist. As a result, this command sequence will cause an oops: tc qdisc add dev $DEV handle $HANDLE parent $PARENT gred setup \ DPs 3 default 2 grio tc qdisc change dev $DEV handle $HANDLE gred DP 0 prio 8 $RED_OPTIONS tc qdisc change dev $DEV handle $HANDLE gred DP 1 prio 8 $RED_OPTIONS This fixes gred_dump() in WRED mode to use the values held in wred_set. Signed-off-by: David Ward Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 0b15236be7b6..8179494c269a 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -565,11 +565,8 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.packets = q->packetsin; opt.bytesin = q->bytesin; - if (gred_wred_mode(table)) { - q->vars.qidlestart = - table->tab[table->def]->vars.qidlestart; - q->vars.qavg = table->tab[table->def]->vars.qavg; - } + if (gred_wred_mode(table)) + gred_load_wred_set(table, q); opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg); -- cgit v1.2.3 From 973ef21a676e55a8e1a100a6e109f0c116ea75e8 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 16 Apr 2012 14:56:48 +0200 Subject: mac80211: fix truncated packets in cooked monitor rx Cooked monitor rx was recently changed to use ieee80211_add_rx_radiotap_header instead of generating only limited radiotap information. ieee80211_add_rx_radiotap_header assumes that FCS info is still present if the hardware supports receiving it, however when cooked monitor rx packets are processed, FCS info has already been stripped. Fix this by adding an extra flag indicating FCS presence. Signed-off-by: Felix Fietkau Signed-off-by: John W. Linville --- net/mac80211/rx.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bcfe8c77c839..d64e285400aa 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -103,7 +103,7 @@ static void ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_rate *rate, - int rtap_len) + int rtap_len, bool has_fcs) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_radiotap_header *rthdr; @@ -134,7 +134,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } /* IEEE80211_RADIOTAP_FLAGS */ - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) + if (has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)) *pos |= IEEE80211_RADIOTAP_F_FCS; if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC)) *pos |= IEEE80211_RADIOTAP_F_BADFCS; @@ -294,7 +294,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, } /* prepend radiotap information */ - ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom); + ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom, + true); skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -2571,7 +2572,8 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, goto out_free_skb; /* prepend radiotap information */ - ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom); + ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom, + false); skb_set_mac_header(skb, 0); skb->ip_summed = CHECKSUM_UNNECESSARY; -- cgit v1.2.3 From 6741e7f048dacc92e37c5d724ff5c64e45f6c2c9 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 16 Apr 2012 22:10:42 +0200 Subject: mac80211: fix logic error in ibss channel type check The broken check leads to rate control attempting to use HT40 while the driver is configured for HT20. This leads to interesting hardware issues. HT40 can only be used if the channel type is either HT40- or HT40+ and if the channel type of the cell matches the local type. Signed-off-by: Felix Fietkau Cc: stable@vger.kernel.org Signed-off-by: John W. Linville --- net/mac80211/ibss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 33fd8d9f714e..cef7c29214a8 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -457,8 +457,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, * fall back to HT20 if we don't use or use * the other extension channel */ - if ((channel_type == NL80211_CHAN_HT40MINUS || - channel_type == NL80211_CHAN_HT40PLUS) && + if (!(channel_type == NL80211_CHAN_HT40MINUS || + channel_type == NL80211_CHAN_HT40PLUS) || channel_type != sdata->u.ibss.channel_type) sta_ht_cap_new.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; -- cgit v1.2.3 From 4d846f02392a710f9604892ac3329e628e60a230 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2012 23:28:07 +0000 Subject: tcp: fix tcp_grow_window() for large incoming frames tcp_grow_window() has to grow rcv_ssthresh up to window_clamp, allowing sender to increase its window. tcp_grow_window() still assumes a tcp frame is under MSS, but its no longer true with LRO/GRO. This patch fixes one of the performance issue we noticed with GRO on. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Tom Herbert Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9944c1d9a218..3ff364065376 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -335,6 +335,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) incr = __tcp_grow_window(sk, skb); if (incr) { + incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; -- cgit v1.2.3 From b922934d017f1cc831b017913ed7d1a56c558b43 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 16 Apr 2012 04:43:15 +0000 Subject: netns: do not leak net_generic data on failed init ops_init should free the net_generic data on init failure and __register_pernet_operations should not call ops_free when NET_NS is not enabled. Signed-off-by: Julian Anastasov Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/net_namespace.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0e950fda9a0a..31a5ae51a45c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -83,21 +83,29 @@ assign: static int ops_init(const struct pernet_operations *ops, struct net *net) { - int err; + int err = -ENOMEM; + void *data = NULL; + if (ops->id && ops->size) { - void *data = kzalloc(ops->size, GFP_KERNEL); + data = kzalloc(ops->size, GFP_KERNEL); if (!data) - return -ENOMEM; + goto out; err = net_assign_generic(net, *ops->id, data); - if (err) { - kfree(data); - return err; - } + if (err) + goto cleanup; } + err = 0; if (ops->init) - return ops->init(net); - return 0; + err = ops->init(net); + if (!err) + return 0; + +cleanup: + kfree(data); + +out: + return err; } static void ops_free(const struct pernet_operations *ops, struct net *net) @@ -448,12 +456,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { - int err = 0; - err = ops_init(ops, &init_net); - if (err) - ops_free(ops, &init_net); - return err; - + return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) -- cgit v1.2.3 From adae0fe0ea87e8fb1a72dde304937c60759b495f Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 5 Apr 2012 21:04:37 +0400 Subject: SUNRPC: register PipeFS file system after pernet sybsystem PipeFS superblock creation routine relays on SUNRPC pernet data presense, which is created on register_pernet_subsys() call in SUNRPC module init function. Registering of PipeFS filesystem prior to registering of per-net subsystem leads to races (mount of PipeFS can dereference uninitialized data). Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/sunrpc_syms.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 8adfc88e793a..3d6498af9adc 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -75,19 +75,20 @@ static struct pernet_operations sunrpc_net_ops = { static int __init init_sunrpc(void) { - int err = register_rpc_pipefs(); + int err = rpc_init_mempool(); if (err) goto out; - err = rpc_init_mempool(); - if (err) - goto out2; err = rpcauth_init_module(); if (err) - goto out3; + goto out2; cache_initialize(); err = register_pernet_subsys(&sunrpc_net_ops); + if (err) + goto out3; + + err = register_rpc_pipefs(); if (err) goto out4; #ifdef RPC_DEBUG @@ -98,11 +99,11 @@ init_sunrpc(void) return 0; out4: - rpcauth_remove_module(); + unregister_pernet_subsys(&sunrpc_net_ops); out3: - rpc_destroy_mempool(); + rpcauth_remove_module(); out2: - unregister_rpc_pipefs(); + rpc_destroy_mempool(); out: return err; } -- cgit v1.2.3 From 22b4a4f22da4b39c6f7f679fd35f3d35c91bf851 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Apr 2012 10:14:23 +0000 Subject: tcp: fix retransmit of partially acked frames Alexander Beregalov reported skb_over_panic errors and provided stack trace. I occurs commit a21d45726aca (tcp: avoid order-1 allocations on wifi and tx path) added a regression, when a retransmit is done after a partial ACK. tcp_retransmit_skb() tries to aggregate several frames if the first one has enough available room to hold the following ones payload. This is controlled by /proc/sys/net/ipv4/tcp_retrans_collapse tunable (default : enabled) Problem is we must make sure _pskb_trim_head() doesnt fool skb_availroom() when pulling some bytes from skb (this pull is done when receiver ACK part of the frame). Reported-by: Alexander Beregalov Cc: Marc MERLIN Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 376b2cfbb685..7ac6423117ad 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1096,6 +1096,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = min_t(int, len, skb_headlen(skb)); if (eat) { __skb_pull(skb, eat); + skb->avail_size -= eat; len -= eat; if (!len) return; -- cgit v1.2.3 From 3adadc08cc1e2cbcc15a640d639297ef5fcb17f5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 Apr 2012 16:11:23 +0000 Subject: net ax25: Reorder ax25_exit to remove races. While reviewing the sysctl code in ax25 I spotted races in ax25_exit where it is possible to receive notifications and packets after already freeing up some of the data structures needed to process those notifications and updates. Call unregister_netdevice_notifier early so that the rest of the cleanup code does not need to deal with network devices. This takes advantage of my recent enhancement to unregister_netdevice_notifier to send unregister notifications of all network devices that are current registered. Move the unregistration for packet types, socket types and protocol types before we cleanup any of the ax25 data structures to remove the possibilities of other races. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 0906c194a413..9d9a6a3edbd5 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -2011,16 +2011,17 @@ static void __exit ax25_exit(void) proc_net_remove(&init_net, "ax25_route"); proc_net_remove(&init_net, "ax25"); proc_net_remove(&init_net, "ax25_calls"); - ax25_rt_free(); - ax25_uid_free(); - ax25_dev_free(); - ax25_unregister_sysctl(); unregister_netdevice_notifier(&ax25_dev_notifier); + ax25_unregister_sysctl(); dev_remove_pack(&ax25_packet_type); sock_unregister(PF_AX25); proto_unregister(&ax25_proto); + + ax25_rt_free(); + ax25_uid_free(); + ax25_dev_free(); } module_exit(ax25_exit); -- cgit v1.2.3 From bbe362be5368b9f531b95a4a9b502ae2832e1dac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Apr 2012 07:16:21 +0000 Subject: drop_monitor: allow more events per second It seems there is a logic error in trace_drop_common(), since we store only 64 drops, even if they are from same location. This fix is a one liner, but we probably need more work to avoid useless atomic dec/inc Now I can watch 1 Mpps drops through dropwatch... Signed-off-by: Eric Dumazet Cc: Neil Horman Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 7f36b38e060f..5c3c81a609e5 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -150,6 +150,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) for (i = 0; i < msg->entries; i++) { if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { msg->points[i].count++; + atomic_inc(&data->dm_hit_count); goto out; } } -- cgit v1.2.3 From d135c522f1234f62e81be29cebdf59e9955139ad Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 22 Apr 2012 09:45:47 +0000 Subject: tcp: fix TCP_MAXSEG for established IPv6 passive sockets Commit f5fff5d forgot to fix TCP_MAXSEG behavior IPv6 sockets, so IPv6 TCP server sockets that used TCP_MAXSEG would find that the advmss of child sockets would be incorrect. This commit mirrors the advmss logic from tcp_v4_syn_recv_sock in tcp_v6_syn_recv_sock. Eventually this logic should probably be shared between IPv4 and IPv6, but this at least fixes this issue. Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 86cfe6005f40..98256cf72f9d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1383,6 +1383,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + tcp_initialize_rcv_mss(newsk); if (tcp_rsk(req)->snt_synack) tcp_valid_rtt_meas(newsk, -- cgit v1.2.3 From a881e963c7fe1f226e991ee9bbe8907acda93294 Mon Sep 17 00:00:00 2001 From: "Peter Huang (Peng)" Date: Thu, 19 Apr 2012 20:12:51 +0000 Subject: set fake_rtable's dst to NULL to avoid kernel Oops bridge: set fake_rtable's dst to NULL to avoid kernel Oops when bridge is deleted before tap/vif device's delete, kernel may encounter an oops because of NULL reference to fake_rtable's dst. Set fake_rtable's dst to NULL before sending packets out can solve this problem. v4 reformat, change br_drop_fake_rtable(skb) to {} v3 enrich commit header v2 introducing new flag DST_FAKE_RTABLE to dst_entry struct. [ Use "do { } while (0)" for nop br_drop_fake_rtable() implementation -DaveM ] Acked-by: Eric Dumazet Signed-off-by: Peter Huang Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 1 + net/bridge/br_netfilter.c | 8 ++------ 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 61f65344e711..a2098e3de500 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -47,6 +47,7 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) kfree_skb(skb); } else { skb_push(skb, ETH_HLEN); + br_drop_fake_rtable(skb); dev_queue_xmit(skb); } diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index dec4f3817133..d7f49b63ab0f 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -156,7 +156,7 @@ void br_netfilter_rtable_init(struct net_bridge *br) rt->dst.dev = br->dev; rt->dst.path = &rt->dst; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_NOPEER; + rt->dst.flags = DST_NOXFRM | DST_NOPEER | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; } @@ -694,11 +694,7 @@ static unsigned int br_nf_local_in(unsigned int hook, struct sk_buff *skb, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct rtable *rt = skb_rtable(skb); - - if (rt && rt == bridge_parent_rtable(in)) - skb_dst_drop(skb); - + br_drop_fake_rtable(skb); return NF_ACCEPT; } -- cgit v1.2.3 From 16cde9931bcd8d8ca968ef1cded02684ea040374 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Fri, 13 Apr 2012 12:32:42 +0200 Subject: Bluetooth: Fix missing break in hci_cmd_complete_evt Command complete event for HCI_OP_USER_PASSKEY_NEG_REPLY would result in calling handler function also for HCI_OP_LE_SET_SCAN_PARAM. This could result in undefined behaviour. Signed-off-by: Szymon Janc Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6a72eaea70ee..7f87a70b8618 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2314,6 +2314,7 @@ static inline void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *sk case HCI_OP_USER_PASSKEY_NEG_REPLY: hci_cc_user_passkey_neg_reply(hdev, skb); + break; case HCI_OP_LE_SET_SCAN_PARAM: hci_cc_le_set_scan_param(hdev, skb); -- cgit v1.2.3 From afa762f6871a8cb05fbef5d0f83fac14304aa816 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Mon, 23 Apr 2012 14:45:15 +0300 Subject: mac80211: call ieee80211_mgd_stop() on interface stop ieee80211_mgd_teardown() is called on netdev removal, which occurs after the vif was already removed from the low-level driver, resulting in the following warning: [ 4809.014734] ------------[ cut here ]------------ [ 4809.019861] WARNING: at net/mac80211/driver-ops.h:12 ieee80211_bss_info_change_notify+0x200/0x2c8 [mac80211]() [ 4809.030388] wlan0: Failed check-sdata-in-driver check, flags: 0x4 [ 4809.036862] Modules linked in: wlcore_sdio(-) wl12xx wlcore mac80211 cfg80211 [last unloaded: cfg80211] [ 4809.046849] [] (unwind_backtrace+0x0/0x12c) [ 4809.055937] [] (dump_stack+0x20/0x24) [ 4809.065385] [] (warn_slowpath_common+0x5c/0x74) [ 4809.075589] [] (warn_slowpath_fmt+0x40/0x48) [ 4809.088291] [] (ieee80211_bss_info_change_notify+0x200/0x2c8 [mac80211]) [ 4809.102844] [] (ieee80211_destroy_auth_data+0x80/0xa4 [mac80211]) [ 4809.116276] [] (ieee80211_mgd_teardown+0x5c/0x74 [mac80211]) [ 4809.129331] [] (ieee80211_teardown_sdata+0xb0/0xd8 [mac80211]) [ 4809.141595] [] (rollback_registered_many+0x228/0x2f0) [ 4809.153056] [] (unregister_netdevice_many+0x28/0x50) [ 4809.165696] [] (ieee80211_remove_interfaces+0xb4/0xdc [mac80211]) [ 4809.179151] [] (ieee80211_unregister_hw+0x50/0xf0 [mac80211]) [ 4809.191043] [] (wlcore_remove+0x5c/0x7c [wlcore]) [ 4809.201491] [] (platform_drv_remove+0x24/0x28) [ 4809.212029] [] (__device_release_driver+0x8c/0xcc) [ 4809.222738] [] (device_release_driver+0x30/0x3c) [ 4809.233099] [] (bus_remove_device+0x10c/0x128) [ 4809.242620] [] (device_del+0x11c/0x17c) [ 4809.252150] [] (platform_device_del+0x28/0x68) [ 4809.263051] [] (wl1271_remove+0x3c/0x50 [wlcore_sdio]) [ 4809.273590] [] (sdio_bus_remove+0x48/0xf8) [ 4809.283754] [] (__device_release_driver+0x8c/0xcc) [ 4809.293729] [] (driver_detach+0x9c/0xc4) [ 4809.303163] [] (bus_remove_driver+0xc4/0xf4) [ 4809.312973] [] (driver_unregister+0x70/0x7c) [ 4809.323220] [] (sdio_unregister_driver+0x24/0x2c) [ 4809.334213] [] (wl1271_exit+0x14/0x1c [wlcore_sdio]) [ 4809.344930] [] (sys_delete_module+0x228/0x2a8) [ 4809.354734] ---[ end trace 515290ccf5feb522 ]--- Rename ieee80211_mgd_teardown() to ieee80211_mgd_stop(), and call it on ieee80211_do_stop(). Signed-off-by: Eliad Peller Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/iface.c | 4 ++-- net/mac80211/mlme.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d9798a307f20..db8fae51714c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1210,7 +1210,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); -void ieee80211_mgd_teardown(struct ieee80211_sub_if_data *sdata); +void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata); /* IBSS code */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 401c01f0731e..c20051b7ffcd 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -486,6 +486,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, /* free all potentially still buffered bcast frames */ local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps_bc_buf); skb_queue_purge(&sdata->u.ap.ps_bc_buf); + } else if (sdata->vif.type == NL80211_IFTYPE_STATION) { + ieee80211_mgd_stop(sdata); } if (going_down) @@ -644,8 +646,6 @@ static void ieee80211_teardown_sdata(struct net_device *dev) if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_rmc_free(sdata); - else if (sdata->vif.type == NL80211_IFTYPE_STATION) - ieee80211_mgd_teardown(sdata); flushed = sta_info_flush(local, sdata); WARN_ON(flushed); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f76da5b3f5c5..20c680bfc3ae 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3497,7 +3497,7 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, return 0; } -void ieee80211_mgd_teardown(struct ieee80211_sub_if_data *sdata) +void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; -- cgit v1.2.3 From 7118c07a844d367560ee91adb2071bde2fabcdbf Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sat, 14 Apr 2012 12:37:46 -0400 Subject: ipvs: Verify that IP_VS protocol has been registered The registration of a protocol might fail, there were no checks and all registrations were assumed to be correct. This lead to NULL ptr dereferences when apps tried registering. For example: [ 1293.226051] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 [ 1293.227038] IP: [] tcp_register_app+0x60/0xb0 [ 1293.227038] PGD 391de067 PUD 6c20b067 PMD 0 [ 1293.227038] Oops: 0000 [#1] PREEMPT SMP [ 1293.227038] CPU 1 [ 1293.227038] Pid: 19609, comm: trinity Tainted: G W 3.4.0-rc1-next-20120405-sasha-dirty #57 [ 1293.227038] RIP: 0010:[] [] tcp_register_app+0x60/0xb0 [ 1293.227038] RSP: 0018:ffff880038c1dd18 EFLAGS: 00010286 [ 1293.227038] RAX: ffffffffffffffc0 RBX: 0000000000001500 RCX: 0000000000010000 [ 1293.227038] RDX: 0000000000000000 RSI: ffff88003a2d5888 RDI: 0000000000000282 [ 1293.227038] RBP: ffff880038c1dd48 R08: 0000000000000000 R09: 0000000000000000 [ 1293.227038] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003a2d5668 [ 1293.227038] R13: ffff88003a2d5988 R14: ffff8800696a8ff8 R15: 0000000000000000 [ 1293.227038] FS: 00007f01930d9700(0000) GS:ffff88007ce00000(0000) knlGS:0000000000000000 [ 1293.227038] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1293.227038] CR2: 0000000000000018 CR3: 0000000065dfc000 CR4: 00000000000406e0 [ 1293.227038] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1293.227038] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1293.227038] Process trinity (pid: 19609, threadinfo ffff880038c1c000, task ffff88002dc73000) [ 1293.227038] Stack: [ 1293.227038] ffff880038c1dd48 00000000fffffff4 ffff8800696aada0 ffff8800694f5580 [ 1293.227038] ffffffff8369f1e0 0000000000001500 ffff880038c1dd98 ffffffff822a716b [ 1293.227038] 0000000000000000 ffff8800696a8ff8 0000000000000015 ffff8800694f5580 [ 1293.227038] Call Trace: [ 1293.227038] [] ip_vs_app_inc_new+0xdb/0x180 [ 1293.227038] [] register_ip_vs_app_inc+0x48/0x70 [ 1293.227038] [] __ip_vs_ftp_init+0xba/0x140 [ 1293.227038] [] ops_init+0x80/0x90 [ 1293.227038] [] setup_net+0x5b/0xe0 [ 1293.227038] [] copy_net_ns+0x76/0x100 [ 1293.227038] [] create_new_namespaces+0xfb/0x190 [ 1293.227038] [] unshare_nsproxy_namespaces+0x61/0x80 [ 1293.227038] [] sys_unshare+0xff/0x290 [ 1293.227038] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 1293.227038] [] system_call_fastpath+0x16/0x1b [ 1293.227038] Code: 89 c7 e8 34 91 3b 00 89 de 66 c1 ee 04 31 de 83 e6 0f 48 83 c6 22 48 c1 e6 04 4a 8b 14 26 49 8d 34 34 48 8d 42 c0 48 39 d6 74 13 <66> 39 58 58 74 22 48 8b 48 40 48 8d 41 c0 48 39 ce 75 ed 49 8d [ 1293.227038] RIP [] tcp_register_app+0x60/0xb0 [ 1293.227038] RSP [ 1293.227038] CR2: 0000000000000018 [ 1293.379284] ---[ end trace 364ab40c7011a009 ]--- [ 1293.381182] Kernel panic - not syncing: Fatal exception in interrupt Signed-off-by: Sasha Levin Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index f843a8833250..a62360e29037 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -59,9 +59,6 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) return 0; } -#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \ - defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \ - defined(CONFIG_IP_VS_PROTO_ESP) /* * register an ipvs protocols netns related data */ @@ -86,7 +83,6 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) return 0; } -#endif /* * unregister an ipvs protocol @@ -316,22 +312,35 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, */ int __net_init ip_vs_protocol_net_init(struct net *net) { + int i, ret; + static struct ip_vs_protocol *protos[] = { #ifdef CONFIG_IP_VS_PROTO_TCP - register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); + &ip_vs_protocol_tcp, #endif #ifdef CONFIG_IP_VS_PROTO_UDP - register_ip_vs_proto_netns(net, &ip_vs_protocol_udp); + &ip_vs_protocol_udp, #endif #ifdef CONFIG_IP_VS_PROTO_SCTP - register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp); + &ip_vs_protocol_sctp, #endif #ifdef CONFIG_IP_VS_PROTO_AH - register_ip_vs_proto_netns(net, &ip_vs_protocol_ah); + &ip_vs_protocol_ah, #endif #ifdef CONFIG_IP_VS_PROTO_ESP - register_ip_vs_proto_netns(net, &ip_vs_protocol_esp); + &ip_vs_protocol_esp, #endif + }; + + for (i = 0; i < ARRAY_SIZE(protos); i++) { + ret = register_ip_vs_proto_netns(net, protos[i]); + if (ret < 0) + goto cleanup; + } return 0; + +cleanup: + ip_vs_protocol_net_cleanup(net); + return ret; } void __net_exit ip_vs_protocol_net_cleanup(struct net *net) -- cgit v1.2.3 From 8f9b9a2fad47af27e14b037395e03cd8278d96d7 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 13 Apr 2012 18:08:43 +0300 Subject: ipvs: fix crash in ip_vs_control_net_cleanup on unload commit 14e405461e664b777e2a5636e10b2ebf36a686ec (2.6.39) ("Add __ip_vs_control_{init,cleanup}_sysctl()") introduced regression due to wrong __net_init for __ip_vs_control_cleanup_sysctl. This leads to crash when the ip_vs module is unloaded. Fix it by changing __net_init to __net_exit for the function that is already renamed to ip_vs_control_net_cleanup_sysctl. Signed-off-by: Julian Anastasov Signed-off-by: Hans Schillstrom Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b3afe189af61..376d2b12d581 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3680,7 +3680,7 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) return 0; } -void __net_init ip_vs_control_net_cleanup_sysctl(struct net *net) +void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -3692,7 +3692,7 @@ void __net_init ip_vs_control_net_cleanup_sysctl(struct net *net) #else int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } -void __net_init ip_vs_control_net_cleanup_sysctl(struct net *net) { } +void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } #endif -- cgit v1.2.3 From 62ad6fcd743792bf294f2a7ba26ab8f462065150 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Tue, 24 Apr 2012 18:15:41 +0000 Subject: udp_diag: implement idiag_get_info for udp/udplite to get queue information When we use netlink to monitor queue information for udp socket, idiag_rqueue and idiag_wqueue of inet_diag_msg are returned with 0. Keep consistent with netstat, just return back allocated rmem/wmem size. Signed-off-by: Shan Wei Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 2 +- net/ipv4/udp_diag.c | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8d25a1c557eb..8f8db724bfaf 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -141,7 +141,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto rtattr_failure; if (icsk == NULL) { - r->idiag_rqueue = r->idiag_wqueue = 0; + handler->idiag_get_info(sk, r, NULL); goto out; } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 8a949f19deb6..a7f86a3cd502 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -146,9 +146,17 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, return udp_dump_one(&udp_table, in_skb, nlh, req); } +static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info) +{ + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); +} + static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, + .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, }; @@ -167,6 +175,7 @@ static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr * static const struct inet_diag_handler udplite_diag_handler = { .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, + .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, }; -- cgit v1.2.3 From 8d08d71ce59438a6ef06be5db07966e0c144b74e Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 25 Apr 2012 00:29:59 +0300 Subject: ipvs: add check in ftp for initialized core Avoid crash when registering ip_vs_ftp after the IPVS core initialization for netns fails. Do this by checking for present core (net->ipvs). Signed-off-by: Julian Anastasov Acked-by: Hans Schillstrom Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ftp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 538d74ee4f68..e39f693dd3e4 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -439,6 +439,8 @@ static int __net_init __ip_vs_ftp_init(struct net *net) struct ip_vs_app *app; struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; app = kmemdup(&ip_vs_ftp, sizeof(struct ip_vs_app), GFP_KERNEL); if (!app) return -ENOMEM; -- cgit v1.2.3 From 39f618b4fd95ae243d940ec64c961009c74e3333 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 25 Apr 2012 00:29:58 +0300 Subject: ipvs: reset ipvs pointer in netns Make sure net->ipvs is reset on netns cleanup or failed initialization. It is needed for IPVS applications to know that IPVS core is not loaded in netns. Signed-off-by: Julian Anastasov Acked-by: Hans Schillstrom Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 2555816e7788..260b9ef88775 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1924,6 +1924,7 @@ protocol_fail: control_fail: ip_vs_estimator_net_cleanup(net); estimator_fail: + net->ipvs = NULL; return -ENOMEM; } @@ -1936,6 +1937,7 @@ static void __net_exit __ip_vs_cleanup(struct net *net) ip_vs_control_net_cleanup(net); ip_vs_estimator_net_cleanup(net); IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); + net->ipvs = NULL; } static void __net_exit __ip_vs_dev_cleanup(struct net *net) -- cgit v1.2.3 From 0848e4043014631d792a66266d6d7d64a7f21da5 Mon Sep 17 00:00:00 2001 From: "alex.bluesman.smirnov@gmail.com" Date: Wed, 25 Apr 2012 23:24:56 +0000 Subject: 6lowpan: fix segmentation fault caused by mlme request Add nescesary mlme callbacks to satisfy "iz list" request from user space. Due to 6lowpan device doesn't have its own phy, mlme implemented as a pipe to a real phy to which 6lowpan is attached. Signed-off-by: Alexander Smirnov Signed-off-by: David S. Miller --- net/ieee802154/6lowpan.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net') diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index 368515885368..f6b169439b5b 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -1044,6 +1044,24 @@ static void lowpan_dev_free(struct net_device *dev) free_netdev(dev); } +static struct wpan_phy *lowpan_get_phy(const struct net_device *dev) +{ + struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_phy(real_dev); +} + +static u16 lowpan_get_pan_id(const struct net_device *dev) +{ + struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev); +} + +static u16 lowpan_get_short_addr(const struct net_device *dev) +{ + struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); +} + static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; @@ -1053,6 +1071,12 @@ static const struct net_device_ops lowpan_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, }; +static struct ieee802154_mlme_ops lowpan_mlme = { + .get_pan_id = lowpan_get_pan_id, + .get_phy = lowpan_get_phy, + .get_short_addr = lowpan_get_short_addr, +}; + static void lowpan_setup(struct net_device *dev) { pr_debug("(%s)\n", __func__); @@ -1070,6 +1094,7 @@ static void lowpan_setup(struct net_device *dev) dev->netdev_ops = &lowpan_netdev_ops; dev->header_ops = &lowpan_header_ops; + dev->ml_priv = &lowpan_mlme; dev->destructor = lowpan_dev_free; } -- cgit v1.2.3 From 8deff4af8745561efd2be34ee30cc57a6f0107c6 Mon Sep 17 00:00:00 2001 From: "alex.bluesman.smirnov@gmail.com" Date: Wed, 25 Apr 2012 23:24:57 +0000 Subject: 6lowpan: clean up fragments list if module unloaded Clean all the pending fragments and relative timers if 6lowpan link is going to be deleted. Signed-off-by: Alexander Smirnov Signed-off-by: David S. Miller --- net/ieee802154/6lowpan.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index f6b169439b5b..b3021f765204 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -1177,11 +1177,20 @@ static void lowpan_dellink(struct net_device *dev, struct list_head *head) { struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev); struct net_device *real_dev = lowpan_dev->real_dev; - struct lowpan_dev_record *entry; - struct lowpan_dev_record *tmp; + struct lowpan_dev_record *entry, *tmp; + struct lowpan_fragment *frame, *tframe; ASSERT_RTNL(); + spin_lock(&flist_lock); + list_for_each_entry_safe(frame, tframe, &lowpan_fragments, list) { + del_timer(&frame->timer); + list_del(&frame->list); + dev_kfree_skb(frame->skb); + kfree(frame); + } + spin_unlock(&flist_lock); + mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { if (entry->ldev == dev) { -- cgit v1.2.3 From 768f7c7c121e80f458a9d013b2e8b169e5dfb1e5 Mon Sep 17 00:00:00 2001 From: "alex.bluesman.smirnov@gmail.com" Date: Wed, 25 Apr 2012 23:24:58 +0000 Subject: 6lowpan: add missing spin_lock_init() Add missing spin_lock_init() for frames list lock. Signed-off-by: Alexander Smirnov Signed-off-by: David S. Miller --- net/ieee802154/6lowpan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index b3021f765204..840821b90bcd 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -1168,6 +1168,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, list_add_tail(&entry->list, &lowpan_devices); mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); + spin_lock_init(&flist_lock); + register_netdevice(dev); return 0; -- cgit v1.2.3 From 651913ce9de2bbcedef608c5d6cf39c244248509 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 27 Apr 2012 11:29:37 -0400 Subject: tcp: clean up use of jiffies in tcp_rcv_rtt_measure() Clean up a reference to jiffies in tcp_rcv_rtt_measure() that should instead reference tcp_time_stamp. Since the result of the subtraction is passed into a function taking u32, this should not change any behavior (and indeed the generated assembly does not change on x86_64). However, it seems worth cleaning this up for consistency and clarity (and perhaps to avoid bugs if this is copied and pasted somewhere else). Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ff364065376..2a702e3a4ae6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -495,7 +495,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); + tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; -- cgit v1.2.3 From a4dff1bc492ee4a2184d384ae8b5bcab5859e150 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 20 Apr 2012 18:11:02 +0400 Subject: SUNRPC: skip dead but not buried clients on PipeFS events These clients can't be safely dereferenced if their counter in 0. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 67972462a543..d10ebc4310f7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -218,7 +218,8 @@ static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event) if (((event == RPC_PIPEFS_MOUNT) && clnt->cl_dentry) || ((event == RPC_PIPEFS_UMOUNT) && !clnt->cl_dentry)) continue; - atomic_inc(&clnt->cl_count); + if (atomic_inc_not_zero(&clnt->cl_count) == 0) + continue; spin_unlock(&sn->rpc_client_lock); return clnt; } -- cgit v1.2.3 From 7aab449e5a2ebfa9c5116e87e16536bc4195e4de Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 20 Apr 2012 18:19:18 +0400 Subject: SUNRPC: skip clients with program without PipeFS entries 1) This is sane. 2) Otherwise there will be soft lockup: do { rpc_get_client_for_event (clnt->cl_dentry == NULL ==> choose) __rpc_pipefs_event (clnt->cl_program->pipe_dir_name == NULL ==> return) } while (1) Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d10ebc4310f7..8a19849ff340 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -184,8 +184,6 @@ static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event, switch (event) { case RPC_PIPEFS_MOUNT: - if (clnt->cl_program->pipe_dir_name == NULL) - break; dentry = rpc_setup_pipedir_sb(sb, clnt, clnt->cl_program->pipe_dir_name); BUG_ON(dentry == NULL); @@ -215,6 +213,8 @@ static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event) spin_lock(&sn->rpc_client_lock); list_for_each_entry(clnt, &sn->all_clients, cl_clients) { + if (clnt->cl_program->pipe_dir_name == NULL) + break; if (((event == RPC_PIPEFS_MOUNT) && clnt->cl_dentry) || ((event == RPC_PIPEFS_UMOUNT) && !clnt->cl_dentry)) continue; -- cgit v1.2.3 From 37629b572cc4e80fc24b4139a24df1a89415d534 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 20 Apr 2012 18:19:56 +0400 Subject: SUNRPC: set per-net PipeFS superblock before notification There can be a case, when on MOUNT event RPC client (after it's dentries were created) is not longer hold by anyone except notification callback. I.e. on release this client will be destoroyed. And it's dentries have to be destroyed as well. Which in turn requires per-net PipeFS superblock to be set. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 0af37fc46818..3b62cf288031 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1126,19 +1126,20 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", net, NET_NAME(net)); + sn->pipefs_sb = sb; err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_MOUNT, sb); if (err) goto err_depopulate; sb->s_fs_info = get_net(net); - sn->pipefs_sb = sb; return 0; err_depopulate: blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); + sn->pipefs_sb = NULL; __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); return err; } -- cgit v1.2.3 From ea8cfa06795bb30d2ea61f503ef129284492c06a Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 27 Apr 2012 13:00:17 +0400 Subject: SUNRPC: traverse clients tree on PipeFS event v2: recursion was replaced by loop If client is a clone, then it's parent can not be in the list. But parent's Pipefs dentries have to be created and destroyed. Note: event skip helper for clients introduced Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8a19849ff340..d127bd747527 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -176,8 +176,16 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, const char *dir_name) return 0; } -static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event, - struct super_block *sb) +static inline int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) +{ + if (((event == RPC_PIPEFS_MOUNT) && clnt->cl_dentry) || + ((event == RPC_PIPEFS_UMOUNT) && !clnt->cl_dentry)) + return 1; + return 0; +} + +static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, + struct super_block *sb) { struct dentry *dentry; int err = 0; @@ -206,6 +214,20 @@ static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event, return err; } +static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event, + struct super_block *sb) +{ + int error = 0; + + for (;; clnt = clnt->cl_parent) { + if (!rpc_clnt_skip_event(clnt, event)) + error = __rpc_clnt_handle_event(clnt, event, sb); + if (error || clnt == clnt->cl_parent) + break; + } + return error; +} + static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); @@ -215,8 +237,7 @@ static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event) list_for_each_entry(clnt, &sn->all_clients, cl_clients) { if (clnt->cl_program->pipe_dir_name == NULL) break; - if (((event == RPC_PIPEFS_MOUNT) && clnt->cl_dentry) || - ((event == RPC_PIPEFS_UMOUNT) && !clnt->cl_dentry)) + if (rpc_clnt_skip_event(clnt, event)) continue; if (atomic_inc_not_zero(&clnt->cl_count) == 0) continue; -- cgit v1.2.3 From cde2e9a651b76d8db36ae94cd0febc82b637e5dd Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 27 Apr 2012 10:11:48 +0000 Subject: drop_monitor: fix sleeping in invalid context warning Eric Dumazet pointed out this warning in the drop_monitor protocol to me: [ 38.352571] BUG: sleeping function called from invalid context at kernel/mutex.c:85 [ 38.352576] in_atomic(): 1, irqs_disabled(): 0, pid: 4415, name: dropwatch [ 38.352580] Pid: 4415, comm: dropwatch Not tainted 3.4.0-rc2+ #71 [ 38.352582] Call Trace: [ 38.352592] [] ? trace_napi_poll_hit+0xd0/0xd0 [ 38.352599] [] __might_sleep+0xca/0xf0 [ 38.352606] [] mutex_lock+0x26/0x50 [ 38.352610] [] ? trace_napi_poll_hit+0xd0/0xd0 [ 38.352616] [] tracepoint_probe_register+0x29/0x90 [ 38.352621] [] set_all_monitor_traces+0x105/0x170 [ 38.352625] [] net_dm_cmd_trace+0x2a/0x40 [ 38.352630] [] genl_rcv_msg+0x21a/0x2b0 [ 38.352636] [] ? zone_statistics+0x99/0xc0 [ 38.352640] [] ? genl_rcv+0x30/0x30 [ 38.352645] [] netlink_rcv_skb+0xa9/0xd0 [ 38.352649] [] genl_rcv+0x20/0x30 [ 38.352653] [] netlink_unicast+0x1ae/0x1f0 [ 38.352658] [] netlink_sendmsg+0x2b6/0x310 [ 38.352663] [] sock_sendmsg+0x10f/0x130 [ 38.352668] [] ? move_addr_to_kernel+0x60/0xb0 [ 38.352673] [] ? verify_iovec+0x64/0xe0 [ 38.352677] [] __sys_sendmsg+0x386/0x390 [ 38.352682] [] ? handle_mm_fault+0x139/0x210 [ 38.352687] [] ? do_page_fault+0x1ec/0x4f0 [ 38.352693] [] ? set_next_entity+0x9d/0xb0 [ 38.352699] [] ? tty_ldisc_deref+0x9/0x10 [ 38.352703] [] ? pick_next_task_fair+0x63/0x140 [ 38.352708] [] sys_sendmsg+0x44/0x80 [ 38.352713] [] system_call_fastpath+0x16/0x1b It stems from holding a spinlock (trace_state_lock) while attempting to register or unregister tracepoint hooks, making in_atomic() true in this context, leading to the warning when the tracepoint calls might_sleep() while its taking a mutex. Since we only use the trace_state_lock to prevent trace protocol state races, as well as hardware stat list updates on an rcu write side, we can just convert the spinlock to a mutex to avoid this problem. Signed-off-by: Neil Horman Reported-by: Eric Dumazet CC: David Miller Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 5c3c81a609e5..a221a5bbecf7 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -42,7 +42,7 @@ static void send_dm_alert(struct work_struct *unused); * netlink alerts */ static int trace_state = TRACE_OFF; -static DEFINE_SPINLOCK(trace_state_lock); +static DEFINE_MUTEX(trace_state_mutex); struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -214,7 +214,7 @@ static int set_all_monitor_traces(int state) struct dm_hw_stat_delta *new_stat = NULL; struct dm_hw_stat_delta *temp; - spin_lock(&trace_state_lock); + mutex_lock(&trace_state_mutex); if (state == trace_state) { rc = -EAGAIN; @@ -253,7 +253,7 @@ static int set_all_monitor_traces(int state) rc = -EINPROGRESS; out_unlock: - spin_unlock(&trace_state_lock); + mutex_unlock(&trace_state_mutex); return rc; } @@ -296,12 +296,12 @@ static int dropmon_net_event(struct notifier_block *ev_block, new_stat->dev = dev; new_stat->last_rx = jiffies; - spin_lock(&trace_state_lock); + mutex_lock(&trace_state_mutex); list_add_rcu(&new_stat->list, &hw_stats_list); - spin_unlock(&trace_state_lock); + mutex_unlock(&trace_state_mutex); break; case NETDEV_UNREGISTER: - spin_lock(&trace_state_lock); + mutex_lock(&trace_state_mutex); list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { if (new_stat->dev == dev) { new_stat->dev = NULL; @@ -312,7 +312,7 @@ static int dropmon_net_event(struct notifier_block *ev_block, } } } - spin_unlock(&trace_state_lock); + mutex_unlock(&trace_state_mutex); break; } out: -- cgit v1.2.3 From 3885ca785a3618593226687ced84f3f336dc3860 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 27 Apr 2012 10:11:49 +0000 Subject: drop_monitor: Make updating data->skb smp safe Eric Dumazet pointed out to me that the drop_monitor protocol has some holes in its smp protections. Specifically, its possible to replace data->skb while its being written. This patch corrects that by making data->skb an rcu protected variable. That will prevent it from being overwritten while a tracepoint is modifying it. Signed-off-by: Neil Horman Reported-by: Eric Dumazet CC: David Miller Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 70 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index a221a5bbecf7..7592943513e3 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -46,7 +46,7 @@ static DEFINE_MUTEX(trace_state_mutex); struct per_cpu_dm_data { struct work_struct dm_alert_work; - struct sk_buff *skb; + struct sk_buff __rcu *skb; atomic_t dm_hit_count; struct timer_list send_timer; }; @@ -73,35 +73,58 @@ static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; static LIST_HEAD(hw_stats_list); +static int initialized = 0; static void reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; struct net_dm_alert_msg *msg; struct nlattr *nla; + struct sk_buff *skb; + struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1); al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); al += sizeof(struct nlattr); - data->skb = genlmsg_new(al, GFP_KERNEL); - genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family, - 0, NET_DM_CMD_ALERT); - nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); - msg = nla_data(nla); - memset(msg, 0, al); - atomic_set(&data->dm_hit_count, dm_hit_limit); + skb = genlmsg_new(al, GFP_KERNEL); + + if (skb) { + genlmsg_put(skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + nla = nla_reserve(skb, NLA_UNSPEC, + sizeof(struct net_dm_alert_msg)); + msg = nla_data(nla); + memset(msg, 0, al); + } else if (initialized) + schedule_work_on(smp_processor_id(), &data->dm_alert_work); + + /* + * Don't need to lock this, since we are guaranteed to only + * run this on a single cpu at a time. + * Note also that we only update data->skb if the old and new skb + * pointers don't match. This ensures that we don't continually call + * synchornize_rcu if we repeatedly fail to alloc a new netlink message. + */ + if (skb != oskb) { + rcu_assign_pointer(data->skb, skb); + + synchronize_rcu(); + + atomic_set(&data->dm_hit_count, dm_hit_limit); + } + } static void send_dm_alert(struct work_struct *unused) { struct sk_buff *skb; - struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data); /* * Grab the skb we're about to send */ - skb = data->skb; + skb = rcu_dereference_protected(data->skb, 1); /* * Replace it with a new one @@ -111,8 +134,10 @@ static void send_dm_alert(struct work_struct *unused) /* * Ship it! */ - genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); + if (skb) + genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); + put_cpu_var(dm_cpu_data); } /* @@ -123,9 +148,11 @@ static void send_dm_alert(struct work_struct *unused) */ static void sched_send_work(unsigned long unused) { - struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data); + + schedule_work_on(smp_processor_id(), &data->dm_alert_work); - schedule_work(&data->dm_alert_work); + put_cpu_var(dm_cpu_data); } static void trace_drop_common(struct sk_buff *skb, void *location) @@ -134,9 +161,16 @@ static void trace_drop_common(struct sk_buff *skb, void *location) struct nlmsghdr *nlh; struct nlattr *nla; int i; - struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + struct sk_buff *dskb; + struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data); + rcu_read_lock(); + dskb = rcu_dereference(data->skb); + + if (!dskb) + goto out; + if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) { /* * we're already at zero, discard this hit @@ -144,7 +178,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) goto out; } - nlh = (struct nlmsghdr *)data->skb->data; + nlh = (struct nlmsghdr *)dskb->data; nla = genlmsg_data(nlmsg_data(nlh)); msg = nla_data(nla); for (i = 0; i < msg->entries; i++) { @@ -158,7 +192,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) /* * We need to create a new entry */ - __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point)); + __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); msg->points[msg->entries].count = 1; @@ -170,6 +204,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location) } out: + rcu_read_unlock(); + put_cpu_var(dm_cpu_data); return; } @@ -375,6 +411,8 @@ static int __init init_net_drop_monitor(void) data->send_timer.function = sched_send_work; } + initialized = 1; + goto out; out_unreg: -- cgit v1.2.3 From 4b984cd50bc1b6d492175cd77bfabb78e76ffa67 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Thu, 26 Apr 2012 09:45:34 +0200 Subject: ipvs: null check of net->ipvs in lblc(r) shedulers Avoid crash when registering shedulers after the IPVS core initialization for netns fails. Do this by checking for present core (net->ipvs). Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_lblc.c | 3 +++ net/netfilter/ipvs/ip_vs_lblcr.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 0f16283fd058..caa43704e55e 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -551,6 +551,9 @@ static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; + if (!net_eq(net, &init_net)) { ipvs->lblc_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index eec797f8cce7..548bf37aa29e 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -745,6 +745,9 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; + if (!net_eq(net, &init_net)) { ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), -- cgit v1.2.3 From 582b8e3eadaec77788c1aa188081a8d5059c42a6 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Thu, 26 Apr 2012 09:45:35 +0200 Subject: ipvs: take care of return value from protocol init_netns ip_vs_create_timeout_table() can return NULL All functions protocol init_netns is affected of this patch. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto.c | 11 +++++++++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 5 ++++- net/netfilter/ipvs/ip_vs_proto_tcp.c | 5 ++++- net/netfilter/ipvs/ip_vs_proto_udp.c | 5 ++++- 4 files changed, 21 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index a62360e29037..ed835e67a07e 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -78,8 +78,15 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) ipvs->proto_data_table[hash] = pd; atomic_set(&pd->appcnt, 0); /* Init app counter */ - if (pp->init_netns != NULL) - pp->init_netns(net, pd); + if (pp->init_netns != NULL) { + int ret = pp->init_netns(net, pd); + if (ret) { + /* unlink an free proto data */ + ipvs->proto_data_table[hash] = pd->next; + kfree(pd); + return ret; + } + } return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 1fbf7a2816f5..9f3fb751c491 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -1090,7 +1090,7 @@ out: * timeouts is netns related now. * --------------------------------------------- */ -static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1098,6 +1098,9 @@ static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) spin_lock_init(&ipvs->sctp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; } static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index ef8641f7af83..cd609cc62721 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -677,7 +677,7 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) * timeouts is netns related now. * --------------------------------------------- */ -static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -685,7 +685,10 @@ static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) spin_lock_init(&ipvs->tcp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; pd->tcp_state_table = tcp_states; + return 0; } static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index f4b7262896bb..2fedb2dcb3d1 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -467,7 +467,7 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; } -static void __udp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -475,6 +475,9 @@ static void __udp_init(struct net *net, struct ip_vs_proto_data *pd) spin_lock_init(&ipvs->udp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; } static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) -- cgit v1.2.3 From 8537de8a7ab6681cc72fb0411ab1ba7fdba62dd0 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Thu, 26 Apr 2012 07:47:44 +0200 Subject: ipvs: kernel oops - do_ip_vs_get_ctl Change order of init so netns init is ready when register ioctl and netlink. Ver2 Whitespace fixes and __init added. Reported-by: "Ryan O'Hara" Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 9 +++++++ net/netfilter/ipvs/ip_vs_ctl.c | 52 ++++++++++++++++++++++++----------------- 2 files changed, 39 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 260b9ef88775..00bdb1d9d690 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1995,10 +1995,18 @@ static int __init ip_vs_init(void) goto cleanup_dev; } + ret = ip_vs_register_nl_ioctl(); + if (ret < 0) { + pr_err("can't register netlink/ioctl.\n"); + goto cleanup_hooks; + } + pr_info("ipvs loaded.\n"); return ret; +cleanup_hooks: + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); cleanup_dev: unregister_pernet_device(&ipvs_core_dev_ops); cleanup_sub: @@ -2014,6 +2022,7 @@ exit: static void __exit ip_vs_cleanup(void) { + ip_vs_unregister_nl_ioctl(); nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); unregister_pernet_device(&ipvs_core_dev_ops); unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 376d2b12d581..f5589987fc80 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3750,21 +3750,10 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) free_percpu(ipvs->tot_stats.cpustats); } -int __init ip_vs_control_init(void) +int __init ip_vs_register_nl_ioctl(void) { - int idx; int ret; - EnterFunction(2); - - /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - - smp_wmb(); /* Do we really need it now ? */ - ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); @@ -3776,28 +3765,47 @@ int __init ip_vs_control_init(void) pr_err("cannot register Generic Netlink interface.\n"); goto err_genl; } - - ret = register_netdevice_notifier(&ip_vs_dst_notifier); - if (ret < 0) - goto err_notf; - - LeaveFunction(2); return 0; -err_notf: - ip_vs_genl_unregister(); err_genl: nf_unregister_sockopt(&ip_vs_sockopts); err_sock: return ret; } +void ip_vs_unregister_nl_ioctl(void) +{ + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); +} + +int __init ip_vs_control_init(void) +{ + int idx; + int ret; + + EnterFunction(2); + + /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + + smp_wmb(); /* Do we really need it now ? */ + + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + return ret; + + LeaveFunction(2); + return 0; +} + void ip_vs_control_cleanup(void) { EnterFunction(2); unregister_netdevice_notifier(&ip_vs_dst_notifier); - ip_vs_genl_unregister(); - nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); } -- cgit v1.2.3 From 6cf51852486af3d79f57bf46d00209a14244dbaa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 27 Apr 2012 02:00:50 +0200 Subject: netfilter: xt_CT: fix wrong checking in the timeout assignment path The current checking always succeeded. We have to check the first character of the string to check that it's empty, thus, skipping the timeout path. This fixes the use of the CT target without the timeout option. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_CT.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 59530e93fa58..3746d8b9a478 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -227,7 +227,7 @@ static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par) } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT - if (info->timeout) { + if (info->timeout[0]) { typeof(nf_ct_timeout_find_get_hook) timeout_find_get; struct nf_conn_timeout *timeout_ext; -- cgit v1.2.3 From cbbb34498f8b2b26cbdc79532c8a2ee5cd1e756a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Apr 2012 11:52:40 -0400 Subject: SUNRPC: RPC client must use the current utsname hostname string Now that the rpc client is namespace aware, it needs to use the utsname of the process that created it instead of using the init_utsname. Both rpc_new_client and rpc_clone_client need to be fixed. Signed-off-by: Trond Myklebust Cc: Stanislav Kinsbursky --- net/sunrpc/clnt.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d127bd747527..adf2990acebf 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -279,6 +279,14 @@ void rpc_clients_notifier_unregister(void) return rpc_pipefs_notifier_unregister(&rpc_clients_block); } +static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) +{ + clnt->cl_nodelen = strlen(nodename); + if (clnt->cl_nodelen > UNX_MAXNODENAME) + clnt->cl_nodelen = UNX_MAXNODENAME; + memcpy(clnt->cl_nodename, nodename, clnt->cl_nodelen); +} + static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt *xprt) { const struct rpc_program *program = args->program; @@ -359,10 +367,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru } /* save the nodename */ - clnt->cl_nodelen = strlen(init_utsname()->nodename); - if (clnt->cl_nodelen > UNX_MAXNODENAME) - clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen); + rpc_clnt_set_nodename(clnt, utsname()->nodename); rpc_register_client(clnt); return clnt; @@ -521,6 +526,7 @@ rpc_clone_client(struct rpc_clnt *clnt) err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name); if (err != 0) goto out_no_path; + rpc_clnt_set_nodename(new, utsname()->nodename); if (new->cl_auth) atomic_inc(&new->cl_auth->au_count); atomic_inc(&clnt->cl_count); -- cgit v1.2.3 From 1cebce36d660c83bd1353e41f3e66abd4686f215 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 30 Apr 2012 06:00:18 +0000 Subject: tcp: fix infinite cwnd in tcp_complete_cwr() When the cwnd reduction is done, ssthresh may be infinite if TCP enters CWR via ECN or F-RTO. If cwnd is not undone, i.e., undo_marker is set, tcp_complete_cwr() falsely set cwnd to the infinite ssthresh value. The correct operation is to keep cwnd intact because it has been updated in ECN or F-RTO. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2a702e3a4ae6..d99efd7dfb6c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2868,11 +2868,14 @@ static inline void tcp_complete_cwr(struct sock *sk) /* Do not moderate cwnd if it's already undone in cwr or recovery. */ if (tp->undo_marker) { - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) + if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - else /* PRR */ + tp->snd_cwnd_stamp = tcp_time_stamp; + } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { + /* PRR algorithm. */ tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_time_stamp; + } } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } -- cgit v1.2.3 From 66f2c99af3d6f2d0aa1120884cf1c60613ef61c0 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 29 Apr 2012 15:44:16 +0200 Subject: mac80211: fix AP mode EAP tx for VLAN stations EAP frames for stations in an AP VLAN are sent on the main AP interface to avoid race conditions wrt. moving stations. For that to work properly, sta_info_get_bss must be used instead of sta_info_get when sending EAP packets. Previously this was only done for cooked monitor injected packets, so this patch adds a check for tx->skb->protocol to the same place. Signed-off-by: Felix Fietkau Cc: stable@vger.kernel.org Signed-off-by: John W. Linville --- net/mac80211/tx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 782a60198df4..e76facc69e95 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1158,7 +1158,8 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, tx->sta = rcu_dereference(sdata->u.vlan.sta); if (!tx->sta && sdata->dev->ieee80211_ptr->use_4addr) return TX_DROP; - } else if (info->flags & IEEE80211_TX_CTL_INJECTED) { + } else if (info->flags & IEEE80211_TX_CTL_INJECTED || + tx->sdata->control_port_protocol == tx->skb->protocol) { tx->sta = sta_info_get_bss(sdata, hdr->addr1); } if (!tx->sta) -- cgit v1.2.3 From 116a0fc31c6c9b8fc821be5a96e5bf0b43260131 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Apr 2012 09:08:22 +0000 Subject: netem: fix possible skb leak skb_checksum_help(skb) can return an error, we must free skb in this case. qdisc_drop(skb, sch) can also be feeded with a NULL skb (if skb_unshare() failed), so lets use this generic helper. Signed-off-by: Eric Dumazet Cc: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5da548fa7ae9..ebd22966f748 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -408,10 +408,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) { - sch->qstats.drops++; - return NET_XMIT_DROP; - } + skb_checksum_help(skb))) + return qdisc_drop(skb, sch); skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); } -- cgit v1.2.3 From 4fdcfa12843bca38d0c9deff70c8720e4e8f515f Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 1 May 2012 08:18:02 +0000 Subject: drop_monitor: prevent init path from scheduling on the wrong cpu I just noticed after some recent updates, that the init path for the drop monitor protocol has a minor error. drop monitor maintains a per cpu structure, that gets initalized from a single cpu. Normally this is fine, as the protocol isn't in use yet, but I recently made a change that causes a failed skb allocation to reschedule itself . Given the current code, the implication is that this workqueue reschedule will take place on the wrong cpu. If drop monitor is used early during the boot process, its possible that two cpus will access a single per-cpu structure in parallel, possibly leading to data corruption. This patch fixes the situation, by storing the cpu number that a given instance of this per-cpu data should be accessed from. In the case of a need for a reschedule, the cpu stored in the struct is assigned the rescheule, rather than the currently executing cpu Tested successfully by myself. Signed-off-by: Neil Horman CC: David Miller Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 7592943513e3..a7cad741df01 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -49,6 +49,7 @@ struct per_cpu_dm_data { struct sk_buff __rcu *skb; atomic_t dm_hit_count; struct timer_list send_timer; + int cpu; }; struct dm_hw_stat_delta { @@ -73,7 +74,6 @@ static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; static LIST_HEAD(hw_stats_list); -static int initialized = 0; static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -96,8 +96,8 @@ static void reset_per_cpu_data(struct per_cpu_dm_data *data) sizeof(struct net_dm_alert_msg)); msg = nla_data(nla); memset(msg, 0, al); - } else if (initialized) - schedule_work_on(smp_processor_id(), &data->dm_alert_work); + } else + schedule_work_on(data->cpu, &data->dm_alert_work); /* * Don't need to lock this, since we are guaranteed to only @@ -121,6 +121,8 @@ static void send_dm_alert(struct work_struct *unused) struct sk_buff *skb; struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data); + WARN_ON_ONCE(data->cpu != smp_processor_id()); + /* * Grab the skb we're about to send */ @@ -404,14 +406,14 @@ static int __init init_net_drop_monitor(void) for_each_present_cpu(cpu) { data = &per_cpu(dm_cpu_data, cpu); - reset_per_cpu_data(data); + data->cpu = cpu; INIT_WORK(&data->dm_alert_work, send_dm_alert); init_timer(&data->send_timer); data->send_timer.data = cpu; data->send_timer.function = sched_send_work; + reset_per_cpu_data(data); } - initialized = 1; goto out; -- cgit v1.2.3 From 84768edbb2721637620b2d84501bb0d5aed603f1 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 2 May 2012 03:58:43 +0000 Subject: net: l2tp: unlock socket lock before returning from l2tp_ip_sendmsg l2tp_ip_sendmsg could return without releasing socket lock, making it all the way to userspace, and generating the following warning: [ 130.891594] ================================================ [ 130.894569] [ BUG: lock held when returning to user space! ] [ 130.897257] 3.4.0-rc5-next-20120501-sasha #104 Tainted: G W [ 130.900336] ------------------------------------------------ [ 130.902996] trinity/8384 is leaving the kernel with locks still held! [ 130.906106] 1 lock held by trinity/8384: [ 130.907924] #0: (sk_lock-AF_INET){+.+.+.}, at: [] l2tp_ip_sendmsg+0x2f/0x550 Introduced by commit 2f16270 ("l2tp: Fix locking in l2tp_ip.c"). Signed-off-by: Sasha Levin Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 585d93ecee2d..6274f0be82b0 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -442,8 +442,9 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m daddr = lip->l2tp_addr.s_addr; } else { + rc = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) - return -EDESTADDRREQ; + goto out; daddr = inet->inet_daddr; connected = 1; -- cgit v1.2.3 From b49960a05e32121d29316cfdf653894b88ac9190 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2012 02:28:41 +0000 Subject: tcp: change tcp_adv_win_scale and tcp_rmem[2] tcp_adv_win_scale default value is 2, meaning we expect a good citizen skb to have skb->len / skb->truesize ratio of 75% (3/4) In 2.6 kernels we (mis)accounted for typical MSS=1460 frame : 1536 + 64 + 256 = 1856 'estimated truesize', and 1856 * 3/4 = 1392. So these skbs were considered as not bloated. With recent truesize fixes, a typical MSS=1460 frame truesize is now the more precise : 2048 + 256 = 2304. But 2304 * 3/4 = 1728. So these skb are not good citizen anymore, because 1460 < 1728 (GRO can escape this problem because it build skbs with a too low truesize.) This also means tcp advertises a too optimistic window for a given allocated rcvspace : When receiving frames, sk_rmem_alloc can hit sk_rcvbuf limit and we call tcp_prune_queue()/tcp_collapse() too often, especially when application is slow to drain its receive queue or in case of losses (netperf is fast, scp is slow). This is a major latency source. We should adjust the len/truesize ratio to 50% instead of 75% This patch : 1) changes tcp_adv_win_scale default to 1 instead of 2 2) increase tcp_rmem[2] limit from 4MB to 6MB to take into account better truesize tracking and to allow autotuning tcp receive window to reach same value than before. Note that same amount of kernel memory is consumed compared to 2.6 kernels. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Tom Herbert Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 9 +++++---- net/ipv4/tcp_input.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8bb6adeb62c0..1272a88c2a63 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3243,7 +3243,7 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int max_share, cnt; + int max_rshare, max_wshare, cnt; unsigned int i; unsigned long jiffy = jiffies; @@ -3303,15 +3303,16 @@ void __init tcp_init(void) tcp_init_mem(&init_net); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); - max_share = min(4UL*1024*1024, limit); + max_wshare = min(4UL*1024*1024, limit); + max_rshare = min(6UL*1024*1024, limit); sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_share); + sysctl_tcp_wmem[2] = max(64*1024, max_wshare); sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_share); + sysctl_tcp_rmem[2] = max(87380, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d99efd7dfb6c..257b61789eeb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,7 +85,7 @@ int sysctl_tcp_ecn __read_mostly = 2; EXPORT_SYMBOL(sysctl_tcp_ecn); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; -int sysctl_tcp_adv_win_scale __read_mostly = 2; +int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); int sysctl_tcp_stdurg __read_mostly; -- cgit v1.2.3