diff options
Diffstat (limited to 'net/ipv4')
36 files changed, 454 insertions, 267 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f31c09873d0f..d0ef51d49f8a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1883,6 +1883,10 @@ static int __init inet_init(void) ip_init(); + /* Initialise per-cpu ipv4 mibs */ + if (init_ipv4_mibs()) + panic("%s: Cannot init ipv4 mibs\n", __func__); + /* Setup TCP slab cache for open requests. */ tcp_init(); @@ -1911,12 +1915,6 @@ static int __init inet_init(void) if (init_inet_pernet_ops()) pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); - /* - * Initialise per-cpu ipv4 mibs - */ - - if (init_ipv4_mibs()) - pr_crit("%s: Cannot init ipv4 mibs\n", __func__); ipv4_proc_init(); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 5535b722f66d..e8b8dd1cb157 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -486,6 +486,7 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); + kfree(doi_def->map.std); break; } kfree(doi_def); @@ -533,16 +534,10 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) ret_val = -ENOENT; goto doi_remove_return; } - if (!refcount_dec_and_test(&doi_def->refcount)) { - spin_unlock(&cipso_v4_doi_list_lock); - ret_val = -EBUSY; - goto doi_remove_return; - } list_del_rcu(&doi_def->list); spin_unlock(&cipso_v4_doi_list_lock); - cipso_v4_cache_invalidate(); - call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); + cipso_v4_doi_putdef(doi_def); ret_val = 0; doi_remove_return: @@ -599,9 +594,6 @@ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) if (!refcount_dec_and_test(&doi_def->refcount)) return; - spin_lock(&cipso_v4_doi_list_lock); - list_del_rcu(&doi_def->list); - spin_unlock(&cipso_v4_doi_list_lock); cipso_v4_cache_invalidate(); call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 096a28f9720d..d9bb3ae78560 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -262,6 +262,7 @@ static struct in_device *inetdev_init(struct net_device *dev) err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; + neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; @@ -2323,7 +2324,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, free: kfree(t); out: - return -ENOBUFS; + return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index c8e32f167ebb..9e664b6cef13 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -252,7 +252,6 @@ static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struc int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; - u8 *vaddr; int nfrags; int esph_offset; struct page *page; @@ -294,14 +293,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * page = pfrag->page; get_page(page); - vaddr = kmap_atomic(page); - - tail = vaddr + pfrag->offset; + tail = page_address(page) + pfrag->offset; esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); - kunmap_atomic(vaddr); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ff499000f6cd..ee467d744b07 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -292,7 +292,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index eff703cb13b6..bc233fdfae0f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -839,7 +839,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - if (cfg->fc_table) + if (cfg->fc_table && cfg->fc_table != RT_TABLE_MAIN) tbl = fib_get_table(net, cfg->fc_table); if (tbl) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3f9509679f0e..0c8fcc050ad2 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1729,7 +1729,7 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) while ((l = leaf_walk_rcu(&tp, key)) != NULL) { struct key_vector *local_l = NULL, *local_tp; - hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + hlist_for_each_entry(fa, &l->leaf, fa_list) { struct fib_alias *new_fa; if (local_tb->tb_id != fa->tb_id) diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 6a7d980105f6..095c30863745 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -19,12 +19,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + bool need_csum, need_recompute_csum, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, gso_partial; if (!skb->encapsulation) goto out; @@ -45,6 +45,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, skb->protocol = skb->inner_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); + need_recompute_csum = skb->csum_not_inet; skb->encap_hdr_csum = need_csum; features &= skb->dev->hw_enc_features; @@ -102,7 +103,15 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } *(pcsum + 1) = 0; - *pcsum = gso_make_checksum(skb, 0); + if (need_recompute_csum && !skb_is_gso(skb)) { + __wsum csum; + + csum = skb_checksum(skb, gre_offset, + skb->len - gre_offset, 0); + *pcsum = csum_fold(csum); + } else { + *pcsum = gso_make_checksum(skb, 0); + } } while ((skb = skb->next)); out: return segs; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 995ef3d23368..dc99b40da48d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -244,7 +244,7 @@ static struct { /** * icmp_global_allow - Are we allowed to send one more ICMP message ? * - * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. + * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. * Note: called with BH disabled */ @@ -272,7 +272,10 @@ bool icmp_global_allow(void) } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { - credit--; + /* We want to use a credit of one in average, but need to randomize + * it for security reasons. + */ + credit = max_t(int, credit - prandom_u32_max(3), 0); rc = true; } WRITE_ONCE(icmp_global.credit, credit); @@ -465,6 +468,23 @@ out_bh_enable: local_bh_enable(); } +/* + * The device used for looking up which routing table to use for sending an ICMP + * error is preferably the source whenever it is set, which should ensure the + * icmp error can be sent to the source host, else lookup using the routing + * table of the destination device, else use the main routing table (index 0). + */ +static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) +{ + struct net_device *route_lookup_dev = NULL; + + if (skb->dev) + route_lookup_dev = skb->dev; + else if (skb_dst(skb)) + route_lookup_dev = skb_dst(skb)->dev; + return route_lookup_dev; +} + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -473,6 +493,7 @@ static struct rtable *icmp_route_lookup(struct net *net, int type, int code, struct icmp_bxm *param) { + struct net_device *route_lookup_dev; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -487,7 +508,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); + route_lookup_dev = icmp_get_route_lookup_dev(skb_in); + fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); @@ -511,7 +533,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev, + if (inet_addr_type_dev_table(net, route_lookup_dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) @@ -740,6 +762,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); + /* if we don't have a source address at this point, fall back to the + * dummy address instead of sending out a packet with a source address + * of 0.0.0.0 + */ + if (!fl4.saddr) + fl4.saddr = htonl(INADDR_DUMMY); + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); @@ -751,6 +780,40 @@ out:; } EXPORT_SYMBOL(__icmp_send); +#if IS_ENABLED(CONFIG_NF_NAT) +#include <net/netfilter/nf_conntrack.h> +void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + struct sk_buff *cloned_skb = NULL; + struct ip_options opts = { 0 }; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + __be32 orig_ip; + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { + __icmp_send(skb_in, type, code, info, &opts); + return; + } + + if (skb_shared(skb_in)) + skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); + + if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || + (skb_network_header(skb_in) + sizeof(struct iphdr)) > + skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, + skb_network_offset(skb_in) + sizeof(struct iphdr)))) + goto out; + + orig_ip = ip_hdr(skb_in)->saddr; + ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; + __icmp_send(skb_in, type, code, info, &opts); + ip_hdr(skb_in)->saddr = orig_ip; +out: + consume_skb(cloned_skb); +} +EXPORT_SYMBOL(icmp_ndo_send); +#endif static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b6f0ee01f2e0..eac0013809c9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1790,6 +1790,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev) while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; + ip_mc_clear_src(i); ip_ma_put(i); } } @@ -2694,6 +2695,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u rv = 1; } else if (im) { if (src_addr) { + spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; @@ -2704,6 +2706,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; + spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7826fba34b14..08ba0f91f2ab 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -276,51 +276,12 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, ipv6_only_sock(sk), true); } -/* Obtain a reference to a local port for the given sock, - * if snum is zero it means select any available local port. - * We try to allocate an odd port (and leave even ports for connect()) - */ -int inet_csk_get_port(struct sock *sk, unsigned short snum) +void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, + struct sock *sk) { - bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; - struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - int ret = 1, port = snum; - struct inet_bind_hashbucket *head; - struct net *net = sock_net(sk); - struct inet_bind_bucket *tb = NULL; kuid_t uid = sock_i_uid(sk); + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; - if (!port) { - head = inet_csk_find_open_port(sk, &tb, &port); - if (!head) - return ret; - if (!tb) - goto tb_not_found; - goto success; - } - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock_bh(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) - goto tb_found; -tb_not_found: - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); - if (!tb) - goto fail_unlock; -tb_found: - if (!hlist_empty(&tb->owners)) { - if (sk->sk_reuse == SK_FORCE_REUSE) - goto success; - - if ((tb->fastreuse > 0 && reuse) || - sk_reuseport_match(tb, sk)) - goto success; - if (inet_csk_bind_conflict(sk, tb, true, true)) - goto fail_unlock; - } -success: if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { @@ -364,6 +325,54 @@ success: tb->fastreuseport = 0; } } +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. + * We try to allocate an odd port (and leave even ports for connect()) + */ +int inet_csk_get_port(struct sock *sk, unsigned short snum) +{ + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + int ret = 1, port = snum; + struct inet_bind_hashbucket *head; + struct net *net = sock_net(sk); + struct inet_bind_bucket *tb = NULL; + + if (!port) { + head = inet_csk_find_open_port(sk, &tb, &port); + if (!head) + return ret; + if (!tb) + goto tb_not_found; + goto success; + } + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) + goto tb_found; +tb_not_found: + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) + goto fail_unlock; +tb_found: + if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse == SK_FORCE_REUSE) + goto success; + + if ((tb->fastreuse > 0 && reuse) || + sk_reuseport_match(tb, sk)) + goto success; + if (inet_csk_bind_conflict(sk, tb, true, true)) + goto fail_unlock; + } +success: + inet_csk_update_fastreuse(tb, sk); + if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 7ba013d6c00a..5fac8d776391 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -393,8 +393,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_inode = 0; if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - inet_rsk(reqsk)->ir_mark)) + inet_rsk(reqsk)->ir_mark)) { + nlmsg_cancel(skb, nlh); return -EMSGSIZE; + } nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0af13f5bdc9a..8a54babf5c90 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -160,6 +160,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) return -ENOMEM; } } + inet_csk_update_fastreuse(tb, child); } inet_bind_hash(child, tb, port); spin_unlock(&head->lock); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9940a59306b5..1a4d89f8361c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -443,6 +443,8 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static int gre_handle_offloads(struct sk_buff *skb, bool csum) { + if (csum && skb_checksum_start(skb) < skb->data) + return -EINVAL; return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 73cd64c7692f..c9f82525bfa4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -73,6 +73,7 @@ #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> +#include <net/inet_ecn.h> #include <net/lwtunnel.h> #include <linux/bpf-cgroup.h> #include <linux/igmp.h> @@ -311,7 +312,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); - if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) + if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); return ip_finish_output2(net, sk, skb); @@ -418,8 +419,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); - memcpy(&iph->saddr, &fl4->saddr, - sizeof(fl4->saddr) + sizeof(fl4->daddr)); + + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; } /* Note: skb->sk can be different from sk, in case of tunnels */ @@ -1562,7 +1564,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (IS_ERR(rt)) return; - inet_sk(sk)->tos = arg->tos; + inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index f6793017a20d..e9cf0d185459 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -98,9 +98,10 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, __be32 remote, __be32 local, __be32 key) { - unsigned int hash; struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; + struct net_device *ndev; + unsigned int hash; hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; @@ -175,8 +176,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (t && t->dev->flags & IFF_UP) return t; - if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) - return netdev_priv(itn->fb_tunnel_dev); + ndev = READ_ONCE(itn->fb_tunnel_dev); + if (ndev && ndev->flags & IFF_UP) + return netdev_priv(ndev); return NULL; } @@ -750,7 +752,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) { + df = tnl_params->frag_off; + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) + df |= (inner_iph->frag_off & htons(IP_DF)); + + if (tnl_update_pmtu(dev, skb, rt, df, inner_iph)) { ip_rt_put(rt); goto tx_error; } @@ -778,10 +784,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ttl = ip4_dst_hoplimit(&rt->dst); } - df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) - df |= (inner_iph->frag_off&htons(IP_DF)); - max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (max_headroom > dev->needed_headroom) @@ -1211,9 +1213,9 @@ void ip_tunnel_uninit(struct net_device *dev) struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); - /* fb_tunnel_dev will be unregisted in net-exit call. */ - if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(itn, netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); + if (itn->fb_tunnel_dev == dev) + WRITE_ONCE(itn->fb_tunnel_dev, NULL); dst_cache_reset(&tunnel->dst_cache); } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index c1693d75e196..33a85269a9f2 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -50,7 +50,7 @@ static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) + int encap_type, bool update_skb_dev) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); @@ -65,6 +65,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + if (update_skb_dev) + skb->dev = tunnel->dev; + return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -74,25 +77,43 @@ drop: return 0; } -static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) +static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { - struct ip_tunnel *tunnel; + return vti_input(skb, nexthdr, spi, encap_type, false); +} + +static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev); +} + +static int vti_rcv_proto(struct sk_buff *skb) +{ + return vti_rcv(skb, 0, false); +} + +static int vti_rcv_tunnel(struct sk_buff *skb) +{ + struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id); const struct iphdr *iph = ip_hdr(skb); - struct net *net = dev_net(skb->dev); - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + struct ip_tunnel *tunnel; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel) { + struct tnl_ptk_info tpi = { + .proto = htons(ETH_P_IP), + }; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; - - skb->dev = tunnel->dev; - - return xfrm_input(skb, nexthdr, spi, encap_type); + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) + goto drop; + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, false); } return -EINVAL; @@ -101,22 +122,6 @@ drop: return 0; } -static int vti_rcv(struct sk_buff *skb) -{ - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - - return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); -} - -static int vti_rcv_ipip(struct sk_buff *skb) -{ - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - - return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0); -} - static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -482,31 +487,31 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev) } static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm_tunnel ipip_handler __read_mostly = { - .handler = vti_rcv_ipip, + .handler = vti_rcv_tunnel, .err_handler = vti4_err, .priority = 0, }; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f0782c91514c..41e384834d50 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -881,7 +881,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d /* - * Copy BOOTP-supplied string if not already set. + * Copy BOOTP-supplied string */ static int __init ic_bootp_string(char *dest, char *src, int len, int max) { @@ -930,12 +930,15 @@ static void __init ic_do_bootp_ext(u8 *ext) } break; case 12: /* Host name */ - ic_bootp_string(utsname()->nodename, ext+1, *ext, - __NEW_UTS_LEN); - ic_host_name_set = 1; + if (!ic_host_name_set) { + ic_bootp_string(utsname()->nodename, ext+1, *ext, + __NEW_UTS_LEN); + ic_host_name_set = 1; + } break; case 15: /* Domain name (DNS) */ - ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); + if (!ic_domain[0]) + ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); break; case 17: /* Root path */ if (!root_server_path[0]) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index cdd627355ed1..df610245d21a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -702,7 +702,7 @@ out: rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) - xfrm4_tunnel_deregister(&mplsip_handler, AF_INET); + xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mplsip_failed: #endif diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 6dd727e0a72f..13a46066546d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1196,6 +1196,8 @@ static int translate_compat_table(struct net *net, if (!newinfo) goto out_unlock; + memset(newinfo->entries, 0, size); + newinfo->number = compatr->num_entries; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 0d1a2cda1bfb..a4039ac65b54 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1432,6 +1432,8 @@ translate_compat_table(struct net *net, if (!newinfo) goto out_unlock; + memset(newinfo->entries, 0, size); + newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 341d1bd637af..6b8908e1e0ba 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -94,7 +94,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_oif = 0; flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = RT_TOS(iph->tos); + flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 8a69363b4884..c66103de86bd 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -165,8 +165,7 @@ pptp_outbound_pkt(struct sk_buff *skb, break; default: pr_debug("unknown outbound packet 0x%04x:%s\n", msg, - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : - pptp_msg_name[0]); + pptp_msg_name(msg)); /* fall through */ case PPTP_SET_LINK_INFO: /* only need to NAT in case PAC is behind NAT box */ @@ -267,9 +266,7 @@ pptp_inbound_pkt(struct sk_buff *skb, pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID); break; default: - pr_debug("unknown inbound packet %s\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : - pptp_msg_name[0]); + pr_debug("unknown inbound packet %s\n", pptp_msg_name(msg)); /* fall through */ case PPTP_START_SESSION_REQUEST: case PPTP_START_SESSION_REPLY: diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 16226d49263d..aab141c4a389 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -801,6 +801,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); + fl4.fl4_icmp_type = user_icmph.type; + fl4.fl4_icmp_code = user_icmph.code; + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { @@ -975,6 +978,7 @@ bool ping_rcv(struct sk_buff *skb) struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); + bool rc = false; /* We assume the packet has already been checked by icmp_rcv */ @@ -989,14 +993,15 @@ bool ping_rcv(struct sk_buff *skb) struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); pr_debug("rcv on socket %p\n", sk); - if (skb2) - ping_queue_rcv_skb(sk, skb2); + if (skb2 && !ping_queue_rcv_skb(sk, skb2)) + rc = true; sock_put(sk); - return true; } - pr_debug("no socket, dropping\n"); - return false; + if (!rc) + pr_debug("no socket, dropping\n"); + + return rc; } EXPORT_SYMBOL_GPL(ping_rcv); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 967acf2844ef..34cf572cc5dc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -70,6 +70,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -276,6 +277,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } + (*pos)++; return NULL; } @@ -484,8 +486,10 @@ static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } -#define IP_IDENTS_SZ 2048u - +/* Hash tables of size 2048..262144 depending on RAM size. + * Each bucket uses 8 bytes. + */ +static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; @@ -495,22 +499,24 @@ static u32 *ip_tstamps __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; - atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(*p_tstamp); - u32 now = (u32)jiffies; - u32 new, delta = 0; + u32 bucket, old, now = (u32)jiffies; + atomic_t *p_id; + u32 *p_tstamp; + u32 delta = 0; + + bucket = hash & ip_idents_mask; + p_tstamp = ip_tstamps + bucket; + p_id = ip_idents + bucket; + old = ACCESS_ONCE(*p_tstamp); if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - /* Do not use atomic_add_return() as it makes UBSAN unhappy */ - do { - old = (u32)atomic_read(p_id); - new = old + delta + segs; - } while (atomic_cmpxchg(p_id, old, new) != old); - - return new - segs; + /* If UBSAN reports an error there, please make sure your compiler + * supports -fno-strict-overflow before reporting it that was a bug + * in UBSAN, and it has been fixed in GCC-8. + */ + return atomic_add_return(segs + delta, p_id) - segs; } EXPORT_SYMBOL(ip_idents_reserve); @@ -613,28 +619,35 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) } } -static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) +static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { - struct fib_nh_exception *fnhe, *oldest; + struct fib_nh_exception __rcu **fnhe_p, **oldest_p; + struct fib_nh_exception *fnhe, *oldest = NULL; - oldest = rcu_dereference(hash->chain); - for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; - fnhe = rcu_dereference(fnhe->fnhe_next)) { - if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { + fnhe = rcu_dereference_protected(*fnhe_p, + lockdep_is_held(&fnhe_lock)); + if (!fnhe) + break; + if (!oldest || + time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; + oldest_p = fnhe_p; + } } fnhe_flush_routes(oldest); - return oldest; + *oldest_p = oldest->fnhe_next; + kfree_rcu(oldest, rcu); } -static inline u32 fnhe_hashfun(__be32 daddr) +static u32 fnhe_hashfun(__be32 daddr) { - static u32 fnhe_hashrnd __read_mostly; - u32 hval; + static siphash_key_t fnhe_hash_key __read_mostly; + u64 hval; - net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); - hval = jhash_1word((__force u32) daddr, fnhe_hashrnd); - return hash_32(hval, FNHE_HASH_SHIFT); + net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key)); + hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key); + return hash_64(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) @@ -701,16 +714,21 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, if (rt) fill_route_from_fnhe(rt, fnhe); } else { - if (depth > FNHE_RECLAIM_DEPTH) - fnhe = fnhe_oldest(hash); - else { - fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); - if (!fnhe) - goto out_unlock; - - fnhe->fnhe_next = hash->chain; - rcu_assign_pointer(hash->chain, fnhe); + /* Randomize max depth to avoid some side channels attacks. */ + int max_depth = FNHE_RECLAIM_DEPTH + + prandom_u32_max(FNHE_RECLAIM_DEPTH); + + while (depth > max_depth) { + fnhe_remove_oldest(hash); + depth--; } + + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; @@ -718,6 +736,8 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); + rcu_assign_pointer(hash->chain, fnhe); + /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. @@ -796,6 +816,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); + fib_select_path(net, &res, fl4, skb); + nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); @@ -1012,6 +1034,7 @@ out: kfree_skb(skb); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; + struct net *net = dev_net(dst->dev); u32 old_mtu = ipv4_mtu(dst); struct fib_result res; bool lock = false; @@ -1032,9 +1055,11 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { - struct fib_nh *nh = &FIB_RES_NH(res); + if (fib_lookup(net, fl4, &res, 0) == 0) { + struct fib_nh *nh; + fib_select_path(net, &res, fl4, NULL); + nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock, jiffies + ip_rt_mtu_expires); } @@ -1433,6 +1458,24 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, return ret; } +struct uncached_list { + spinlock_t lock; + struct list_head head; +}; + +static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); + +static void rt_add_uncached_list(struct rtable *rt) +{ + struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); + + rt->rt_uncached_list = ul; + + spin_lock_bh(&ul->lock); + list_add_tail(&rt->rt_uncached, &ul->head); + spin_unlock_bh(&ul->lock); +} + static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) { struct rtable *orig, *prev, **p; @@ -1452,7 +1495,7 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { - dst_dev_put(&orig->dst); + rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { @@ -1463,24 +1506,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) return ret; } -struct uncached_list { - spinlock_t lock; - struct list_head head; -}; - -static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); - -static void rt_add_uncached_list(struct rtable *rt) -{ - struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); - - rt->rt_uncached_list = ul; - - spin_lock_bh(&ul->lock); - list_add_tail(&rt->rt_uncached, &ul->head); - spin_unlock_bh(&ul->lock); -} - static void ipv4_dst_destroy(struct dst_entry *dst) { struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); @@ -2507,8 +2532,6 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, fib_select_path(net, res, fl4, skb); dev_out = FIB_RES_DEV(*res); - fl4->flowi4_oif = dev_out->ifindex; - make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); @@ -2601,10 +2624,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (IS_ERR(rt)) return rt; - if (flp4->flowi4_proto) + if (flp4->flowi4_proto) { + flp4->flowi4_oif = rt->dst.dev->ifindex; rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0); + } return rt; } @@ -2784,7 +2809,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = rtm->rtm_tos; + fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; @@ -2803,8 +2828,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb->protocol = htons(ETH_P_IP); skb->dev = dev; skb->mark = mark; - err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, - dev, &res); + err = ip_route_input_rcu(skb, dst, src, + rtm->rtm_tos & IPTOS_RT_MASK, dev, + &res); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) @@ -3093,18 +3119,26 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { + void *idents_hash; int rc = 0; int cpu; - ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); - if (!ip_idents) - panic("IP: failed to allocate ip_idents\n"); + /* For modern hosts, this will use 2 MB of memory */ + idents_hash = alloc_large_system_hash("IP idents", + sizeof(*ip_idents) + sizeof(*ip_tstamps), + 0, + 16, /* one bucket per 64 KB */ + HASH_ZERO, + NULL, + &ip_idents_mask, + 2048, + 256*1024); + + ip_idents = idents_hash; - prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); - ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); - if (!ip_tstamps) - panic("IP: failed to allocate ip_tstamps\n"); + ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 2f871424925e..bf8df824bae3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -296,7 +296,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct rtable *rt; __u8 rcv_wscale; struct flowi4 fl4; @@ -389,8 +389,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6fbfdd5e96de..c9f6f28e54f3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2366,6 +2366,9 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; tp->delivered = 0; + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); @@ -2759,10 +2762,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: - if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) - err = tp->af_specific->md5_parse(sk, optname, optval, optlen); - else - err = -EINVAL; + err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_USER_TIMEOUT: @@ -3394,10 +3394,13 @@ EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { + u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; - sg_init_one(&sg, key->key, key->keylen); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen); + sg_init_one(&sg, key->key, keylen); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); + + /* tcp_md5_do_add() might change key->key under us */ return crypto_ahash_update(hp->md5_req); } EXPORT_SYMBOL(tcp_md5_hash_key); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 434ad1e72447..76a9652d90f2 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -769,7 +769,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && - (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + (rs->rtt_us < bbr->min_rtt_us || filter_expired)) { bbr->min_rtt_us = rs->rtt_us; bbr->min_rtt_stamp = tcp_jiffies32; } @@ -840,7 +840,7 @@ static void bbr_init(struct sock *sk) bbr->prior_cwnd = 0; bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ bbr->rtt_cnt = 0; - bbr->next_rtt_delivered = 0; + bbr->next_rtt_delivered = tp->delivered; bbr->prev_ca_state = TCP_CA_Open; bbr->packet_conservation = 0; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 494e3c3a21a1..b193dcebbf7e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -199,7 +199,12 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); - if (sk->sk_state != TCP_CLOSE) + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); + + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 78bfadfcf342..93530bd33247 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -340,8 +340,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; if (tcp_in_slow_start(tp)) { - if (hystart && after(ack, ca->end_seq)) - bictcp_hystart_reset(sk); acked = tcp_slow_start(tp, acked); if (!acked) return; @@ -383,6 +381,9 @@ static void hystart_update(struct sock *sk, u32 delay) if (ca->found & hystart_detect) return; + if (after(tp->snd_una, ca->end_seq)) + bictcp_hystart_reset(sk); + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock(); @@ -403,6 +404,8 @@ static void hystart_update(struct sock *sk, u32 delay) if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ + if (ca->curr_rtt > delay) + ca->curr_rtt = delay; if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { if (ca->curr_rtt == 0 || ca->curr_rtt > delay) ca->curr_rtt = delay; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 29f3df4ddd1f..9382caeb721a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1225,7 +1225,7 @@ static u8 tcp_sacktag_one(struct sock *sk, if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) - tp->undo_retrans--; + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount); if (sacked & TCPCB_SACKED_ACKED) state->reord = min(fack_count, state->reord); } @@ -2803,7 +2803,8 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; - tcp_rack_mark_lost(sk); + if (tcp_rack_mark_lost(sk)) + *ack_flag &= ~FLAG_SET_XMIT_TIMER; if (prior_retrans > tp->retrans_out) *ack_flag |= FLAG_LOST_RETRANS; } @@ -3516,10 +3517,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) } } -/* This routine deals with acks during a TLP episode. - * We mark the end of a TLP episode on receiving TLP dupack or when - * ack is after tlp_high_seq. - * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. +/* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { @@ -3528,7 +3527,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) if (before(ack, tp->tlp_high_seq)) return; - if (flag & FLAG_DSACKING_ACK) { + if (!tp->tlp_retrans) { + /* TLP of new data has been acknowledged */ + tp->tlp_high_seq = 0; + } else if (flag & FLAG_DSACKING_ACK) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { @@ -3687,15 +3689,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - /* If needed, reset TLP/RTO timer; RACK may later override this. */ - if (flag & FLAG_SET_XMIT_TIMER) - tcp_set_xmit_timer(sk); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } + /* If needed, reset TLP/RTO timer when RACK doesn't set. */ + if (flag & FLAG_SET_XMIT_TIMER) + tcp_set_xmit_timer(sk); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); @@ -4507,7 +4510,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; @@ -4591,7 +4598,11 @@ add_sack: tcp_sack_new_ofo_skb(sk, seq, end_seq); end: if (skb) { - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb); skb_condense(skb); skb_set_owner_r(skb, sk); } @@ -5530,6 +5541,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; + } else { + tcp_update_wl(tp, TCP_SKB_CB(skb)->seq); } __tcp_ack_snd_check(sk, 0); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b4f0fc34b0ed..744479c8b91f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -285,7 +285,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; - mtu = tcp_sk(sk)->mtu_info; + mtu = READ_ONCE(tcp_sk(sk)->mtu_info); dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -453,7 +453,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (sk->sk_state == TCP_LISTEN) goto out; - tp->mtu_info = info; + WRITE_ONCE(tp->mtu_info, info); if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); } else { @@ -995,9 +995,18 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); if (key) { - /* Pre-existing entry - just update that one. */ + /* Pre-existing entry - just update that one. + * Note that the key might be used concurrently. + */ memcpy(key->key, newkey, newkeylen); - key->keylen = newkeylen; + + /* Pairs with READ_ONCE() in tcp_md5_hash_key(). + * Also note that a reader could catch new key->keylen value + * but old key->key[], this is the reason we use __GFP_ZERO + * at sock_kmalloc() time below these lines. + */ + WRITE_ONCE(key->keylen, newkeylen); + return 0; } @@ -1013,7 +1022,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, rcu_assign_pointer(tp->md5sig_info, md5sig); } - key = sock_kmalloc(sk, sizeof(*key), gfp); + key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); if (!key) return -ENOMEM; if (!tcp_alloc_md5sig_pool()) { @@ -2078,6 +2087,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) static void *tcp_seek_last_pos(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; + int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; @@ -2088,7 +2098,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) break; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_next(seq, NULL); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; @@ -2099,7 +2109,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e1eb56e21dd5..99ab936a9cd3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -616,7 +616,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -631,7 +632,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, * rather than TS in order to fit in better with old, * buggy kernels, but that was deemed to be unnecessary. */ - ireq->tstamp_ok &= !ireq->sack_ok; + if (synack_type != TCP_SYNACK_COOKIE) + ireq->tstamp_ok &= !ireq->sack_ok; } #endif @@ -1468,6 +1470,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } +EXPORT_SYMBOL(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) @@ -1618,7 +1621,8 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * window, and remember whether we were cwnd-limited then. */ if (!before(tp->snd_una, tp->max_packets_seq) || - tp->packets_out > tp->max_packets_out) { + tp->packets_out > tp->max_packets_out || + is_cwnd_limited) { tp->max_packets_out = tp->packets_out; tp->max_packets_seq = tp->snd_nxt; tp->is_cwnd_limited = is_cwnd_limited; @@ -2409,6 +2413,10 @@ repair: else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); + if (likely(sent_pkts || is_cwnd_limited)) + tcp_cwnd_validate(sk, is_cwnd_limited); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2416,8 +2424,6 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk, false); - is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); - tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return !tp->packets_out && tcp_send_head(sk); @@ -2498,6 +2504,11 @@ void tcp_send_loss_probe(struct sock *sk) int pcount; int mss = tcp_current_mss(sk); + /* At most one outstanding TLP */ + if (tp->tlp_high_seq) + goto rearm_timer; + + tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb) { if (tcp_snd_wnd_test(tp, skb, mss)) { @@ -2520,10 +2531,6 @@ void tcp_send_loss_probe(struct sock *sk) return; } - /* At most one outstanding TLP retransmission. */ - if (tp->tlp_high_seq) - goto rearm_timer; - if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; @@ -2544,10 +2551,12 @@ void tcp_send_loss_probe(struct sock *sk) if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; + tp->tlp_retrans = 1; + +probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; -probe_sent: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; @@ -3252,8 +3261,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); - tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + - sizeof(*th); + tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, + foc, synack_type) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index be8ef1e5dfef..0409cd2c4918 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -102,13 +102,13 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) } } -void tcp_rack_mark_lost(struct sock *sk) +bool tcp_rack_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout; if (!tp->rack.advanced) - return; + return false; /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; @@ -118,6 +118,7 @@ void tcp_rack_mark_lost(struct sock *sk) inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } + return !!timeout; } /* Record the most recently (re)sent time among the (s)acked packets diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e33258d69246..fee1cdcc224e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -882,7 +882,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be16 dport; u8 tos; int err, is_udplite = IS_UDPLITE(sk); - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; @@ -1165,7 +1165,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, } up->len += size; - if (!(up->corkflag || (flags&MSG_MORE))) + if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE))) ret = udp_push_pending_frames(sk); if (!ret) ret = size; @@ -1894,7 +1894,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets */ - if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { /* * MIB statistics other than incrementing the error count are @@ -2321,7 +2321,8 @@ int udp_v4_early_demux(struct sk_buff *skb) */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, - iph->saddr, iph->tos, + iph->saddr, + iph->tos & IPTOS_RT_MASK, skb->dev, in_dev, &itag); } return 0; @@ -2336,6 +2337,9 @@ void udp_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); + + /* protects from races with udp_abort() */ + sock_set_flag(sk, SOCK_DEAD); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); if (static_key_false(&udp_encap_needed) && up->encap_type) { @@ -2369,9 +2373,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: if (val != 0) { - up->corkflag = 1; + WRITE_ONCE(up->corkflag, 1); } else { - up->corkflag = 0; + WRITE_ONCE(up->corkflag, 0); lock_sock(sk); push_pending_frames(sk); release_sock(sk); @@ -2478,7 +2482,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: - val = up->corkflag; + val = READ_ONCE(up->corkflag); break; case UDP_ENCAP: @@ -2569,10 +2573,17 @@ int udp_abort(struct sock *sk, int err) { lock_sock(sk); + /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing + * with close() + */ + if (sock_flag(sk, SOCK_DEAD)) + goto out; + sk->sk_err = err; sk->sk_error_report(sk); __udp_disconnect(sk, 0); +out: release_sock(sk); return 0; @@ -2766,7 +2777,7 @@ int udp4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_puts(seq, " sl local_address rem_address st tx_queue " + seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index cde2719fcb89..353c83b1b0db 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -254,7 +254,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, struct sock *sk; if (NAPI_GRO_CB(skb)->encap_mark || - (skb->ip_summed != CHECKSUM_PARTIAL && + (uh->check && skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid)) goto out; |