Diffstat (limited to 'net/ipv4')
-rw-r--r--   net/ipv4/af_inet.c                     10
-rw-r--r--   net/ipv4/cipso_ipv4.c                  12
-rw-r--r--   net/ipv4/devinet.c                      3
-rw-r--r--   net/ipv4/esp4.c                         7
-rw-r--r--   net/ipv4/fib_frontend.c                 2
-rw-r--r--   net/ipv4/fib_semantics.c                2
-rw-r--r--   net/ipv4/fib_trie.c                     2
-rw-r--r--   net/ipv4/gre_offload.c                 13
-rw-r--r--   net/ipv4/icmp.c                        71
-rw-r--r--   net/ipv4/igmp.c                         3
-rw-r--r--   net/ipv4/inet_connection_sock.c        93
-rw-r--r--   net/ipv4/inet_diag.c                    4
-rw-r--r--   net/ipv4/inet_hashtables.c              1
-rw-r--r--   net/ipv4/ip_gre.c                       2
-rw-r--r--   net/ipv4/ip_output.c                   10
-rw-r--r--   net/ipv4/ip_tunnel.c                   24
-rw-r--r--   net/ipv4/ip_vti.c                      75
-rw-r--r--   net/ipv4/ipconfig.c                    13
-rw-r--r--   net/ipv4/ipip.c                         2
-rw-r--r--   net/ipv4/netfilter/arp_tables.c         2
-rw-r--r--   net/ipv4/netfilter/ip_tables.c          2
-rw-r--r--   net/ipv4/netfilter/ipt_rpfilter.c       2
-rw-r--r--   net/ipv4/netfilter/nf_nat_pptp.c        7
-rw-r--r--   net/ipv4/ping.c                        15
-rw-r--r--   net/ipv4/route.c                      174
-rw-r--r--   net/ipv4/syncookies.c                   9
-rw-r--r--   net/ipv4/tcp.c                         15
-rw-r--r--   net/ipv4/tcp_bbr.c                      4
-rw-r--r--   net/ipv4/tcp_cong.c                     7
-rw-r--r--   net/ipv4/tcp_cubic.c                    7
-rw-r--r--   net/ipv4/tcp_input.c                   37
-rw-r--r--   net/ipv4/tcp_ipv4.c                    24
-rw-r--r--   net/ipv4/tcp_output.c                  33
-rw-r--r--   net/ipv4/tcp_recovery.c                 5
-rw-r--r--   net/ipv4/udp.c                         27
-rw-r--r--   net/ipv4/udp_offload.c                  2
36 files changed, 454 insertions, 267 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f31c09873d0f..d0ef51d49f8a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1883,6 +1883,10 @@ static int __init inet_init(void)
ip_init();
+ /* Initialise per-cpu ipv4 mibs */
+ if (init_ipv4_mibs())
+ panic("%s: Cannot init ipv4 mibs\n", __func__);
+
/* Setup TCP slab cache for open requests. */
tcp_init();
@@ -1911,12 +1915,6 @@ static int __init inet_init(void)
if (init_inet_pernet_ops())
pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
- /*
- * Initialise per-cpu ipv4 mibs
- */
-
- if (init_ipv4_mibs())
- pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 5535b722f66d..e8b8dd1cb157 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -486,6 +486,7 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
kfree(doi_def->map.std->lvl.local);
kfree(doi_def->map.std->cat.cipso);
kfree(doi_def->map.std->cat.local);
+ kfree(doi_def->map.std);
break;
}
kfree(doi_def);
@@ -533,16 +534,10 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
ret_val = -ENOENT;
goto doi_remove_return;
}
- if (!refcount_dec_and_test(&doi_def->refcount)) {
- spin_unlock(&cipso_v4_doi_list_lock);
- ret_val = -EBUSY;
- goto doi_remove_return;
- }
list_del_rcu(&doi_def->list);
spin_unlock(&cipso_v4_doi_list_lock);
- cipso_v4_cache_invalidate();
- call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+ cipso_v4_doi_putdef(doi_def);
ret_val = 0;
doi_remove_return:
@@ -599,9 +594,6 @@ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
if (!refcount_dec_and_test(&doi_def->refcount))
return;
- spin_lock(&cipso_v4_doi_list_lock);
- list_del_rcu(&doi_def->list);
- spin_unlock(&cipso_v4_doi_list_lock);
cipso_v4_cache_invalidate();
call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 096a28f9720d..d9bb3ae78560 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -262,6 +262,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
err = devinet_sysctl_register(in_dev);
if (err) {
in_dev->dead = 1;
+ neigh_parms_release(&arp_tbl, in_dev->arp_parms);
in_dev_put(in_dev);
in_dev = NULL;
goto out;
@@ -2323,7 +2324,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
free:
kfree(t);
out:
- return -ENOBUFS;
+ return -ENOMEM;
}
static void __devinet_sysctl_unregister(struct net *net,
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index c8e32f167ebb..9e664b6cef13 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -252,7 +252,6 @@ static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struc
int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
u8 *tail;
- u8 *vaddr;
int nfrags;
int esph_offset;
struct page *page;
@@ -294,14 +293,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
page = pfrag->page;
get_page(page);
- vaddr = kmap_atomic(page);
-
- tail = vaddr + pfrag->offset;
+ tail = page_address(page) + pfrag->offset;
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
- kunmap_atomic(vaddr);
-
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ff499000f6cd..ee467d744b07 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -292,7 +292,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
.flowi4_iif = LOOPBACK_IFINDEX,
.flowi4_oif = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
- .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
+ .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
.flowi4_scope = scope,
.flowi4_mark = vmark ? skb->mark : 0,
};
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index eff703cb13b6..bc233fdfae0f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -839,7 +839,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
if (fl4.flowi4_scope < RT_SCOPE_LINK)
fl4.flowi4_scope = RT_SCOPE_LINK;
- if (cfg->fc_table)
+ if (cfg->fc_table && cfg->fc_table != RT_TABLE_MAIN)
tbl = fib_get_table(net, cfg->fc_table);
if (tbl)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3f9509679f0e..0c8fcc050ad2 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1729,7 +1729,7 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
struct key_vector *local_l = NULL, *local_tp;
- hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ hlist_for_each_entry(fa, &l->leaf, fa_list) {
struct fib_alias *new_fa;
if (local_tb->tb_id != fa->tb_id)
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6a7d980105f6..095c30863745 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -19,12 +19,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ bool need_csum, need_recompute_csum, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
u16 mac_offset = skb->mac_header;
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
- bool need_csum, gso_partial;
if (!skb->encapsulation)
goto out;
@@ -45,6 +45,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
skb->protocol = skb->inner_protocol;
need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
+ need_recompute_csum = skb->csum_not_inet;
skb->encap_hdr_csum = need_csum;
features &= skb->dev->hw_enc_features;
@@ -102,7 +103,15 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
}
*(pcsum + 1) = 0;
- *pcsum = gso_make_checksum(skb, 0);
+ if (need_recompute_csum && !skb_is_gso(skb)) {
+ __wsum csum;
+
+ csum = skb_checksum(skb, gre_offset,
+ skb->len - gre_offset, 0);
+ *pcsum = csum_fold(csum);
+ } else {
+ *pcsum = gso_make_checksum(skb, 0);
+ }
} while ((skb = skb->next));
out:
return segs;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 995ef3d23368..dc99b40da48d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -244,7 +244,7 @@ static struct {
/**
* icmp_global_allow - Are we allowed to send one more ICMP message ?
*
- * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
+ * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec.
* Returns false if we reached the limit and can not send another packet.
* Note: called with BH disabled
*/
@@ -272,7 +272,10 @@ bool icmp_global_allow(void)
}
credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
if (credit) {
- credit--;
+ /* We want to use a credit of one in average, but need to randomize
+ * it for security reasons.
+ */
+ credit = max_t(int, credit - prandom_u32_max(3), 0);
rc = true;
}
WRITE_ONCE(icmp_global.credit, credit);
@@ -465,6 +468,23 @@ out_bh_enable:
local_bh_enable();
}
+/*
+ * The device used for looking up which routing table to use for sending an ICMP
+ * error is preferably the source whenever it is set, which should ensure the
+ * icmp error can be sent to the source host, else lookup using the routing
+ * table of the destination device, else use the main routing table (index 0).
+ */
+static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb)
+{
+ struct net_device *route_lookup_dev = NULL;
+
+ if (skb->dev)
+ route_lookup_dev = skb->dev;
+ else if (skb_dst(skb))
+ route_lookup_dev = skb_dst(skb)->dev;
+ return route_lookup_dev;
+}
+
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
@@ -473,6 +493,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
int type, int code,
struct icmp_bxm *param)
{
+ struct net_device *route_lookup_dev;
struct rtable *rt, *rt2;
struct flowi4 fl4_dec;
int err;
@@ -487,7 +508,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
- fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
+ route_lookup_dev = icmp_get_route_lookup_dev(skb_in);
+ fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev);
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
rt = ip_route_output_key_hash(net, fl4, skb_in);
@@ -511,7 +533,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
if (err)
goto relookup_failed;
- if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev,
+ if (inet_addr_type_dev_table(net, route_lookup_dev,
fl4_dec.saddr) == RTN_LOCAL) {
rt2 = __ip_route_output_key(net, &fl4_dec);
if (IS_ERR(rt2))
@@ -740,6 +762,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
icmp_param.data_len = room;
icmp_param.head_len = sizeof(struct icmphdr);
+ /* if we don't have a source address at this point, fall back to the
+ * dummy address instead of sending out a packet with a source address
+ * of 0.0.0.0
+ */
+ if (!fl4.saddr)
+ fl4.saddr = htonl(INADDR_DUMMY);
+
icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
@@ -751,6 +780,40 @@ out:;
}
EXPORT_SYMBOL(__icmp_send);
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+ struct sk_buff *cloned_skb = NULL;
+ struct ip_options opts = { 0 };
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ __be32 orig_ip;
+
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(ct->status & IPS_SRC_NAT)) {
+ __icmp_send(skb_in, type, code, info, &opts);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct iphdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct iphdr))))
+ goto out;
+
+ orig_ip = ip_hdr(skb_in)->saddr;
+ ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
+ __icmp_send(skb_in, type, code, info, &opts);
+ ip_hdr(skb_in)->saddr = orig_ip;
+out:
+ consume_skb(cloned_skb);
+}
+EXPORT_SYMBOL(icmp_ndo_send);
+#endif
static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
{
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index b6f0ee01f2e0..eac0013809c9 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1790,6 +1790,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
in_dev->mc_list = i->next_rcu;
in_dev->mc_count--;
+ ip_mc_clear_src(i);
ip_ma_put(i);
}
}
@@ -2694,6 +2695,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u
rv = 1;
} else if (im) {
if (src_addr) {
+ spin_lock_bh(&im->lock);
for (psf = im->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == src_addr)
break;
@@ -2704,6 +2706,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u
im->sfcount[MCAST_EXCLUDE];
else
rv = im->sfcount[MCAST_EXCLUDE] != 0;
+ spin_unlock_bh(&im->lock);
} else
rv = 1; /* unspecified source; tentatively allow */
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7826fba34b14..08ba0f91f2ab 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -276,51 +276,12 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
ipv6_only_sock(sk), true);
}
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
- */
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
+ struct sock *sk)
{
- bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
- struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int ret = 1, port = snum;
- struct inet_bind_hashbucket *head;
- struct net *net = sock_net(sk);
- struct inet_bind_bucket *tb = NULL;
kuid_t uid = sock_i_uid(sk);
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
- if (!port) {
- head = inet_csk_find_open_port(sk, &tb, &port);
- if (!head)
- return ret;
- if (!tb)
- goto tb_not_found;
- goto success;
- }
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- spin_lock_bh(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port)
- goto tb_found;
-tb_not_found:
- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
- if (!tb)
- goto fail_unlock;
-tb_found:
- if (!hlist_empty(&tb->owners)) {
- if (sk->sk_reuse == SK_FORCE_REUSE)
- goto success;
-
- if ((tb->fastreuse > 0 && reuse) ||
- sk_reuseport_match(tb, sk))
- goto success;
- if (inet_csk_bind_conflict(sk, tb, true, true))
- goto fail_unlock;
- }
-success:
if (hlist_empty(&tb->owners)) {
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
@@ -364,6 +325,54 @@ success:
tb->fastreuseport = 0;
}
}
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+ struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+ int ret = 1, port = snum;
+ struct inet_bind_hashbucket *head;
+ struct net *net = sock_net(sk);
+ struct inet_bind_bucket *tb = NULL;
+
+ if (!port) {
+ head = inet_csk_find_open_port(sk, &tb, &port);
+ if (!head)
+ return ret;
+ if (!tb)
+ goto tb_not_found;
+ goto success;
+ }
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == port)
+ goto tb_found;
+tb_not_found:
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port);
+ if (!tb)
+ goto fail_unlock;
+tb_found:
+ if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse == SK_FORCE_REUSE)
+ goto success;
+
+ if ((tb->fastreuse > 0 && reuse) ||
+ sk_reuseport_match(tb, sk))
+ goto success;
+ if (inet_csk_bind_conflict(sk, tb, true, true))
+ goto fail_unlock;
+ }
+success:
+ inet_csk_update_fastreuse(tb, sk);
+
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7ba013d6c00a..5fac8d776391 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -393,8 +393,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
r->idiag_inode = 0;
if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
- inet_rsk(reqsk)->ir_mark))
+ inet_rsk(reqsk)->ir_mark)) {
+ nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
+ }
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 0af13f5bdc9a..8a54babf5c90 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -160,6 +160,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
return -ENOMEM;
}
}
+ inet_csk_update_fastreuse(tb, child);
}
inet_bind_hash(child, tb, port);
spin_unlock(&head->lock);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9940a59306b5..1a4d89f8361c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -443,6 +443,8 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
+ if (csum && skb_checksum_start(skb) < skb->data)
+ return -EINVAL;
return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 73cd64c7692f..c9f82525bfa4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -73,6 +73,7 @@
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
+#include <net/inet_ecn.h>
#include <net/lwtunnel.h>
#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
@@ -311,7 +312,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
- if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+ if (skb->len > mtu || IPCB(skb)->frag_max_size)
return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
return ip_finish_output2(net, sk, skb);
@@ -418,8 +419,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
- memcpy(&iph->saddr, &fl4->saddr,
- sizeof(fl4->saddr) + sizeof(fl4->daddr));
+
+ iph->saddr = fl4->saddr;
+ iph->daddr = fl4->daddr;
}
/* Note: skb->sk can be different from sk, in case of tunnels */
@@ -1562,7 +1564,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
if (IS_ERR(rt))
return;
- inet_sk(sk)->tos = arg->tos;
+ inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index f6793017a20d..e9cf0d185459 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -98,9 +98,10 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
__be32 remote, __be32 local,
__be32 key)
{
- unsigned int hash;
struct ip_tunnel *t, *cand = NULL;
struct hlist_head *head;
+ struct net_device *ndev;
+ unsigned int hash;
hash = ip_tunnel_hash(key, remote);
head = &itn->tunnels[hash];
@@ -175,8 +176,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
if (t && t->dev->flags & IFF_UP)
return t;
- if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
- return netdev_priv(itn->fb_tunnel_dev);
+ ndev = READ_ONCE(itn->fb_tunnel_dev);
+ if (ndev && ndev->flags & IFF_UP)
+ return netdev_priv(ndev);
return NULL;
}
@@ -750,7 +752,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
- if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
+ df = tnl_params->frag_off;
+ if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
+ df |= (inner_iph->frag_off & htons(IP_DF));
+
+ if (tnl_update_pmtu(dev, skb, rt, df, inner_iph)) {
ip_rt_put(rt);
goto tx_error;
}
@@ -778,10 +784,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ttl = ip4_dst_hoplimit(&rt->dst);
}
- df = tnl_params->frag_off;
- if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
- df |= (inner_iph->frag_off&htons(IP_DF));
-
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
if (max_headroom > dev->needed_headroom)
@@ -1211,9 +1213,9 @@ void ip_tunnel_uninit(struct net_device *dev)
struct ip_tunnel_net *itn;
itn = net_generic(net, tunnel->ip_tnl_net_id);
- /* fb_tunnel_dev will be unregisted in net-exit call. */
- if (itn->fb_tunnel_dev != dev)
- ip_tunnel_del(itn, netdev_priv(dev));
+ ip_tunnel_del(itn, netdev_priv(dev));
+ if (itn->fb_tunnel_dev == dev)
+ WRITE_ONCE(itn->fb_tunnel_dev, NULL);
dst_cache_reset(&tunnel->dst_cache);
}
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index c1693d75e196..33a85269a9f2 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -50,7 +50,7 @@ static unsigned int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);
static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
- int encap_type)
+ int encap_type, bool update_skb_dev)
{
struct ip_tunnel *tunnel;
const struct iphdr *iph = ip_hdr(skb);
@@ -65,6 +65,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+ if (update_skb_dev)
+ skb->dev = tunnel->dev;
+
return xfrm_input(skb, nexthdr, spi, encap_type);
}
@@ -74,25 +77,43 @@ drop:
return 0;
}
-static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi,
- int encap_type)
+static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
{
- struct ip_tunnel *tunnel;
+ return vti_input(skb, nexthdr, spi, encap_type, false);
+}
+
+static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev)
+{
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev);
+}
+
+static int vti_rcv_proto(struct sk_buff *skb)
+{
+ return vti_rcv(skb, 0, false);
+}
+
+static int vti_rcv_tunnel(struct sk_buff *skb)
+{
+ struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id);
const struct iphdr *iph = ip_hdr(skb);
- struct net *net = dev_net(skb->dev);
- struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ struct ip_tunnel *tunnel;
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
iph->saddr, iph->daddr, 0);
if (tunnel) {
+ struct tnl_ptk_info tpi = {
+ .proto = htons(ETH_P_IP),
+ };
+
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
-
- XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
-
- skb->dev = tunnel->dev;
-
- return xfrm_input(skb, nexthdr, spi, encap_type);
+ if (iptunnel_pull_header(skb, 0, tpi.proto, false))
+ goto drop;
+ return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, false);
}
return -EINVAL;
@@ -101,22 +122,6 @@ drop:
return 0;
}
-static int vti_rcv(struct sk_buff *skb)
-{
- XFRM_SPI_SKB_CB(skb)->family = AF_INET;
- XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
-
- return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
-}
-
-static int vti_rcv_ipip(struct sk_buff *skb)
-{
- XFRM_SPI_SKB_CB(skb)->family = AF_INET;
- XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
-
- return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0);
-}
-
static int vti_rcv_cb(struct sk_buff *skb, int err)
{
unsigned short family;
@@ -482,31 +487,31 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)
}
static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
- .handler = vti_rcv,
- .input_handler = vti_input,
+ .handler = vti_rcv_proto,
+ .input_handler = vti_input_proto,
.cb_handler = vti_rcv_cb,
.err_handler = vti4_err,
.priority = 100,
};
static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
- .handler = vti_rcv,
- .input_handler = vti_input,
+ .handler = vti_rcv_proto,
+ .input_handler = vti_input_proto,
.cb_handler = vti_rcv_cb,
.err_handler = vti4_err,
.priority = 100,
};
static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
- .handler = vti_rcv,
- .input_handler = vti_input,
+ .handler = vti_rcv_proto,
+ .input_handler = vti_input_proto,
.cb_handler = vti_rcv_cb,
.err_handler = vti4_err,
.priority = 100,
};
static struct xfrm_tunnel ipip_handler __read_mostly = {
- .handler = vti_rcv_ipip,
+ .handler = vti_rcv_tunnel,
.err_handler = vti4_err,
.priority = 0,
};
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f0782c91514c..41e384834d50 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -881,7 +881,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
/*
- * Copy BOOTP-supplied string if not already set.
+ * Copy BOOTP-supplied string
*/
static int __init ic_bootp_string(char *dest, char *src, int len, int max)
{
@@ -930,12 +930,15 @@ static void __init ic_do_bootp_ext(u8 *ext)
}
break;
case 12: /* Host name */
- ic_bootp_string(utsname()->nodename, ext+1, *ext,
- __NEW_UTS_LEN);
- ic_host_name_set = 1;
+ if (!ic_host_name_set) {
+ ic_bootp_string(utsname()->nodename, ext+1, *ext,
+ __NEW_UTS_LEN);
+ ic_host_name_set = 1;
+ }
break;
case 15: /* Domain name (DNS) */
- ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
+ if (!ic_domain[0])
+ ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
break;
case 17: /* Root path */
if (!root_server_path[0])
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index cdd627355ed1..df610245d21a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -702,7 +702,7 @@ out:
rtnl_link_failed:
#if IS_ENABLED(CONFIG_MPLS)
- xfrm4_tunnel_deregister(&mplsip_handler, AF_INET);
+ xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
xfrm_tunnel_mplsip_failed:
#endif
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 6dd727e0a72f..13a46066546d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1196,6 +1196,8 @@ static int translate_compat_table(struct net *net,
if (!newinfo)
goto out_unlock;
+ memset(newinfo->entries, 0, size);
+
newinfo->number = compatr->num_entries;
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
newinfo->hook_entry[i] = compatr->hook_entry[i];
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 0d1a2cda1bfb..a4039ac65b54 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1432,6 +1432,8 @@ translate_compat_table(struct net *net,
if (!newinfo)
goto out_unlock;
+ memset(newinfo->entries, 0, size);
+
newinfo->number = compatr->num_entries;
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
newinfo->hook_entry[i] = compatr->hook_entry[i];
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 341d1bd637af..6b8908e1e0ba 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -94,7 +94,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.saddr = rpfilter_get_saddr(iph->daddr);
flow.flowi4_oif = 0;
flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
- flow.flowi4_tos = RT_TOS(iph->tos);
+ flow.flowi4_tos = iph->tos & IPTOS_RT_MASK;
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par));
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 8a69363b4884..c66103de86bd 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -165,8 +165,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
break;
default:
pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
- pptp_msg_name[0]);
+ pptp_msg_name(msg));
/* fall through */
case PPTP_SET_LINK_INFO:
/* only need to NAT in case PAC is behind NAT box */
@@ -267,9 +266,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
break;
default:
- pr_debug("unknown inbound packet %s\n",
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
- pptp_msg_name[0]);
+ pr_debug("unknown inbound packet %s\n", pptp_msg_name(msg));
/* fall through */
case PPTP_START_SESSION_REQUEST:
case PPTP_START_SESSION_REPLY:
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 16226d49263d..aab141c4a389 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -801,6 +801,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
sk->sk_uid);
+ fl4.fl4_icmp_type = user_icmph.type;
+ fl4.fl4_icmp_code = user_icmph.code;
+
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
@@ -975,6 +978,7 @@ bool ping_rcv(struct sk_buff *skb)
struct sock *sk;
struct net *net = dev_net(skb->dev);
struct icmphdr *icmph = icmp_hdr(skb);
+ bool rc = false;
/* We assume the packet has already been checked by icmp_rcv */
@@ -989,14 +993,15 @@ bool ping_rcv(struct sk_buff *skb)
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
pr_debug("rcv on socket %p\n", sk);
- if (skb2)
- ping_queue_rcv_skb(sk, skb2);
+ if (skb2 && !ping_queue_rcv_skb(sk, skb2))
+ rc = true;
sock_put(sk);
- return true;
}
- pr_debug("no socket, dropping\n");
- return false;
+ if (!rc)
+ pr_debug("no socket, dropping\n");
+
+ return rc;
}
EXPORT_SYMBOL_GPL(ping_rcv);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 967acf2844ef..34cf572cc5dc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -70,6 +70,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -276,6 +277,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu+1;
return &per_cpu(rt_cache_stat, cpu);
}
+ (*pos)++;
return NULL;
}
@@ -484,8 +486,10 @@ static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
-#define IP_IDENTS_SZ 2048u
-
+/* Hash tables of size 2048..262144 depending on RAM size.
+ * Each bucket uses 8 bytes.
+ */
+static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
@@ -495,22 +499,24 @@ static u32 *ip_tstamps __read_mostly;
*/
u32 ip_idents_reserve(u32 hash, int segs)
{
- u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
- atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
- u32 old = ACCESS_ONCE(*p_tstamp);
- u32 now = (u32)jiffies;
- u32 new, delta = 0;
+ u32 bucket, old, now = (u32)jiffies;
+ atomic_t *p_id;
+ u32 *p_tstamp;
+ u32 delta = 0;
+
+ bucket = hash & ip_idents_mask;
+ p_tstamp = ip_tstamps + bucket;
+ p_id = ip_idents + bucket;
+ old = ACCESS_ONCE(*p_tstamp);
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
- /* Do not use atomic_add_return() as it makes UBSAN unhappy */
- do {
- old = (u32)atomic_read(p_id);
- new = old + delta + segs;
- } while (atomic_cmpxchg(p_id, old, new) != old);
-
- return new - segs;
+ /* If UBSAN reports an error there, please make sure your compiler
+ * supports -fno-strict-overflow before reporting it that was a bug
+ * in UBSAN, and it has been fixed in GCC-8.
+ */
+ return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
@@ -613,28 +619,35 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
}
}
-static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
+static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
- struct fib_nh_exception *fnhe, *oldest;
+ struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
+ struct fib_nh_exception *fnhe, *oldest = NULL;
- oldest = rcu_dereference(hash->chain);
- for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
- fnhe = rcu_dereference(fnhe->fnhe_next)) {
- if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
+ for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
+ fnhe = rcu_dereference_protected(*fnhe_p,
+ lockdep_is_held(&fnhe_lock));
+ if (!fnhe)
+ break;
+ if (!oldest ||
+ time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
oldest = fnhe;
+ oldest_p = fnhe_p;
+ }
}
fnhe_flush_routes(oldest);
- return oldest;
+ *oldest_p = oldest->fnhe_next;
+ kfree_rcu(oldest, rcu);
}
-static inline u32 fnhe_hashfun(__be32 daddr)
+static u32 fnhe_hashfun(__be32 daddr)
{
- static u32 fnhe_hashrnd __read_mostly;
- u32 hval;
+ static siphash_key_t fnhe_hash_key __read_mostly;
+ u64 hval;
- net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
- hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
- return hash_32(hval, FNHE_HASH_SHIFT);
+ net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
+ hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
+ return hash_64(hval, FNHE_HASH_SHIFT);
}
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
@@ -701,16 +714,21 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
if (rt)
fill_route_from_fnhe(rt, fnhe);
} else {
- if (depth > FNHE_RECLAIM_DEPTH)
- fnhe = fnhe_oldest(hash);
- else {
- fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
- if (!fnhe)
- goto out_unlock;
-
- fnhe->fnhe_next = hash->chain;
- rcu_assign_pointer(hash->chain, fnhe);
+ /* Randomize max depth to avoid some side channels attacks. */
+ int max_depth = FNHE_RECLAIM_DEPTH +
+ prandom_u32_max(FNHE_RECLAIM_DEPTH);
+
+ while (depth > max_depth) {
+ fnhe_remove_oldest(hash);
+ depth--;
}
+
+ fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+ if (!fnhe)
+ goto out_unlock;
+
+ fnhe->fnhe_next = hash->chain;
+
fnhe->fnhe_genid = genid;
fnhe->fnhe_daddr = daddr;
fnhe->fnhe_gw = gw;
@@ -718,6 +736,8 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_mtu_locked = lock;
fnhe->fnhe_expires = max(1UL, expires);
+ rcu_assign_pointer(hash->chain, fnhe);
+
/* Exception created; mark the cached routes for the nexthop
* stale, so anyone caching it rechecks if this exception
* applies to them.
@@ -796,6 +816,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
if (fib_lookup(net, fl4, &res, 0) == 0) {
struct fib_nh *nh = &FIB_RES_NH(res);
+ fib_select_path(net, &res, fl4, skb);
+ nh = &FIB_RES_NH(res);
update_or_create_fnhe(nh, fl4->daddr, new_gw,
0, false,
jiffies + ip_rt_gc_timeout);
@@ -1012,6 +1034,7 @@ out: kfree_skb(skb);
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
struct dst_entry *dst = &rt->dst;
+ struct net *net = dev_net(dst->dev);
u32 old_mtu = ipv4_mtu(dst);
struct fib_result res;
bool lock = false;
@@ -1032,9 +1055,11 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
return;
rcu_read_lock();
- if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
- struct fib_nh *nh = &FIB_RES_NH(res);
+ if (fib_lookup(net, fl4, &res, 0) == 0) {
+ struct fib_nh *nh;
+ fib_select_path(net, &res, fl4, NULL);
+ nh = &FIB_RES_NH(res);
update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
jiffies + ip_rt_mtu_expires);
}
@@ -1433,6 +1458,24 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
return ret;
}
+struct uncached_list {
+ spinlock_t lock;
+ struct list_head head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
+
+static void rt_add_uncached_list(struct rtable *rt)
+{
+ struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
+
+ rt->rt_uncached_list = ul;
+
+ spin_lock_bh(&ul->lock);
+ list_add_tail(&rt->rt_uncached, &ul->head);
+ spin_unlock_bh(&ul->lock);
+}
+
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
struct rtable *orig, *prev, **p;
@@ -1452,7 +1495,7 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
prev = cmpxchg(p, orig, rt);
if (prev == orig) {
if (orig) {
- dst_dev_put(&orig->dst);
+ rt_add_uncached_list(orig);
dst_release(&orig->dst);
}
} else {
@@ -1463,24 +1506,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
return ret;
}
-struct uncached_list {
- spinlock_t lock;
- struct list_head head;
-};
-
-static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
-
-static void rt_add_uncached_list(struct rtable *rt)
-{
- struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
-
- rt->rt_uncached_list = ul;
-
- spin_lock_bh(&ul->lock);
- list_add_tail(&rt->rt_uncached, &ul->head);
- spin_unlock_bh(&ul->lock);
-}
-
static void ipv4_dst_destroy(struct dst_entry *dst)
{
struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
@@ -2507,8 +2532,6 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
fib_select_path(net, res, fl4, skb);
dev_out = FIB_RES_DEV(*res);
- fl4->flowi4_oif = dev_out->ifindex;
-
make_route:
rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
@@ -2601,10 +2624,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
if (IS_ERR(rt))
return rt;
- if (flp4->flowi4_proto)
+ if (flp4->flowi4_proto) {
+ flp4->flowi4_oif = rt->dst.dev->ifindex;
rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
flowi4_to_flowi(flp4),
sk, 0);
+ }
return rt;
}
@@ -2784,7 +2809,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = rtm->rtm_tos;
+ fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
@@ -2803,8 +2828,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
skb->mark = mark;
- err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
- dev, &res);
+ err = ip_route_input_rcu(skb, dst, src,
+ rtm->rtm_tos & IPTOS_RT_MASK, dev,
+ &res);
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
@@ -3093,18 +3119,26 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
int __init ip_rt_init(void)
{
+ void *idents_hash;
int rc = 0;
int cpu;
- ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
- if (!ip_idents)
- panic("IP: failed to allocate ip_idents\n");
+ /* For modern hosts, this will use 2 MB of memory */
+ idents_hash = alloc_large_system_hash("IP idents",
+ sizeof(*ip_idents) + sizeof(*ip_tstamps),
+ 0,
+ 16, /* one bucket per 64 KB */
+ HASH_ZERO,
+ NULL,
+ &ip_idents_mask,
+ 2048,
+ 256*1024);
+
+ ip_idents = idents_hash;
- prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+ prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
- ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
- if (!ip_tstamps)
- panic("IP: failed to allocate ip_tstamps\n");
+ ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 2f871424925e..bf8df824bae3 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -296,7 +296,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
__u32 cookie = ntohl(th->ack_seq) - 1;
struct sock *ret = sk;
struct request_sock *req;
- int mss;
+ int full_space, mss;
struct rtable *rt;
__u8 rcv_wscale;
struct flowi4 fl4;
@@ -389,8 +389,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
/* Try to redo what tcp_v4_send_synack did. */
req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ full_space = tcp_full_space(sk);
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
- tcp_select_initial_window(tcp_full_space(sk), req->mss,
+ tcp_select_initial_window(full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6fbfdd5e96de..c9f6f28e54f3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2366,6 +2366,9 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->snd_cwnd_cnt = 0;
tp->window_clamp = 0;
tp->delivered = 0;
+ if (icsk->icsk_ca_ops->release)
+ icsk->icsk_ca_ops->release(sk);
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
@@ -2759,10 +2762,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
#ifdef CONFIG_TCP_MD5SIG
case TCP_MD5SIG:
case TCP_MD5SIG_EXT:
- if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
- err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
- else
- err = -EINVAL;
+ err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
break;
#endif
case TCP_USER_TIMEOUT:
@@ -3394,10 +3394,13 @@ EXPORT_SYMBOL(tcp_md5_hash_skb_data);
int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
{
+ u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
struct scatterlist sg;
- sg_init_one(&sg, key->key, key->keylen);
- ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
+ sg_init_one(&sg, key->key, keylen);
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
+
+ /* tcp_md5_do_add() might change key->key under us */
return crypto_ahash_update(hp->md5_req);
}
EXPORT_SYMBOL(tcp_md5_hash_key);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 434ad1e72447..76a9652d90f2 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -769,7 +769,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
filter_expired = after(tcp_jiffies32,
bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
if (rs->rtt_us >= 0 &&
- (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
+ (rs->rtt_us < bbr->min_rtt_us || filter_expired)) {
bbr->min_rtt_us = rs->rtt_us;
bbr->min_rtt_stamp = tcp_jiffies32;
}
@@ -840,7 +840,7 @@ static void bbr_init(struct sock *sk)
bbr->prior_cwnd = 0;
bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */
bbr->rtt_cnt = 0;
- bbr->next_rtt_delivered = 0;
+ bbr->next_rtt_delivered = tp->delivered;
bbr->prev_ca_state = TCP_CA_Open;
bbr->packet_conservation = 0;
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 494e3c3a21a1..b193dcebbf7e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -199,7 +199,12 @@ static void tcp_reinit_congestion_control(struct sock *sk,
icsk->icsk_ca_setsockopt = 1;
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
- if (sk->sk_state != TCP_CLOSE)
+ if (ca->flags & TCP_CONG_NEEDS_ECN)
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
+
+ if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
tcp_init_congestion_control(sk);
}
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 78bfadfcf342..93530bd33247 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -340,8 +340,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
if (tcp_in_slow_start(tp)) {
- if (hystart && after(ack, ca->end_seq))
- bictcp_hystart_reset(sk);
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
@@ -383,6 +381,9 @@ static void hystart_update(struct sock *sk, u32 delay)
if (ca->found & hystart_detect)
return;
+ if (after(tp->snd_una, ca->end_seq))
+ bictcp_hystart_reset(sk);
+
if (hystart_detect & HYSTART_ACK_TRAIN) {
u32 now = bictcp_clock();
@@ -403,6 +404,8 @@ static void hystart_update(struct sock *sk, u32 delay)
if (hystart_detect & HYSTART_DELAY) {
/* obtain the minimum delay of more than sampling packets */
+ if (ca->curr_rtt > delay)
+ ca->curr_rtt = delay;
if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
ca->curr_rtt = delay;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 29f3df4ddd1f..9382caeb721a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1225,7 +1225,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
if (dup_sack && (sacked & TCPCB_RETRANS)) {
if (tp->undo_marker && tp->undo_retrans > 0 &&
after(end_seq, tp->undo_marker))
- tp->undo_retrans--;
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
if (sacked & TCPCB_SACKED_ACKED)
state->reord = min(fack_count, state->reord);
}
@@ -2803,7 +2803,8 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
u32 prior_retrans = tp->retrans_out;
- tcp_rack_mark_lost(sk);
+ if (tcp_rack_mark_lost(sk))
+ *ack_flag &= ~FLAG_SET_XMIT_TIMER;
if (prior_retrans > tp->retrans_out)
*ack_flag |= FLAG_LOST_RETRANS;
}
@@ -3516,10 +3517,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
}
}
-/* This routine deals with acks during a TLP episode.
- * We mark the end of a TLP episode on receiving TLP dupack or when
- * ack is after tlp_high_seq.
- * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+/* This routine deals with acks during a TLP episode and ends an episode by
+ * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
*/
static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
@@ -3528,7 +3527,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
if (before(ack, tp->tlp_high_seq))
return;
- if (flag & FLAG_DSACKING_ACK) {
+ if (!tp->tlp_retrans) {
+ /* TLP of new data has been acknowledged */
+ tp->tlp_high_seq = 0;
+ } else if (flag & FLAG_DSACKING_ACK) {
/* This DSACK means original and TLP probe arrived; no loss */
tp->tlp_high_seq = 0;
} else if (after(ack, tp->tlp_high_seq)) {
@@ -3687,15 +3689,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
- /* If needed, reset TLP/RTO timer; RACK may later override this. */
- if (flag & FLAG_SET_XMIT_TIMER)
- tcp_set_xmit_timer(sk);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
}
+ /* If needed, reset TLP/RTO timer when RACK doesn't set. */
+ if (flag & FLAG_SET_XMIT_TIMER)
+ tcp_set_xmit_timer(sk);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
@@ -4507,7 +4510,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
skb, &fragstolen)) {
coalesce_done:
- tcp_grow_window(sk, skb);
+ /* For non sack flows, do not grow window to force DUPACK
+ * and trigger fast retransmit.
+ */
+ if (tcp_is_sack(tp))
+ tcp_grow_window(sk, skb);
kfree_skb_partial(skb, fragstolen);
skb = NULL;
goto add_sack;
@@ -4591,7 +4598,11 @@ add_sack:
tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
if (skb) {
- tcp_grow_window(sk, skb);
+ /* For non sack flows, do not grow window to force DUPACK
+ * and trigger fast retransmit.
+ */
+ if (tcp_is_sack(tp))
+ tcp_grow_window(sk, skb);
skb_condense(skb);
skb_set_owner_r(skb, sk);
}
@@ -5530,6 +5541,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
+ } else {
+ tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
}
__tcp_ack_snd_check(sk, 0);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b4f0fc34b0ed..744479c8b91f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -285,7 +285,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
- mtu = tcp_sk(sk)->mtu_info;
+ mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
dst = inet_csk_update_pmtu(sk, mtu);
if (!dst)
return;
@@ -453,7 +453,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (sk->sk_state == TCP_LISTEN)
goto out;
- tp->mtu_info = info;
+ WRITE_ONCE(tp->mtu_info, info);
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
@@ -995,9 +995,18 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
if (key) {
- /* Pre-existing entry - just update that one. */
+ /* Pre-existing entry - just update that one.
+ * Note that the key might be used concurrently.
+ */
memcpy(key->key, newkey, newkeylen);
- key->keylen = newkeylen;
+
+ /* Pairs with READ_ONCE() in tcp_md5_hash_key().
+ * Also note that a reader could catch new key->keylen value
+ * but old key->key[], this is the reason we use __GFP_ZERO
+ * at sock_kmalloc() time below these lines.
+ */
+ WRITE_ONCE(key->keylen, newkeylen);
+
return 0;
}
@@ -1013,7 +1022,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
rcu_assign_pointer(tp->md5sig_info, md5sig);
}
- key = sock_kmalloc(sk, sizeof(*key), gfp);
+ key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
if (!key)
return -ENOMEM;
if (!tcp_alloc_md5sig_pool()) {
@@ -2078,6 +2087,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
static void *tcp_seek_last_pos(struct seq_file *seq)
{
struct tcp_iter_state *st = seq->private;
+ int bucket = st->bucket;
int offset = st->offset;
int orig_num = st->num;
void *rc = NULL;
@@ -2088,7 +2098,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
break;
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_next(seq, NULL);
- while (offset-- && rc)
+ while (offset-- && rc && bucket == st->bucket)
rc = listening_get_next(seq, rc);
if (rc)
break;
@@ -2099,7 +2109,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
rc = established_get_first(seq);
- while (offset-- && rc)
+ while (offset-- && rc && bucket == st->bucket)
rc = established_get_next(seq, rc);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e1eb56e21dd5..99ab936a9cd3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -616,7 +616,8 @@ static unsigned int tcp_synack_options(struct request_sock *req,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
const struct tcp_md5sig_key *md5,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -631,7 +632,8 @@ static unsigned int tcp_synack_options(struct request_sock *req,
* rather than TS in order to fit in better with old,
* buggy kernels, but that was deemed to be unnecessary.
*/
- ireq->tstamp_ok &= !ireq->sack_ok;
+ if (synack_type != TCP_SYNACK_COOKIE)
+ ireq->tstamp_ok &= !ireq->sack_ok;
}
#endif
@@ -1468,6 +1470,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
return __tcp_mtu_to_mss(sk, pmtu) -
(tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
}
+EXPORT_SYMBOL(tcp_mtu_to_mss);
/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
@@ -1618,7 +1621,8 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
* window, and remember whether we were cwnd-limited then.
*/
if (!before(tp->snd_una, tp->max_packets_seq) ||
- tp->packets_out > tp->max_packets_out) {
+ tp->packets_out > tp->max_packets_out ||
+ is_cwnd_limited) {
tp->max_packets_out = tp->packets_out;
tp->max_packets_seq = tp->snd_nxt;
tp->is_cwnd_limited = is_cwnd_limited;
@@ -2409,6 +2413,10 @@ repair:
else
tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+ is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
+ if (likely(sent_pkts || is_cwnd_limited))
+ tcp_cwnd_validate(sk, is_cwnd_limited);
+
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
@@ -2416,8 +2424,6 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk, false);
- is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
- tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
return !tp->packets_out && tcp_send_head(sk);
@@ -2498,6 +2504,11 @@ void tcp_send_loss_probe(struct sock *sk)
int pcount;
int mss = tcp_current_mss(sk);
+ /* At most one outstanding TLP */
+ if (tp->tlp_high_seq)
+ goto rearm_timer;
+
+ tp->tlp_retrans = 0;
skb = tcp_send_head(sk);
if (skb) {
if (tcp_snd_wnd_test(tp, skb, mss)) {
@@ -2520,10 +2531,6 @@ void tcp_send_loss_probe(struct sock *sk)
return;
}
- /* At most one outstanding TLP retransmission. */
- if (tp->tlp_high_seq)
- goto rearm_timer;
-
if (skb_still_in_host_queue(sk, skb))
goto rearm_timer;
@@ -2544,10 +2551,12 @@ void tcp_send_loss_probe(struct sock *sk)
if (__tcp_retransmit_skb(sk, skb, 1))
goto rearm_timer;
+ tp->tlp_retrans = 1;
+
+probe_sent:
/* Record snd_nxt for loss detection. */
tp->tlp_high_seq = tp->snd_nxt;
-probe_sent:
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
/* Reset s.t. tcp_rearm_rto will restart timer from now */
inet_csk(sk)->icsk_pending = 0;
@@ -3252,8 +3261,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
- tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
- sizeof(*th);
+ tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5,
+ foc, synack_type) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index be8ef1e5dfef..0409cd2c4918 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -102,13 +102,13 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
}
}
-void tcp_rack_mark_lost(struct sock *sk)
+bool tcp_rack_mark_lost(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout;
if (!tp->rack.advanced)
- return;
+ return false;
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
@@ -118,6 +118,7 @@ void tcp_rack_mark_lost(struct sock *sk)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}
+ return !!timeout;
}
/* Record the most recently (re)sent time among the (s)acked packets
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e33258d69246..fee1cdcc224e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -882,7 +882,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
__be16 dport;
u8 tos;
int err, is_udplite = IS_UDPLITE(sk);
- int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
struct sk_buff *skb;
struct ip_options_data opt_copy;
@@ -1165,7 +1165,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
}
up->len += size;
- if (!(up->corkflag || (flags&MSG_MORE)))
+ if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE)))
ret = udp_push_pending_frames(sk);
if (!ret)
ret = size;
@@ -1894,7 +1894,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets
*/
- if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+ if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
/*
* MIB statistics other than incrementing the error count are
@@ -2321,7 +2321,8 @@ int udp_v4_early_demux(struct sk_buff *skb)
*/
if (!inet_sk(sk)->inet_daddr && in_dev)
return ip_mc_validate_source(skb, iph->daddr,
- iph->saddr, iph->tos,
+ iph->saddr,
+ iph->tos & IPTOS_RT_MASK,
skb->dev, in_dev, &itag);
}
return 0;
@@ -2336,6 +2337,9 @@ void udp_destroy_sock(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
bool slow = lock_sock_fast(sk);
+
+ /* protects from races with udp_abort() */
+ sock_set_flag(sk, SOCK_DEAD);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
if (static_key_false(&udp_encap_needed) && up->encap_type) {
@@ -2369,9 +2373,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
switch (optname) {
case UDP_CORK:
if (val != 0) {
- up->corkflag = 1;
+ WRITE_ONCE(up->corkflag, 1);
} else {
- up->corkflag = 0;
+ WRITE_ONCE(up->corkflag, 0);
lock_sock(sk);
push_pending_frames(sk);
release_sock(sk);
@@ -2478,7 +2482,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
switch (optname) {
case UDP_CORK:
- val = up->corkflag;
+ val = READ_ONCE(up->corkflag);
break;
case UDP_ENCAP:
@@ -2569,10 +2573,17 @@ int udp_abort(struct sock *sk, int err)
{
lock_sock(sk);
+ /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
+ * with close()
+ */
+ if (sock_flag(sk, SOCK_DEAD))
+ goto out;
+
sk->sk_err = err;
sk->sk_error_report(sk);
__udp_disconnect(sk, 0);
+out:
release_sock(sk);
return 0;
@@ -2766,7 +2777,7 @@ int udp4_seq_show(struct seq_file *seq, void *v)
{
seq_setwidth(seq, 127);
if (v == SEQ_START_TOKEN)
- seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
+ seq_puts(seq, "   sl  local_address rem_address   st tx_queue "
 "rx_queue tr tm->when retrnsmt   uid  timeout "
 "inode ref pointer drops");
else {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index cde2719fcb89..353c83b1b0db 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -254,7 +254,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
struct sock *sk;
if (NAPI_GRO_CB(skb)->encap_mark ||
- (skb->ip_summed != CHECKSUM_PARTIAL &&
+ (uh->check && skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid))
goto out;