Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c                     4
-rw-r--r--  net/ipv4/esp4.c                        4
-rw-r--r--  net/ipv4/esp4_offload.c                3
-rw-r--r--  net/ipv4/fib_frontend.c                3
-rw-r--r--  net/ipv4/icmp.c                        5
-rw-r--r--  net/ipv4/igmp.c                        9
-rw-r--r--  net/ipv4/inet_connection_sock.c       18
-rw-r--r--  net/ipv4/inet_hashtables.c            12
-rw-r--r--  net/ipv4/ip_gre.c                     21
-rw-r--r--  net/ipv4/ip_output.c                  18
-rw-r--r--  net/ipv4/ip_sockglue.c                12
-rw-r--r--  net/ipv4/ip_tunnel.c                  12
-rw-r--r--  net/ipv4/ip_vti.c                      4
-rw-r--r--  net/ipv4/metrics.c                     2
-rw-r--r--  net/ipv4/netfilter/nf_socket_ipv4.c    6
-rw-r--r--  net/ipv4/netfilter/nft_dup_ipv4.c     18
-rw-r--r--  net/ipv4/raw.c                         5
-rw-r--r--  net/ipv4/route.c                       6
-rw-r--r--  net/ipv4/tcp.c                        63
-rw-r--r--  net/ipv4/tcp_fastopen.c                6
-rw-r--r--  net/ipv4/tcp_input.c                  38
-rw-r--r--  net/ipv4/tcp_metrics.c                85
-rw-r--r--  net/ipv4/tcp_minisocks.c               7
-rw-r--r--  net/ipv4/tcp_output.c                 42
-rw-r--r--  net/ipv4/tcp_recovery.c                2
-rw-r--r--  net/ipv4/tcp_timer.c                   4
-rw-r--r--  net/ipv4/udp.c                         9
-rw-r--r--  net/ipv4/udp_tunnel.c                  1
-rw-r--r--  net/ipv4/udplite.c                    10
29 files changed, 288 insertions(+), 141 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4b26ae525d6d..7c902a1efbbf 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -578,6 +578,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending += writebias;
+ sk->sk_wait_pending++;
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
@@ -593,6 +594,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
}
remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending -= writebias;
+ sk->sk_wait_pending--;
return timeo;
}
@@ -865,7 +867,7 @@ int inet_shutdown(struct socket *sock, int how)
EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
/* fall through */
default:
- sk->sk_shutdown |= how;
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how);
if (sk->sk_prot->shutdown)
sk->sk_prot->shutdown(sk, how);
break;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 203569500b91..24cd5c9c7839 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -565,7 +565,9 @@ static inline int esp_remove_trailer(struct sk_buff *skb)
skb->csum = csum_block_sub(skb->csum, csumdiff,
skb->len - trimlen);
}
- pskb_trim(skb, skb->len - trimlen);
+ ret = pskb_trim(skb, skb->len - trimlen);
+ if (unlikely(ret))
+ return ret;
ret = nexthdr[1];
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 58834a10c0be..93045373e44b 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -237,6 +237,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
secpath_reset(skb);
+ if (skb_needs_linearize(skb, skb->dev->features) &&
+ __skb_linearize(skb))
+ return -ENOMEM;
return 0;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1885a2fbad86..9aa48b4c4096 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -557,6 +557,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
cfg->fc_scope = RT_SCOPE_UNIVERSE;
}
+ if (!cfg->fc_table)
+ cfg->fc_table = RT_TABLE_MAIN;
+
if (cmd == SIOCDELRT)
return 0;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index aa179e6461e1..af0ddaa55e43 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -759,6 +759,11 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
room = 576;
room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
+ /* Guard against tiny mtu. We need to include at least one
+ * IP network header for this message to make any sense.
+ */
+ if (room <= (int)sizeof(struct iphdr))
+ goto ende;
icmp_param.data_len = skb_in->len - icmp_param.offset;
if (icmp_param.data_len > room)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a08acb54b6b0..5edf426fa414 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -221,8 +221,10 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
int tv = prandom_u32() % max_delay;
im->tm_running = 1;
- if (!mod_timer(&im->timer, jiffies+tv+2))
- refcount_inc(&im->refcnt);
+ if (refcount_inc_not_zero(&im->refcnt)) {
+ if (mod_timer(&im->timer, jiffies + tv + 2))
+ ip_ma_put(im);
+ }
}
static void igmp_gq_start_timer(struct in_device *in_dev)
@@ -357,8 +359,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
struct flowi4 fl4;
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
- unsigned int size = mtu;
+ unsigned int size;
+ size = min(mtu, IP_MAX_MTU);
while (1) {
skb = alloc_skb(size + hlen + tlen,
GFP_ATOMIC | __GFP_NOWARN);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0a69f92da71b..7392a744c677 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -826,6 +826,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
if (newsk) {
struct inet_connection_sock *newicsk = inet_csk(newsk);
+ newsk->sk_wait_pending = 0;
inet_sk_set_state(newsk, TCP_SYN_RECV);
newicsk->icsk_bind_hash = NULL;
@@ -903,11 +904,25 @@ void inet_csk_prepare_forced_close(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
+static int inet_ulp_can_listen(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ulp_ops)
+ return -EINVAL;
+
+ return 0;
+}
+
int inet_csk_listen_start(struct sock *sk, int backlog)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
- int err = -EADDRINUSE;
+ int err;
+
+ err = inet_ulp_can_listen(sk);
+ if (unlikely(err))
+ return err;
reqsk_queue_alloc(&icsk->icsk_accept_queue);
@@ -920,6 +935,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
+ err = -EADDRINUSE;
inet_sk_state_store(sk, TCP_LISTEN);
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
inet->inet_sport = htons(inet->inet_num);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3c58019f0718..c6d670cd872f 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -743,17 +743,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
u32 index;
if (port) {
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- inet_ehash_nolisten(sk, NULL, NULL);
- spin_unlock_bh(&head->lock);
- return 0;
- }
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
+ local_bh_disable();
ret = check_established(death_row, sk, port, NULL);
local_bh_enable();
return ret;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index c72432ce9bf5..38c8db78cda1 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -440,7 +440,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
/* Push GRE header. */
gre_build_header(skb, tunnel->tun_hlen,
flags, proto, tunnel->parms.o_key,
- (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
+ (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
@@ -546,7 +546,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
gre_build_header(skb, tunnel_hlen, flags, proto,
tunnel_id_to_key32(tun_info->key.tun_id),
- (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
+ (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
@@ -603,7 +603,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
truncate = true;
}
- nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ nhoff = skb_network_offset(skb);
if (skb->protocol == htons(ETH_P_IP) &&
(ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
truncate = true;
@@ -612,7 +612,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
int thoff;
if (skb_transport_header_was_set(skb))
- thoff = skb_transport_header(skb) - skb_mac_header(skb);
+ thoff = skb_transport_offset(skb);
else
thoff = nhoff + sizeof(struct ipv6hdr);
if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
@@ -635,7 +635,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
}
gre_build_header(skb, 8, TUNNEL_SEQ,
- proto, 0, htonl(tunnel->o_seqno++));
+ proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno)));
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
@@ -683,15 +683,18 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
}
if (dev->header_ops) {
+ int pull_len = tunnel->hlen + sizeof(struct iphdr);
+
if (skb_cow_head(skb, 0))
goto free_skb;
tnl_params = (const struct iphdr *)skb->data;
- /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
- * to gre header.
- */
- skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+ if (!pskb_network_may_pull(skb, pull_len))
+ goto free_skb;
+
+ /* ip_tunnel_xmit() needs skb->data pointing to gre header. */
+ skb_pull(skb, pull_len);
skb_reset_mac_header(skb);
if (skb->ip_summed == CHECKSUM_PARTIAL &&
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 06a981676356..6936f703758b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -221,7 +221,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
int res = lwtunnel_xmit(skb);
- if (res < 0 || res == LWTUNNEL_XMIT_DONE)
+ if (res != LWTUNNEL_XMIT_CONTINUE)
return res;
}
@@ -1443,9 +1443,19 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
cork->dst = NULL;
skb_dst_set(skb, &rt->dst);
- if (iph->protocol == IPPROTO_ICMP)
- icmp_out_count(net, ((struct icmphdr *)
- skb_transport_header(skb))->type);
+ if (iph->protocol == IPPROTO_ICMP) {
+ u8 icmp_type;
+
+ /* For such sockets, transhdrlen is zero when ip_append_data() is called,
+ * so the icmphdr is not in the skb linear region and icmp_type cannot be
+ * read from icmp_hdr(skb)->type.
+ */
+ if (sk->sk_type == SOCK_RAW && !inet_sk(sk)->hdrincl)
+ icmp_type = fl4->fl4_icmp_type;
+ else
+ icmp_type = icmp_hdr(skb)->type;
+ icmp_out_count(net, icmp_type);
+ }
ip_cork_release(cork);
out:
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 82f341e84fae..fbf39077fc54 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -316,7 +316,14 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
ipc->tos = val;
ipc->priority = rt_tos2priority(ipc->tos);
break;
-
+ case IP_PROTOCOL:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 1 || val > 255)
+ return -EINVAL;
+ ipc->protocol = val;
+ break;
default:
return -EINVAL;
}
@@ -1522,6 +1529,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MINTTL:
val = inet->min_ttl;
break;
+ case IP_PROTOCOL:
+ val = inet_sk(sk)->inet_num;
+ break;
default:
release_sock(sk);
return -ENOPROTOOPT;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 30e93b4f831f..9c2381cf675d 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -609,10 +609,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
else if (skb->protocol == htons(ETH_P_IP))
df = inner_iph->frag_off & htons(IP_DF);
headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
- if (headroom > dev->needed_headroom)
- dev->needed_headroom = headroom;
+ if (headroom > READ_ONCE(dev->needed_headroom))
+ WRITE_ONCE(dev->needed_headroom, headroom);
- if (skb_cow_head(skb, dev->needed_headroom)) {
+ if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
ip_rt_put(rt);
goto tx_dropped;
}
@@ -777,10 +777,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
- if (max_headroom > dev->needed_headroom)
- dev->needed_headroom = max_headroom;
+ if (max_headroom > READ_ONCE(dev->needed_headroom))
+ WRITE_ONCE(dev->needed_headroom, max_headroom);
- if (skb_cow_head(skb, dev->needed_headroom)) {
+ if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
ip_rt_put(rt);
dev->stats.tx_dropped++;
kfree_skb(skb);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 15c71b08c2df..a3536dfe9b16 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -319,12 +319,12 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
switch (skb->protocol) {
case htons(ETH_P_IP):
- xfrm_decode_session(skb, &fl, AF_INET);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ xfrm_decode_session(skb, &fl, AF_INET);
break;
case htons(ETH_P_IPV6):
- xfrm_decode_session(skb, &fl, AF_INET6);
memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ xfrm_decode_session(skb, &fl, AF_INET6);
break;
default:
goto tx_err;
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 04311f7067e2..9a6b01d85cd0 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -1,4 +1,5 @@
#include <linux/netlink.h>
+#include <linux/nospec.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>
#include <net/ip.h>
@@ -24,6 +25,7 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
if (type > RTAX_MAX)
return -EINVAL;
+ type = array_index_nospec(type, RTAX_MAX + 1);
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index 4824b1e183a1..bff2b85c5fd6 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -96,11 +96,11 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
const struct net_device *indev)
{
- __be32 uninitialized_var(daddr), uninitialized_var(saddr);
- __be16 uninitialized_var(dport), uninitialized_var(sport);
+ __be32 daddr, saddr;
+ __be16 dport, sport;
const struct iphdr *iph = ip_hdr(skb);
struct sk_buff *data_skb = NULL;
- u8 uninitialized_var(protocol);
+ u8 protocol;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
enum ip_conntrack_info ctinfo;
struct nf_conn const *ct;
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index 0af3d8df70dd..157bca240edc 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -16,8 +16,8 @@
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
struct nft_dup_ipv4 {
- enum nft_registers sreg_addr:8;
- enum nft_registers sreg_dev:8;
+ u8 sreg_addr;
+ u8 sreg_dev;
};
static void nft_dup_ipv4_eval(const struct nft_expr *expr,
@@ -43,16 +43,16 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_ADDR] == NULL)
return -EINVAL;
- priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
- err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr));
+ err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
+ sizeof(struct in_addr));
if (err < 0)
return err;
- if (tb[NFTA_DUP_SREG_DEV] != NULL) {
- priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
- }
- return 0;
+ if (tb[NFTA_DUP_SREG_DEV])
+ err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV],
+ &priv->sreg_dev, sizeof(int));
+
+ return err;
}
static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 654f586fc0d7..8ad120c07096 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -563,6 +563,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
ipcm_init_sk(&ipc, inet);
+ /* Keep backward compat */
+ if (hdrincl)
+ ipc.protocol = IPPROTO_RAW;
if (msg->msg_controllen) {
err = ip_cmsg_send(sk, msg, &ipc, false);
@@ -630,7 +633,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE,
- hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ hdrincl ? ipc.protocol : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0, sk->sk_uid);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 57e2316529d0..f4d41ceef946 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -791,7 +791,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
goto reject_redirect;
}
- n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
+ n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
if (!n)
n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
if (!IS_ERR(n)) {
@@ -1215,6 +1215,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
+ struct net_device *dev;
struct ip_options opt;
int res;
@@ -1232,7 +1233,8 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb)
opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
- res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
+ dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
+ res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
rcu_read_unlock();
if (res)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 68f89fe7f923..00648a478c6a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -515,6 +515,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
__poll_t mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+ u8 shutdown;
int state;
sock_poll_wait(file, sock, wait);
@@ -557,9 +558,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
* blocking on fresh not-connected or disconnected socket. --ANK
*/
- if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
+ shutdown = READ_ONCE(sk->sk_shutdown);
+ if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= EPOLLHUP;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
+ if (shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
/* Connected or passive Fast Open socket? */
@@ -575,8 +577,8 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tcp_stream_is_readable(tp, target, sk))
mask |= EPOLLIN | EPOLLRDNORM;
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_is_writeable(sk)) {
+ if (!(shutdown & SEND_SHUTDOWN)) {
+ if (__sk_stream_is_writeable(sk, 1)) {
mask |= EPOLLOUT | EPOLLWRNORM;
} else { /* send SIGIO later */
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -588,7 +590,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* pairs with the input side.
*/
smp_mb__after_atomic();
- if (sk_stream_is_writeable(sk))
+ if (__sk_stream_is_writeable(sk, 1))
mask |= EPOLLOUT | EPOLLWRNORM;
}
} else
@@ -2338,14 +2340,13 @@ bool tcp_check_oom(struct sock *sk, int shift)
return too_many_orphans || out_of_socket_memory;
}
-void tcp_close(struct sock *sk, long timeout)
+void __tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
int data_was_unread = 0;
int state;
- lock_sock(sk);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
@@ -2505,6 +2506,12 @@ adjudge_to_death:
out:
bh_unlock_sock(sk);
local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ lock_sock(sk);
+ __tcp_close(sk, timeout);
release_sock(sk);
sock_put(sk);
}
@@ -2561,6 +2568,12 @@ int tcp_disconnect(struct sock *sk, int flags)
int old_state = sk->sk_state;
u32 seq;
+ /* Deny disconnect if other threads are blocked in sk_wait_event()
+ * or inet_wait_for_connect().
+ */
+ if (sk->sk_wait_pending)
+ return -EBUSY;
+
if (old_state != TCP_CLOSE)
tcp_set_state(sk, TCP_CLOSE);
@@ -2593,7 +2606,7 @@ int tcp_disconnect(struct sock *sk, int flags)
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
- sk->sk_shutdown = 0;
+ WRITE_ONCE(sk->sk_shutdown, 0);
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
tp->rcv_rtt_last_tsecr = 0;
@@ -2987,18 +3000,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_LINGER2:
if (val < 0)
- tp->linger2 = -1;
- else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
- tp->linger2 = 0;
+ WRITE_ONCE(tp->linger2, -1);
+ else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
+ WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
else
- tp->linger2 = val * HZ;
+ WRITE_ONCE(tp->linger2, val * HZ);
break;
case TCP_DEFER_ACCEPT:
/* Translate value in seconds to number of retransmits */
- icsk->icsk_accept_queue.rskq_defer_accept =
- secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
- TCP_RTO_MAX / HZ);
+ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ));
break;
case TCP_WINDOW_CLAMP:
@@ -3086,7 +3099,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = tcp_repair_set_window(tp, optval, optlen);
break;
case TCP_NOTSENT_LOWAT:
- tp->notsent_lowat = val;
+ WRITE_ONCE(tp->notsent_lowat, val);
sk->sk_write_space(sk);
break;
case TCP_INQ:
@@ -3363,7 +3376,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
switch (optname) {
case TCP_MAXSEG:
val = tp->mss_cache;
- if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ if (tp->rx_opt.user_mss &&
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
if (tp->repair)
val = tp->rx_opt.mss_clamp;
@@ -3387,13 +3401,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
break;
case TCP_LINGER2:
- val = tp->linger2;
+ val = READ_ONCE(tp->linger2);
if (val >= 0)
val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
- TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
+ val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
+ val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ);
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
@@ -3539,7 +3554,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_FASTOPEN:
- val = icsk->icsk_accept_queue.fastopenq.max_qlen;
+ val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
break;
case TCP_FASTOPEN_CONNECT:
@@ -3554,7 +3569,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tcp_time_stamp_raw() + tp->tsoffset;
break;
case TCP_NOTSENT_LOWAT:
- val = tp->notsent_lowat;
+ val = READ_ONCE(tp->notsent_lowat);
break;
case TCP_INQ:
val = tp->recvmsg_inq;
@@ -3802,7 +3817,7 @@ void tcp_done(struct sock *sk)
if (req)
reqsk_fastopen_remove(sk, req, false);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_state_change(sk);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f726591de7c7..f7bb78b443fa 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -276,6 +276,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
static bool tcp_fastopen_queue_check(struct sock *sk)
{
struct fastopen_queue *fastopenq;
+ int max_qlen;
/* Make sure the listener has enabled fastopen, and we don't
* exceed the max # of pending TFO requests allowed before trying
@@ -288,10 +289,11 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* temporarily vs a server not supporting Fast Open at all.
*/
fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
- if (fastopenq->max_qlen == 0)
+ max_qlen = READ_ONCE(fastopenq->max_qlen);
+ if (max_qlen == 0)
return false;
- if (fastopenq->qlen >= fastopenq->max_qlen) {
+ if (fastopenq->qlen >= max_qlen) {
struct request_sock *req1;
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 11716780667c..407ad07dc598 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -172,6 +172,19 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
if (unlikely(len > icsk->icsk_ack.rcv_mss +
MAX_TCP_OPTION_SPACE))
tcp_gro_dev_warn(sk, skb, len);
+ /* If the skb has a len of exactly 1*MSS and has the PSH bit
+ * set then it is likely the end of an application write. So
+ * more data may not be arriving soon, and yet the data sender
+ * may be waiting for an ACK if cwnd-bound or using TX zero
+ * copy. So we set ICSK_ACK_PUSHED here so that
+ * tcp_cleanup_rbuf() will send an ACK immediately if the app
+ * reads all of the data and is not ping-pong. If len > MSS
+ * then this logic does not matter (and does not hurt) because
+ * tcp_cleanup_rbuf() will always ACK immediately if the app
+ * reads data and there is more than an MSS of unACKed data.
+ */
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
@@ -216,7 +229,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
icsk->icsk_ack.quick = quickacks;
}
-void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -224,7 +237,6 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
-EXPORT_SYMBOL(tcp_enter_quickack_mode);
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
@@ -3429,8 +3441,11 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
u32 *last_oow_ack_time)
{
- if (*last_oow_ack_time) {
- s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
+ /* Paired with the WRITE_ONCE() in this function. */
+ u32 val = READ_ONCE(*last_oow_ack_time);
+
+ if (val) {
+ s32 elapsed = (s32)(tcp_jiffies32 - val);
if (0 <= elapsed &&
elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
@@ -3439,7 +3454,10 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
}
}
- *last_oow_ack_time = tcp_jiffies32;
+ /* Paired with the prior READ_ONCE() and with itself,
+ * as we might be lockless.
+ */
+ WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);
return false; /* not rate-limited: go ahead, send dupack now! */
}
@@ -3622,8 +3640,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* then we can probably ignore it.
*/
if (before(ack, prior_snd_una)) {
+ u32 max_window;
+
+ /* do not accept ACK for bytes we never sent. */
+ max_window = min_t(u64, tp->max_window, tp->bytes_acked);
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
- if (before(ack, prior_snd_una - tp->max_window)) {
+ if (before(ack, prior_snd_una - max_window)) {
if (!(flag & FLAG_NO_CHALLENGE_ACK))
tcp_send_challenge_ack(sk, skb);
return -1;
@@ -4130,7 +4152,7 @@ void tcp_fin(struct sock *sk)
inet_csk_schedule_ack(sk);
- sk->sk_shutdown |= RCV_SHUTDOWN;
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
@@ -6209,7 +6231,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
tcp_set_state(sk, TCP_FIN_WAIT2);
- sk->sk_shutdown |= SEND_SHUTDOWN;
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN);
sk_dst_confirm(sk);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4960e2b6bd7f..60619b1f4acd 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -40,7 +40,7 @@ struct tcp_fastopen_metrics {
struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next;
- possible_net_t tcpm_net;
+ struct net *tcpm_net;
struct inetpeer_addr tcpm_saddr;
struct inetpeer_addr tcpm_daddr;
unsigned long tcpm_stamp;
@@ -51,34 +51,38 @@ struct tcp_metrics_block {
struct rcu_head rcu_head;
};
-static inline struct net *tm_net(struct tcp_metrics_block *tm)
+static inline struct net *tm_net(const struct tcp_metrics_block *tm)
{
- return read_pnet(&tm->tcpm_net);
+ /* Paired with the WRITE_ONCE() in tcpm_new() */
+ return READ_ONCE(tm->tcpm_net);
}
static bool tcp_metric_locked(struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
- return tm->tcpm_lock & (1 << idx);
+ /* Paired with WRITE_ONCE() in tcpm_suck_dst() */
+ return READ_ONCE(tm->tcpm_lock) & (1 << idx);
}
-static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+static u32 tcp_metric_get(const struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
- return tm->tcpm_vals[idx];
+ /* Paired with WRITE_ONCE() in tcp_metric_set() */
+ return READ_ONCE(tm->tcpm_vals[idx]);
}
static void tcp_metric_set(struct tcp_metrics_block *tm,
enum tcp_metric_index idx,
u32 val)
{
- tm->tcpm_vals[idx] = val;
+ /* Paired with READ_ONCE() in tcp_metric_get() */
+ WRITE_ONCE(tm->tcpm_vals[idx], val);
}
static bool addr_same(const struct inetpeer_addr *a,
const struct inetpeer_addr *b)
{
- return inetpeer_addr_cmp(a, b) == 0;
+ return (a->family == b->family) && !inetpeer_addr_cmp(a, b);
}
struct tcpm_hash_bucket {
@@ -89,6 +93,7 @@ static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly;
static unsigned int tcp_metrics_hash_log __read_mostly;
static DEFINE_SPINLOCK(tcp_metrics_lock);
+static DEFINE_SEQLOCK(fastopen_seqlock);
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
const struct dst_entry *dst,
@@ -97,7 +102,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
u32 msval;
u32 val;
- tm->tcpm_stamp = jiffies;
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
val = 0;
if (dst_metric_locked(dst, RTAX_RTT))
@@ -110,30 +115,42 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
val |= 1 << TCP_METRIC_CWND;
if (dst_metric_locked(dst, RTAX_REORDERING))
val |= 1 << TCP_METRIC_REORDERING;
- tm->tcpm_lock = val;
+ /* Paired with READ_ONCE() in tcp_metric_locked() */
+ WRITE_ONCE(tm->tcpm_lock, val);
msval = dst_metric_raw(dst, RTAX_RTT);
- tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
+ tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC);
msval = dst_metric_raw(dst, RTAX_RTTVAR);
- tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
- tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
- tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
- tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC);
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ dst_metric_raw(dst, RTAX_SSTHRESH));
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ dst_metric_raw(dst, RTAX_CWND));
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ dst_metric_raw(dst, RTAX_REORDERING));
if (fastopen_clear) {
+ write_seqlock(&fastopen_seqlock);
tm->tcpm_fastopen.mss = 0;
tm->tcpm_fastopen.syn_loss = 0;
tm->tcpm_fastopen.try_exp = 0;
tm->tcpm_fastopen.cookie.exp = false;
tm->tcpm_fastopen.cookie.len = 0;
+ write_sequnlock(&fastopen_seqlock);
}
}
#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
-static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+static void tcpm_check_stamp(struct tcp_metrics_block *tm,
+ const struct dst_entry *dst)
{
- if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+ unsigned long limit;
+
+ if (!tm)
+ return;
+ limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT;
+ if (unlikely(time_after(jiffies, limit)))
tcpm_suck_dst(tm, dst, false);
}
@@ -174,20 +191,23 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
oldest = deref_locked(tcp_metrics_hash[hash].chain);
for (tm = deref_locked(oldest->tcpm_next); tm;
tm = deref_locked(tm->tcpm_next)) {
- if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+ if (time_before(READ_ONCE(tm->tcpm_stamp),
+ READ_ONCE(oldest->tcpm_stamp)))
oldest = tm;
}
tm = oldest;
} else {
- tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+ tm = kzalloc(sizeof(*tm), GFP_ATOMIC);
if (!tm)
goto out_unlock;
}
- write_pnet(&tm->tcpm_net, net);
+ /* Paired with the READ_ONCE() in tm_net() */
+ WRITE_ONCE(tm->tcpm_net, net);
+
tm->tcpm_saddr = *saddr;
tm->tcpm_daddr = *daddr;
- tcpm_suck_dst(tm, dst, true);
+ tcpm_suck_dst(tm, dst, reclaim);
if (likely(!reclaim)) {
tm->tcpm_next = tcp_metrics_hash[hash].chain;
@@ -431,7 +451,7 @@ void tcp_update_metrics(struct sock *sk)
tp->reordering);
}
}
- tm->tcpm_stamp = jiffies;
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
out_unlock:
rcu_read_unlock();
}
@@ -446,11 +466,15 @@ void tcp_init_metrics(struct sock *sk)
u32 val, crtt = 0; /* cached RTT scaled by 8 */
sk_dst_confirm(sk);
+ /* ssthresh may have been reduced unnecessarily during the
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
if (!dst)
goto reset;
rcu_read_lock();
- tm = tcp_get_metrics(sk, dst, true);
+ tm = tcp_get_metrics(sk, dst, false);
if (!tm) {
rcu_read_unlock();
goto reset;
@@ -464,11 +488,6 @@ void tcp_init_metrics(struct sock *sk)
tp->snd_ssthresh = val;
if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
tp->snd_ssthresh = tp->snd_cwnd_clamp;
- } else {
- /* ssthresh may have been reduced unnecessarily during.
- * 3WHS. Restore it back to its initial default.
- */
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
}
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val && tp->reordering != val)
@@ -544,8 +563,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
return ret;
}
-static DEFINE_SEQLOCK(fastopen_seqlock);
-
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie)
{
@@ -652,7 +669,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
}
if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
- jiffies - tm->tcpm_stamp,
+ jiffies - READ_ONCE(tm->tcpm_stamp),
TCP_METRICS_ATTR_PAD) < 0)
goto nla_put_failure;
@@ -663,7 +680,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
if (!nest)
goto nla_put_failure;
for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
- u32 val = tm->tcpm_vals[i];
+ u32 val = tcp_metric_get(tm, i);
if (!val)
continue;
@@ -895,7 +912,7 @@ static void tcp_metrics_flush_all(struct net *net)
match = net ? net_eq(tm_net(tm), net) :
!refcount_read(&tm_net(tm)->count);
if (match) {
- *pp = tm->tcpm_next;
+ rcu_assign_pointer(*pp, tm->tcpm_next);
kfree_rcu(tm, rcu_head);
} else {
pp = &tm->tcpm_next;
@@ -936,7 +953,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
if (addr_same(&tm->tcpm_daddr, &daddr) &&
(!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
net_eq(tm_net(tm), net)) {
- *pp = tm->tcpm_next;
+ rcu_assign_pointer(*pp, tm->tcpm_next);
kfree_rcu(tm, rcu_head);
found = true;
} else {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0fc238d79b03..bae0199a943b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -582,6 +582,9 @@ EXPORT_SYMBOL(tcp_create_openreq_child);
* validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
*
* We don't need to initialize tmp_opt.sack_ok as we don't use the results
+ *
+ * Note: If @fastopen is true, this can be called from process context.
+ * Otherwise, this is from BH context.
*/
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
@@ -734,7 +737,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
&tcp_rsk(req)->last_oow_ack_time))
req->rsk_ops->send_ack(sk, skb, req);
if (paws_reject)
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
return NULL;
}
@@ -753,7 +756,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* "fourth, check the SYN bit"
*/
if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
goto embryonic_reset;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8962864223b4..670804d4c169 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -164,8 +164,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
}
/* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
- u32 rcv_nxt)
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -179,7 +178,7 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
if (unlikely(rcv_nxt != tp->rcv_nxt))
return; /* Special ACK sent by DCTCP to reflect ECN */
- tcp_dec_quickack_mode(sk, pkts);
+ tcp_dec_quickack_mode(sk);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
@@ -1091,7 +1090,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
skb_set_hash_from_sk(skb, sk);
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+ skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
@@ -1139,7 +1138,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
icsk->icsk_af_ops->send_check(sk, skb);
if (likely(tcb->tcp_flags & TCPHDR_ACK))
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
+ tcp_event_ack_sent(sk, rcv_nxt);
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
@@ -2221,6 +2220,18 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
+static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
+{
+ const struct rb_node *node = sk->tcp_rtx_queue.rb_node;
+
+ /* No skb in the rtx queue. */
+ if (!node)
+ return true;
+
+ /* Only one skb in rtx queue. */
+ return !node->rb_left && !node->rb_right;
+}
+
/* TCP Small Queues :
* Control number of packets in qdisc/devices to two packets / or ~1 ms.
* (These limits are doubled for retransmits)
@@ -2243,12 +2254,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit <<= factor;
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
- /* Always send skb if rtx queue is empty.
+ /* Always send skb if rtx queue is empty or has one skb.
* No need to wait for TX completion to call us back,
* after softirq/tasklet schedule.
* This helps when TX completions are delayed too much.
*/
- if (tcp_rtx_queue_empty(sk))
+ if (tcp_rtx_queue_empty_or_single_skb(sk))
return false;
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2450,7 +2461,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- u32 timeout, rto_delta_us;
+ u32 timeout, timeout_us, rto_delta_us;
int early_retrans;
/* Don't do any loss probe on a Fast Open connection before 3WHS
@@ -2474,11 +2485,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
* sample is available then probe after TCP_TIMEOUT_INIT.
*/
if (tp->srtt_us) {
- timeout = usecs_to_jiffies(tp->srtt_us >> 2);
+ timeout_us = tp->srtt_us >> 2;
if (tp->packets_out == 1)
- timeout += TCP_RTO_MIN;
+ timeout_us += tcp_rto_min_us(sk);
else
- timeout += TCP_TIMEOUT_MIN;
+ timeout_us += TCP_TIMEOUT_MIN_US;
+ timeout = usecs_to_jiffies(timeout_us);
} else {
timeout = TCP_TIMEOUT_INIT;
}
@@ -2878,7 +2890,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (skb_still_in_host_queue(sk, skb))
return -EBUSY;
+start:
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+ TCP_SKB_CB(skb)->seq++;
+ goto start;
+ }
if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
WARN_ON_ONCE(1);
return -EINVAL;
@@ -3307,7 +3325,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 61969bb9395c..844ff390f726 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -122,7 +122,7 @@ bool tcp_rack_mark_lost(struct sock *sk)
tp->rack.advanced = 0;
tcp_rack_detect_loss(sk, &timeout);
if (timeout) {
- timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
+ timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 84069db0423a..d8d28ba169b4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -549,7 +549,9 @@ out_reset_timer:
tcp_stream_is_thin(tp) &&
icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
icsk->icsk_backoff = 0;
- icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+ icsk->icsk_rto = clamp(__tcp_set_rto(tp),
+ tcp_rto_min(sk),
+ TCP_RTO_MAX);
} else {
/* Use normal (exponential) backoff */
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dd3fa0ff84e7..a6048cc7fc35 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1458,7 +1458,7 @@ drop:
}
EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
-void udp_destruct_sock(struct sock *sk)
+void udp_destruct_common(struct sock *sk)
{
/* reclaim completely the forward allocated memory */
struct udp_sock *up = udp_sk(sk);
@@ -1471,10 +1471,14 @@ void udp_destruct_sock(struct sock *sk)
kfree_skb(skb);
}
udp_rmem_release(sk, total, 0, true);
+}
+EXPORT_SYMBOL_GPL(udp_destruct_common);
+static void udp_destruct_sock(struct sock *sk)
+{
+ udp_destruct_common(sk);
inet_sock_destruct(sk);
}
-EXPORT_SYMBOL_GPL(udp_destruct_sock);
int udp_init_sock(struct sock *sk)
{
@@ -1482,7 +1486,6 @@ int udp_init_sock(struct sock *sk)
sk->sk_destruct = udp_destruct_sock;
return 0;
}
-EXPORT_SYMBOL_GPL(udp_init_sock);
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
{
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..d03d74388870 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -186,6 +186,7 @@ EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
void udp_tunnel_sock_release(struct socket *sock)
{
rcu_assign_sk_user_data(sock->sk, NULL);
+ synchronize_rcu();
kernel_sock_shutdown(sock, SHUT_RDWR);
sock_release(sock);
}
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 8545457752fb..27173549b000 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -20,6 +20,14 @@
struct udp_table udplite_table __read_mostly;
EXPORT_SYMBOL(udplite_table);
+/* Designate sk as UDP-Lite socket */
+static int udplite_sk_init(struct sock *sk)
+{
+ udp_init_sock(sk);
+ udp_sk(sk)->pcflag = UDPLITE_BIT;
+ return 0;
+}
+
static int udplite_rcv(struct sk_buff *skb)
{
return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
@@ -56,6 +64,8 @@ struct proto udplite_prot = {
.get_port = udp_v4_get_port,
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp_sock),
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT