aboutsummaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/bpf_sk_storage.c5
-rw-r--r--net/core/datagram.c24
-rw-r--r--net/core/dev.c480
-rw-r--r--net/core/devlink.c44
-rw-r--r--net/core/drop_monitor.c11
-rw-r--r--net/core/ethtool.c4
-rw-r--r--net/core/fib_rules.c2
-rw-r--r--net/core/filter.c67
-rw-r--r--net/core/flow_dissector.c69
-rw-r--r--net/core/gen_estimator.c6
-rw-r--r--net/core/gen_stats.c12
-rw-r--r--net/core/lwt_bpf.c11
-rw-r--r--net/core/neighbour.c17
-rw-r--r--net/core/net-sysfs.c2
-rw-r--r--net/core/net_namespace.c45
-rw-r--r--net/core/netclassid_cgroup.c51
-rw-r--r--net/core/netprio_cgroup.c2
-rw-r--r--net/core/pktgen.c6
-rw-r--r--net/core/request_sock.c2
-rw-r--r--net/core/rtnetlink.c55
-rw-r--r--net/core/skbuff.c37
-rw-r--r--net/core/skmsg.c144
-rw-r--r--net/core/sock.c64
-rw-r--r--net/core/sock_map.c69
-rw-r--r--net/core/sock_reuseport.c1
-rw-r--r--net/core/sysctl_net_core.c2
-rw-r--r--net/core/utils.c20
27 files changed, 891 insertions, 361 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d1c4e1f3be2c..e013639b23c7 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -633,9 +633,10 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
- nbuckets = 1U << smap->bucket_log;
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 45a162ef5e02..acb7d2b04a07 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -97,7 +98,7 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
if (error)
goto out_err;
- if (sk->sk_receive_queue.prev != skb)
+ if (READ_ONCE(sk->sk_receive_queue.prev) != skb)
goto out;
/* Socket shut down? */
@@ -278,7 +279,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
break;
sk_busy_loop(sk, flags & MSG_DONTWAIT);
- } while (sk->sk_receive_queue.prev != *last);
+ } while (READ_ONCE(sk->sk_receive_queue.prev) != *last);
error = -EAGAIN;
@@ -407,6 +408,11 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
}
EXPORT_SYMBOL(skb_kill_datagram);
+INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
+ size_t bytes,
+ void *data __always_unused,
+ struct iov_iter *i));
+
static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len, bool fault_short,
size_t (*cb)(const void *, size_t, void *,
@@ -420,7 +426,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- n = cb(skb->data + offset, copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ skb->data + offset, copy, data, to);
offset += n;
if (n != copy)
goto short_copy;
@@ -442,8 +449,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
- n = cb(vaddr + frag->page_offset +
- offset - start, copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + frag->page_offset + offset - start,
+ copy, data, to);
kunmap(page);
offset += n;
if (n != copy)
@@ -640,7 +648,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
skb->len += copied;
skb->truesize += truesize;
if (sk && sk->sk_type == SOCK_STREAM) {
- sk->sk_wmem_queued += truesize;
+ sk_wmem_queued_add(sk, truesize);
sk_mem_charge(sk, truesize);
} else {
refcount_add(truesize, &skb->sk->sk_wmem_alloc);
@@ -767,7 +775,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
mask = 0;
/* exceptional events? */
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
@@ -777,7 +785,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
mask |= EPOLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
diff --git a/net/core/dev.c b/net/core/dev.c
index 828ecca03c07..50ae8a2db946 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -79,6 +79,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -194,7 +195,7 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static seqcount_t devnet_rename_seq;
+static DECLARE_RWSEM(devnet_rename_sem);
static inline void dev_base_seq_inc(struct net *net)
{
@@ -217,14 +218,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- spin_lock(&sd->input_pkt_queue.lock);
+ raw_spin_lock(&sd->input_pkt_queue.raw_lock);
#endif
}
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- spin_unlock(&sd->input_pkt_queue.lock);
+ raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
#endif
}
@@ -898,33 +899,28 @@ EXPORT_SYMBOL(dev_get_by_napi_id);
* @net: network namespace
* @name: a pointer to the buffer where the name will be stored.
* @ifindex: the ifindex of the interface to get the name from.
- *
- * The use of raw_seqcount_begin() and cond_resched() before
- * retrying is required as we want to give the writers a chance
- * to complete when CONFIG_PREEMPT is not set.
*/
int netdev_get_name(struct net *net, char *name, int ifindex)
{
struct net_device *dev;
- unsigned int seq;
+ int ret;
-retry:
- seq = raw_seqcount_begin(&devnet_rename_seq);
+ down_read(&devnet_rename_sem);
rcu_read_lock();
+
dev = dev_get_by_index_rcu(net, ifindex);
if (!dev) {
- rcu_read_unlock();
- return -ENODEV;
+ ret = -ENODEV;
+ goto out;
}
strcpy(name, dev->name);
- rcu_read_unlock();
- if (read_seqcount_retry(&devnet_rename_seq, seq)) {
- cond_resched();
- goto retry;
- }
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ up_read(&devnet_rename_sem);
+ return ret;
}
/**
@@ -1197,10 +1193,10 @@ int dev_change_name(struct net_device *dev, const char *newname)
likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
return -EBUSY;
- write_seqcount_begin(&devnet_rename_seq);
+ down_write(&devnet_rename_sem);
if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
return 0;
}
@@ -1208,7 +1204,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
err = dev_get_valid_name(net, dev, newname);
if (err < 0) {
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
return err;
}
@@ -1223,11 +1219,11 @@ rollback:
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
return ret;
}
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
netdev_adjacent_rename_links(dev, oldname);
@@ -1248,7 +1244,7 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- write_seqcount_begin(&devnet_rename_seq);
+ down_write(&devnet_rename_sem);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
@@ -2750,6 +2746,7 @@ static void __netif_reschedule(struct Qdisc *q)
sd->output_queue_tailp = &q->next_sched;
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
void __netif_schedule(struct Qdisc *q)
@@ -2812,6 +2809,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
__this_cpu_write(softnet_data.completion_queue, skb);
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
@@ -2877,6 +2875,8 @@ static u16 skb_tx_hash(const struct net_device *dev,
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
+ if (hash >= qoffset)
+ hash -= qoffset;
while (unlikely(hash >= qcount))
hash -= qcount;
return hash + qoffset;
@@ -3475,26 +3475,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
if (q->flags & TCQ_F_NOLOCK) {
- if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty &&
- qdisc_run_begin(q)) {
- if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
- &q->state))) {
- __qdisc_drop(skb, &to_free);
- rc = NET_XMIT_DROP;
- goto end_run;
- }
- qdisc_bstats_cpu_update(q, skb);
-
- rc = NET_XMIT_SUCCESS;
- if (sch_direct_xmit(skb, q, dev, txq, NULL, true))
- __qdisc_run(q);
-
-end_run:
- qdisc_run_end(q);
- } else {
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- qdisc_run(q);
- }
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ qdisc_run(q);
if (unlikely(to_free))
kfree_skb_list(to_free);
@@ -3507,7 +3489,11 @@ end_run:
* This permits qdisc->running owner to get the lock more
* often and dequeue packets faster.
*/
+#ifdef CONFIG_PREEMPT_RT_FULL
+ contended = true;
+#else
contended = qdisc_is_running(q);
+#endif
if (unlikely(contended))
spin_lock(&q->busylock);
@@ -3942,10 +3928,12 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
local_bh_disable();
+ dev_xmit_recursion_inc();
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_drv_stopped(txq))
ret = netdev_start_xmit(skb, dev, txq, false);
HARD_TX_UNLOCK(dev, txq);
+ dev_xmit_recursion_dec();
local_bh_enable();
@@ -3969,7 +3957,8 @@ EXPORT_SYMBOL(netdev_max_backlog);
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
-unsigned int __read_mostly netdev_budget_usecs = 2000;
+/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
+unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64; /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
@@ -4298,6 +4287,7 @@ drop:
rps_unlock(sd);
local_irq_restore(flags);
+ preempt_check_resched_rt();
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
@@ -4343,14 +4333,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
- if (skb_cloned(skb) || skb_is_tc_redirected(skb))
+ if (skb_is_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
* of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
* native XDP provides, thus we need to do it here as well.
*/
- if (skb_is_nonlinear(skb) ||
+ if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
int troom = skb->tail + skb->data_len - skb->end;
@@ -4512,7 +4502,7 @@ static int netif_rx_internal(struct sk_buff *skb)
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
- preempt_disable();
+ migrate_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4522,14 +4512,14 @@ static int netif_rx_internal(struct sk_buff *skb)
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- preempt_enable();
+ migrate_enable();
} else
#endif
{
unsigned int qtail;
- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
- put_cpu();
+ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
+ put_cpu_light();
}
return ret;
}
@@ -4568,11 +4558,9 @@ int netif_rx_ni(struct sk_buff *skb)
trace_netif_rx_ni_entry(skb);
- preempt_disable();
+ local_bh_disable();
err = netif_rx_internal(skb);
- if (local_softirq_pending())
- do_softirq();
- preempt_enable();
+ local_bh_enable();
trace_netif_rx_ni_exit(err);
return err;
@@ -4818,11 +4806,12 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
return 0;
}
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
+ struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
@@ -4853,8 +4842,10 @@ another_round:
ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
preempt_enable();
- if (ret2 != XDP_PASS)
- return NET_RX_DROP;
+ if (ret2 != XDP_PASS) {
+ ret = NET_RX_DROP;
+ goto out;
+ }
skb_reset_mac_len(skb);
}
@@ -4894,7 +4885,7 @@ skip_taps:
goto out;
}
#endif
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
skip_classify:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
@@ -5004,6 +4995,13 @@ drop:
}
out:
+ /* The invariant here is that if *ppt_prev is not NULL
+ * then skb should also be non-NULL.
+ *
+ * Apparently *ppt_prev assignment above holds this invariant due to
+ * skb dereferencing near it.
+ */
+ *pskb = skb;
return ret;
}
@@ -5013,7 +5011,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
struct packet_type *pt_prev = NULL;
int ret;
- ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (pt_prev)
ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
skb->dev, pt_prev, orig_dev);
@@ -5091,7 +5089,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
struct packet_type *pt_prev = NULL;
skb_list_del_init(skb);
- __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (!pt_prev)
continue;
if (pt_curr != pt_prev || od_curr != orig_dev) {
@@ -5326,7 +5324,7 @@ static void flush_backlog(struct work_struct *work)
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
- kfree_skb(skb);
+ __skb_queue_tail(&sd->tofree_queue, skb);
input_queue_head_incr(sd);
}
}
@@ -5336,11 +5334,14 @@ static void flush_backlog(struct work_struct *work)
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
- kfree_skb(skb);
+ __skb_queue_tail(&sd->tofree_queue, skb);
input_queue_head_incr(sd);
}
}
+ if (!skb_queue_empty(&sd->tofree_queue))
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
local_bh_enable();
+
}
static void flush_all_backlogs(void)
@@ -5903,12 +5904,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
sd->rps_ipi_list = NULL;
local_irq_enable();
+ preempt_check_resched_rt();
/* Send pending IPI's to kick RPS processing on remote cpus. */
net_rps_send_ipi(remsd);
} else
#endif
local_irq_enable();
+ preempt_check_resched_rt();
}
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
@@ -5938,7 +5941,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
while (again) {
struct sk_buff *skb;
+ local_irq_disable();
while ((skb = __skb_dequeue(&sd->process_queue))) {
+ local_irq_enable();
rcu_read_lock();
__netif_receive_skb(skb);
rcu_read_unlock();
@@ -5946,9 +5951,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
if (++work >= quota)
return work;
+ local_irq_disable();
}
- local_irq_disable();
rps_lock(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
@@ -5986,6 +5991,7 @@ void __napi_schedule(struct napi_struct *n)
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
EXPORT_SYMBOL(__napi_schedule);
@@ -6022,6 +6028,7 @@ bool napi_schedule_prep(struct napi_struct *n)
}
EXPORT_SYMBOL(napi_schedule_prep);
+#ifndef CONFIG_PREEMPT_RT_FULL
/**
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
@@ -6033,6 +6040,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
+#endif
bool napi_complete_done(struct napi_struct *n, int work_done)
{
@@ -6412,13 +6420,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
+ struct sk_buff_head tofree_q;
+ struct sk_buff *skb;
LIST_HEAD(list);
LIST_HEAD(repoll);
+ __skb_queue_head_init(&tofree_q);
+
local_irq_disable();
+ skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
+ while ((skb = __skb_dequeue(&tofree_q)))
+ kfree_skb(skb);
+
for (;;) {
struct napi_struct *n;
@@ -6616,6 +6632,21 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+static struct net_device *netdev_next_upper_dev(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ upper = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+
static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
struct list_head **iter)
{
@@ -6633,28 +6664,93 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
return upper->dev;
}
+static int netdev_walk_all_upper_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.upper;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ udev = netdev_next_upper_dev(now, &iter);
+ if (!udev)
+ break;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *dev,
void *data),
void *data)
{
- struct net_device *udev;
- struct list_head *iter;
- int ret;
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
- for (iter = &dev->adj_list.upper,
- udev = netdev_next_upper_dev_rcu(dev, &iter);
- udev;
- udev = netdev_next_upper_dev_rcu(dev, &iter)) {
- /* first is the upper device itself */
- ret = fn(udev, data);
- if (ret)
- return ret;
+ now = dev;
+ iter = &dev->adj_list.upper;
- /* then look at all of its upper devices */
- ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
- if (ret)
- return ret;
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ udev = netdev_next_upper_dev_rcu(now, &iter);
+ if (!udev)
+ break;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
}
return 0;
@@ -6762,31 +6858,50 @@ int netdev_walk_all_lower_dev(struct net_device *dev,
void *data),
void *data)
{
- struct net_device *ldev;
- struct list_head *iter;
- int ret;
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
- for (iter = &dev->adj_list.lower,
- ldev = netdev_next_lower_dev(dev, &iter);
- ldev;
- ldev = netdev_next_lower_dev(dev, &iter)) {
- /* first is the lower device itself */
- ret = fn(ldev, data);
- if (ret)
- return ret;
+ now = dev;
+ iter = &dev->adj_list.lower;
- /* then look at all of its lower devices */
- ret = netdev_walk_all_lower_dev(ldev, fn, data);
- if (ret)
- return ret;
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
}
return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
-static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
- struct list_head **iter)
+struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
@@ -6798,29 +6913,95 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
return lower->dev;
}
+EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
-int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
- int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+static u8 __netdev_upper_depth(struct net_device *dev)
+{
+ struct net_device *udev;
+ struct list_head *iter;
+ u8 max_depth = 0;
+
+ for (iter = &dev->adj_list.upper,
+ udev = netdev_next_upper_dev(dev, &iter);
+ udev;
+ udev = netdev_next_upper_dev(dev, &iter)) {
+ if (max_depth < udev->upper_level)
+ max_depth = udev->upper_level;
+ }
+
+ return max_depth;
+}
+
+static u8 __netdev_lower_depth(struct net_device *dev)
{
struct net_device *ldev;
struct list_head *iter;
- int ret;
+ u8 max_depth = 0;
for (iter = &dev->adj_list.lower,
- ldev = netdev_next_lower_dev_rcu(dev, &iter);
+ ldev = netdev_next_lower_dev(dev, &iter);
ldev;
- ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
- /* first is the lower device itself */
- ret = fn(ldev, data);
- if (ret)
- return ret;
+ ldev = netdev_next_lower_dev(dev, &iter)) {
+ if (max_depth < ldev->lower_level)
+ max_depth = ldev->lower_level;
+ }
- /* then look at all of its lower devices */
- ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
- if (ret)
- return ret;
+ return max_depth;
+}
+
+static int __netdev_update_upper_level(struct net_device *dev, void *data)
+{
+ dev->upper_level = __netdev_upper_depth(dev) + 1;
+ return 0;
+}
+
+static int __netdev_update_lower_level(struct net_device *dev, void *data)
+{
+ dev->lower_level = __netdev_lower_depth(dev) + 1;
+ return 0;
+}
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev_rcu(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
}
return 0;
@@ -7077,6 +7258,9 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
+ if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
+ return -EMLINK;
+
if (!master) {
if (netdev_has_upper_dev(dev, upper_dev))
return -EEXIST;
@@ -7103,6 +7287,12 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (ret)
goto rollback;
+ __netdev_update_upper_level(dev, NULL);
+ netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, NULL);
+ netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
+
return 0;
rollback:
@@ -7185,6 +7375,12 @@ void netdev_upper_dev_unlink(struct net_device *dev,
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
+
+ __netdev_update_upper_level(dev, NULL);
+ netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, NULL);
+ netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
@@ -7672,11 +7868,28 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
if (ops->ndo_change_mtu)
return ops->ndo_change_mtu(dev, new_mtu);
- dev->mtu = new_mtu;
+ /* Pairs with all the lockless reads of dev->mtu in the stack */
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
EXPORT_SYMBOL(__dev_set_mtu);
+int dev_validate_mtu(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
+{
+ /* MTU must be positive, and in range */
+ if (new_mtu < 0 || new_mtu < dev->min_mtu) {
+ NL_SET_ERR_MSG(extack, "mtu less than device minimum");
+ return -EINVAL;
+ }
+
+ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
+ NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
+ return -EINVAL;
+ }
+ return 0;
+}
+
/**
* dev_set_mtu_ext - Change maximum transfer unit
* @dev: device
@@ -7693,16 +7906,9 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
if (new_mtu == dev->mtu)
return 0;
- /* MTU must be positive, and in range */
- if (new_mtu < 0 || new_mtu < dev->min_mtu) {
- NL_SET_ERR_MSG(extack, "mtu less than device minimum");
- return -EINVAL;
- }
-
- if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
- NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
- return -EINVAL;
- }
+ err = dev_validate_mtu(dev, new_mtu, extack);
+ if (err)
+ return err;
if (!netif_device_present(dev))
return -ENODEV;
@@ -8291,11 +8497,13 @@ static void netdev_sync_lower_features(struct net_device *upper,
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
lower->wanted_features &= ~feature;
- netdev_update_features(lower);
+ __netdev_update_features(lower);
if (unlikely(lower->features & feature))
netdev_WARN(upper, "failed to disable %pNF on %s!\n",
&feature, lower->name);
+ else
+ netdev_features_change(lower);
}
}
}
@@ -8740,8 +8948,10 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
ret = netdev_register_kobject(dev);
- if (ret)
+ if (ret) {
+ dev->reg_state = NETREG_UNREGISTERED;
goto err_uninit;
+ }
dev->reg_state = NETREG_REGISTERED;
__netdev_update_features(dev);
@@ -8775,6 +8985,13 @@ int register_netdevice(struct net_device *dev)
rcu_barrier();
dev->reg_state = NETREG_UNREGISTERED;
+ /* We should put the kobject that hold in
+ * netdev_unregister_kobject(), otherwise
+ * the net device cannot be freed when
+ * driver calls free_netdev(), because the
+ * kobject is being hold.
+ */
+ kobject_put(&dev->dev.kobj);
}
/*
* Prevent userspace races by waiting until the network
@@ -9173,6 +9390,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->upper_level = 1;
+ dev->lower_level = 1;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
@@ -9421,7 +9640,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
- new_nsid = peernet2id_alloc(dev_net(dev), net);
+ new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
/* If there is an ifindex conflict assign a new one */
if (__dev_get_by_index(net, dev->ifindex))
new_ifindex = dev_new_index(net);
@@ -9517,6 +9736,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+ preempt_check_resched_rt();
#ifdef CONFIG_RPS
remsd = oldsd->rps_ipi_list;
@@ -9530,10 +9750,13 @@ static int dev_cpu_dead(unsigned int oldcpu)
netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}
- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}
+ while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
+ kfree_skb(skb);
+ }
return 0;
}
@@ -9844,8 +10067,9 @@ static int __init net_dev_init(void)
INIT_WORK(flush, flush_backlog);
- skb_queue_head_init(&sd->input_pkt_queue);
- skb_queue_head_init(&sd->process_queue);
+ skb_queue_head_init_raw(&sd->input_pkt_queue);
+ skb_queue_head_init_raw(&sd->process_queue);
+ skb_queue_head_init_raw(&sd->tofree_queue);
#ifdef CONFIG_XFRM_OFFLOAD
skb_queue_head_init(&sd->xfrm_backlog);
#endif
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 132f4b757963..119d203b0723 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3068,34 +3068,41 @@ devlink_param_value_get_from_info(const struct devlink_param *param,
struct genl_info *info,
union devlink_param_value *value)
{
+ struct nlattr *param_data;
int len;
- if (param->type != DEVLINK_PARAM_TYPE_BOOL &&
- !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])
+ param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA];
+
+ if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data)
return -EINVAL;
switch (param->type) {
case DEVLINK_PARAM_TYPE_U8:
- value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ if (nla_len(param_data) != sizeof(u8))
+ return -EINVAL;
+ value->vu8 = nla_get_u8(param_data);
break;
case DEVLINK_PARAM_TYPE_U16:
- value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ if (nla_len(param_data) != sizeof(u16))
+ return -EINVAL;
+ value->vu16 = nla_get_u16(param_data);
break;
case DEVLINK_PARAM_TYPE_U32:
- value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ if (nla_len(param_data) != sizeof(u32))
+ return -EINVAL;
+ value->vu32 = nla_get_u32(param_data);
break;
case DEVLINK_PARAM_TYPE_STRING:
- len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]),
- nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
- if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) ||
+ len = strnlen(nla_data(param_data), nla_len(param_data));
+ if (len == nla_len(param_data) ||
len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
return -EINVAL;
- strcpy(value->vstr,
- nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
+ strcpy(value->vstr, nla_data(param_data));
break;
case DEVLINK_PARAM_TYPE_BOOL:
- value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ?
- true : false;
+ if (param_data && nla_len(param_data))
+ return -EINVAL;
+ value->vbool = nla_get_flag(param_data);
break;
}
return 0;
@@ -3700,6 +3707,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
goto out_unlock;
}
+ /* return 0 if there is no further data to read */
+ if (start_offset >= region->size) {
+ err = 0;
+ goto out_unlock;
+ }
+
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
DEVLINK_CMD_REGION_READ);
@@ -3731,6 +3744,11 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
dump = false;
+
+ if (start_offset == end_offset) {
+ err = 0;
+ goto nla_put_failure;
+ }
}
err = devlink_nl_region_read_snapshot_fill(skb, devlink,
@@ -4979,6 +4997,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 },
[DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 4ea4347f5062..6a5cca21cab5 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -155,6 +155,7 @@ static void sched_send_work(struct timer_list *t)
static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
+ struct net_dm_drop_point *point;
struct nlmsghdr *nlh;
struct nlattr *nla;
int i;
@@ -173,11 +174,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
nlh = (struct nlmsghdr *)dskb->data;
nla = genlmsg_data(nlmsg_data(nlh));
msg = nla_data(nla);
+ point = msg->points;
for (i = 0; i < msg->entries; i++) {
- if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
- msg->points[i].count++;
+ if (!memcmp(&location, &point->pc, sizeof(void *))) {
+ point->count++;
goto out;
}
+ point++;
}
if (msg->entries == dm_hit_limit)
goto out;
@@ -186,8 +189,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
*/
__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
- memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
- msg->points[msg->entries].count = 1;
+ memcpy(point->pc, &location, sizeof(void *));
+ point->count = 1;
msg->entries++;
if (!timer_pending(&data->send_timer)) {
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4d1011b2e24f..427ae856a8ac 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1395,11 +1395,13 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr)
static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+ struct ethtool_wolinfo wol;
if (!dev->ethtool_ops->get_wol)
return -EOPNOTSUPP;
+ memset(&wol, 0, sizeof(struct ethtool_wolinfo));
+ wol.cmd = ETHTOOL_GWOL;
dev->ethtool_ops->get_wol(dev, &wol);
if (copy_to_user(useraddr, &wol, sizeof(wol)))
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index dd220ce7ca7a..bb11fc87bbae 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -967,7 +967,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
frh = nlmsg_data(nlh);
frh->family = ops->family;
- frh->table = rule->table;
+ frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT;
if (nla_put_u32(skb, FRA_TABLE, rule->table))
goto nla_put_failure;
if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
diff --git a/net/core/filter.c b/net/core/filter.c
index 7aee6f368754..633b048792b2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1765,25 +1765,27 @@ BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
u32, offset, void *, to, u32, len, u32, start_header)
{
u8 *end = skb_tail_pointer(skb);
- u8 *net = skb_network_header(skb);
- u8 *mac = skb_mac_header(skb);
- u8 *ptr;
+ u8 *start, *ptr;
- if (unlikely(offset > 0xffff || len > (end - mac)))
+ if (unlikely(offset > 0xffff))
goto err_clear;
switch (start_header) {
case BPF_HDR_START_MAC:
- ptr = mac + offset;
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ goto err_clear;
+ start = skb_mac_header(skb);
break;
case BPF_HDR_START_NET:
- ptr = net + offset;
+ start = skb_network_header(skb);
break;
default:
goto err_clear;
}
- if (likely(ptr >= mac && ptr + len <= end)) {
+ ptr = start + offset;
+
+ if (likely(ptr + len <= end)) {
memcpy(to, ptr, len);
return 0;
}
@@ -2054,6 +2056,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
}
skb->dev = dev;
+ skb->tstamp = 0;
dev_xmit_recursion_inc();
ret = dev_queue_xmit(skb);
@@ -2229,10 +2232,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
+ offset += len;
len = sk_msg_elem(msg, i)->length;
if (start < offset + len)
break;
- offset += len;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
@@ -2244,7 +2247,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
- if (!msg->sg.copy[i] && bytes_sg_total <= len)
+ if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
goto out;
/* At this point we need to linearize multiple scatterlist
@@ -2298,7 +2301,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
WARN_ON_ONCE(last_sge == first_sge);
shift = last_sge > first_sge ?
last_sge - first_sge - 1 :
- MAX_SKB_FRAGS - first_sge + last_sge - 1;
+ NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
if (!shift)
goto out;
@@ -2307,8 +2310,8 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
do {
u32 move_from;
- if (i + shift >= MAX_MSG_FRAGS)
- move_from = i + shift - MAX_MSG_FRAGS;
+ if (i + shift >= NR_MSG_FRAG_IDS)
+ move_from = i + shift - NR_MSG_FRAG_IDS;
else
move_from = i + shift;
if (move_from == msg->sg.end)
@@ -2322,7 +2325,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
} while (1);
msg->sg.end = msg->sg.end - shift > msg->sg.end ?
- msg->sg.end - shift + MAX_MSG_FRAGS :
+ msg->sg.end - shift + NR_MSG_FRAG_IDS :
msg->sg.end - shift;
out:
msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
@@ -2344,7 +2347,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
- u32 new, i = 0, l, space, copy = 0, offset = 0;
+ u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
u8 *raw, *to, *from;
struct page *page;
@@ -2354,11 +2357,11 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
+ offset += l;
l = sk_msg_elem(msg, i)->length;
if (start < offset + l)
break;
- offset += l;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
@@ -2413,6 +2416,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_next(i);
sg_unmark_end(psge);
+ sg_unmark_end(&rsge);
sk_msg_iter_next(msg, end);
}
@@ -2449,7 +2453,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
- msg->sg.copy[new] = false;
+ __clear_bit(new, &msg->sg.copy);
sg_set_page(&msg->sg.data[new], page, len + copy, 0);
if (rsge.length) {
get_page(sg_page(&rsge));
@@ -2504,7 +2508,7 @@ static void sk_msg_shift_right(struct sk_msg *msg, int i)
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
- u32 i = 0, l, space, offset = 0;
+ u32 i = 0, l = 0, space, offset = 0;
u64 last = start + len;
int pop;
@@ -2514,11 +2518,11 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
+ offset += l;
l = sk_msg_elem(msg, i)->length;
if (start < offset + l)
break;
- offset += l;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
@@ -2587,8 +2591,8 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
}
pop = 0;
} else if (pop >= sge->length - a) {
- sge->length = a;
pop -= (sge->length - a);
+ sge->length = a;
}
}
@@ -2912,7 +2916,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
{
- __be16 from_proto = skb->protocol;
+ __be16 from_proto = skb_protocol(skb, true);
if (from_proto == htons(ETH_P_IP) &&
to_proto == htons(ETH_P_IPV6))
@@ -2985,7 +2989,7 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = {
static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
{
- switch (skb->protocol) {
+ switch (skb_protocol(skb, true)) {
case htons(ETH_P_IP):
return sizeof(struct iphdr);
case htons(ETH_P_IPV6):
@@ -3153,7 +3157,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32 len_cur, len_diff_abs = abs(len_diff);
u32 len_min = bpf_skb_net_base_len(skb);
u32 len_max = __bpf_skb_max_len(skb);
- __be16 proto = skb->protocol;
+ __be16 proto = skb_protocol(skb, true);
bool shrink = len_diff < 0;
u32 off;
int ret;
@@ -3541,7 +3545,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
return err;
}
default:
- break;
+ return -EBADRQC;
}
return 0;
}
@@ -4244,12 +4248,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
case SO_RCVBUF:
val = min_t(u32, val, sysctl_rmem_max);
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ WRITE_ONCE(sk->sk_rcvbuf,
+ max_t(int, val * 2, SOCK_MIN_RCVBUF));
break;
case SO_SNDBUF:
val = min_t(u32, val, sysctl_wmem_max);
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ WRITE_ONCE(sk->sk_sndbuf,
+ max_t(int, val * 2, SOCK_MIN_SNDBUF));
break;
case SO_MAX_PACING_RATE: /* 32bit version */
if (val != ~0U)
@@ -4916,7 +4922,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
switch (type) {
case BPF_LWT_ENCAP_SEG6_INLINE:
- if (skb->protocol != htons(ETH_P_IPV6))
+ if (skb_protocol(skb, true) != htons(ETH_P_IPV6))
return -EBADMSG;
err = seg6_do_srh_inline(skb, srh);
@@ -5342,8 +5348,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
if (sk) {
sk = sk_to_full_sk(sk);
if (!sk_fullsock(sk)) {
- if (!sock_flag(sk, SOCK_RCU_FREE))
- sock_gen_put(sk);
+ sock_gen_put(sk);
return NULL;
}
}
@@ -5380,8 +5385,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
if (sk) {
sk = sk_to_full_sk(sk);
if (!sk_fullsock(sk)) {
- if (!sock_flag(sk, SOCK_RCU_FREE))
- sock_gen_put(sk);
+ sock_gen_put(sk);
return NULL;
}
}
@@ -5448,7 +5452,8 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- if (!sock_flag(sk, SOCK_RCU_FREE))
+ /* Only full sockets have sk->sk_flags. */
+ if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
sock_gen_put(sk);
return 0;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index b15c0c0f6e55..11c23403b3f3 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -125,12 +125,10 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
return 0;
}
-int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+static int flow_dissector_bpf_prog_detach(struct net *net)
{
struct bpf_prog *attached;
- struct net *net;
- net = current->nsproxy->net_ns;
mutex_lock(&flow_dissector_mutex);
attached = rcu_dereference_protected(net->flow_dissector_prog,
lockdep_is_held(&flow_dissector_mutex));
@@ -165,6 +163,24 @@ static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
return 0;
}
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+ return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns);
+}
+
+static void __net_exit flow_dissector_pernet_pre_exit(struct net *net)
+{
+ /* We're not racing with attach/detach because there are no
+ * references to netns left when pre_exit gets called.
+ */
+ if (rcu_access_pointer(net->flow_dissector_prog))
+ flow_dissector_bpf_prog_detach(net);
+}
+
+static struct pernet_operations flow_dissector_pernet_ops __net_initdata = {
+ .pre_exit = flow_dissector_pernet_pre_exit,
+};
+
/**
* __skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
@@ -793,9 +809,10 @@ bool __skb_flow_dissect(const struct net *net,
nhoff = skb_network_offset(skb);
hlen = skb_headlen(skb);
#if IS_ENABLED(CONFIG_NET_DSA)
- if (unlikely(skb->dev && netdev_uses_dsa(skb->dev))) {
+ if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
+ proto == htons(ETH_P_XDSA))) {
const struct dsa_device_ops *ops;
- int offset;
+ int offset = 0;
ops = skb->dev->dsa_ptr->tag_ops;
if (ops->flow_dissect &&
@@ -1281,30 +1298,21 @@ out_bad:
}
EXPORT_SYMBOL(__skb_flow_dissect);
-static u32 hashrnd __read_mostly;
+static siphash_key_t hashrnd __read_mostly;
static __always_inline void __flow_hash_secret_init(void)
{
net_get_random_once(&hashrnd, sizeof(hashrnd));
}
-static __always_inline u32 __flow_hash_words(const u32 *words, u32 length,
- u32 keyval)
-{
- return jhash2(words, length, keyval);
-}
-
-static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow)
+static const void *flow_keys_hash_start(const struct flow_keys *flow)
{
- const void *p = flow;
-
- BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
- return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET);
+ BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT);
+ return &flow->FLOW_KEYS_HASH_START_FIELD;
}
static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
{
size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
- BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
sizeof(*flow) - sizeof(flow->addrs));
@@ -1319,7 +1327,7 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
diff -= sizeof(flow->addrs.tipckey);
break;
}
- return (sizeof(*flow) - diff) / sizeof(u32);
+ return sizeof(*flow) - diff;
}
__be32 flow_get_u32_src(const struct flow_keys *flow)
@@ -1385,14 +1393,15 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
}
}
-static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys,
+ const siphash_key_t *keyval)
{
u32 hash;
__flow_hash_consistentify(keys);
- hash = __flow_hash_words(flow_keys_hash_start(keys),
- flow_keys_hash_length(keys), keyval);
+ hash = siphash(flow_keys_hash_start(keys),
+ flow_keys_hash_length(keys), keyval);
if (!hash)
hash = 1;
@@ -1402,12 +1411,13 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
u32 flow_hash_from_keys(struct flow_keys *keys)
{
__flow_hash_secret_init();
- return __flow_hash_from_keys(keys, hashrnd);
+ return __flow_hash_from_keys(keys, &hashrnd);
}
EXPORT_SYMBOL(flow_hash_from_keys);
static inline u32 ___skb_get_hash(const struct sk_buff *skb,
- struct flow_keys *keys, u32 keyval)
+ struct flow_keys *keys,
+ const siphash_key_t *keyval)
{
skb_flow_dissect_flow_keys(skb, keys,
FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
@@ -1455,7 +1465,7 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
&keys, NULL, 0, 0, 0,
FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
- return __flow_hash_from_keys(&keys, hashrnd);
+ return __flow_hash_from_keys(&keys, &hashrnd);
}
EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
@@ -1475,13 +1485,14 @@ void __skb_get_hash(struct sk_buff *skb)
__flow_hash_secret_init();
- hash = ___skb_get_hash(skb, &keys, hashrnd);
+ hash = ___skb_get_hash(skb, &keys, &hashrnd);
__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
EXPORT_SYMBOL(__skb_get_hash);
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
+__u32 skb_get_hash_perturb(const struct sk_buff *skb,
+ const siphash_key_t *perturb)
{
struct flow_keys keys;
@@ -1669,7 +1680,7 @@ static int __init init_default_flow_dissectors(void)
skb_flow_dissector_init(&flow_keys_basic_dissector,
flow_keys_basic_dissector_keys,
ARRAY_SIZE(flow_keys_basic_dissector_keys));
- return 0;
-}
+ return register_pernet_subsys(&flow_dissector_pernet_ops);
+}
core_initcall(init_default_flow_dissectors);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index bfe7bdd4c340..aa6d9e3d2705 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -42,7 +42,7 @@
struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
spinlock_t *stats_lock;
- seqcount_t *running;
+ net_seqlock_t *running;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
u8 ewma_log;
u8 intvl_log; /* period : (250ms << intvl_log) */
@@ -125,7 +125,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running,
+ net_seqlock_t *running,
struct nlattr *opt)
{
struct gnet_estimator *parm = nla_data(opt);
@@ -223,7 +223,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running, struct nlattr *opt)
+ net_seqlock_t *running, struct nlattr *opt)
{
return gen_new_estimator(bstats, cpu_bstats, rate_est,
lock, running, opt);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 36888f5e09eb..4894ee0a1b04 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -138,7 +138,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
}
void
-__gnet_stats_copy_basic(const seqcount_t *running,
+__gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
@@ -151,15 +151,15 @@ __gnet_stats_copy_basic(const seqcount_t *running,
}
do {
if (running)
- seq = read_seqcount_begin(running);
+ seq = net_seq_begin(running);
bstats->bytes = b->bytes;
bstats->packets = b->packets;
- } while (running && read_seqcount_retry(running, seq));
+ } while (running && net_seq_retry(running, seq));
}
EXPORT_SYMBOL(__gnet_stats_copy_basic);
static int
-___gnet_stats_copy_basic(const seqcount_t *running,
+___gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b,
@@ -200,7 +200,7 @@ ___gnet_stats_copy_basic(const seqcount_t *running,
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(const seqcount_t *running,
+gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
@@ -224,7 +224,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic_hw(const seqcount_t *running,
+gnet_stats_copy_basic_hw(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index f93785e5833c..99a6de52b21d 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -88,11 +88,16 @@ static int bpf_lwt_input_reroute(struct sk_buff *skb)
int err = -EINVAL;
if (skb->protocol == htons(ETH_P_IP)) {
+ struct net_device *dev = skb_dst(skb)->dev;
struct iphdr *iph = ip_hdr(skb);
+ dev_hold(dev);
+ skb_dst_drop(skb);
err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb_dst(skb)->dev);
+ iph->tos, dev);
+ dev_put(dev);
} else if (skb->protocol == htons(ETH_P_IPV6)) {
+ skb_dst_drop(skb);
err = ipv6_stub->ipv6_route_input(skb);
} else {
err = -EAFNOSUPPORT;
@@ -225,9 +230,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
fl6.daddr = iph6->daddr;
fl6.saddr = iph6->saddr;
- err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
- if (unlikely(err))
- goto err;
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto err;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 5bb0a1aee50e..6cdb1159f17b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -98,9 +98,6 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
static void neigh_cleanup_and_release(struct neighbour *neigh)
{
- if (neigh->parms->neigh_cleanup)
- neigh->parms->neigh_cleanup(neigh);
-
trace_neigh_cleanup_and_release(neigh, 0);
__neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
@@ -1195,7 +1192,7 @@ static void neigh_update_hhs(struct neighbour *neigh)
if (update) {
hh = &neigh->hh;
- if (hh->hh_len) {
+ if (READ_ONCE(hh->hh_len)) {
write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
write_sequnlock_bh(&hh->hh_lock);
@@ -1474,7 +1471,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
struct net_device *dev = neigh->dev;
unsigned int seq;
- if (dev->header_ops->cache && !neigh->hh.hh_len)
+ if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
neigh_hh_init(neigh);
do {
@@ -1955,6 +1952,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
}
+ if (protocol)
+ neigh->protocol = protocol;
+
if (ndm->ndm_flags & NTF_EXT_LEARNED)
flags |= NEIGH_UPDATE_F_EXT_LEARNED;
@@ -1968,9 +1968,6 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
NETLINK_CB(skb).portid, extack);
- if (protocol)
- neigh->protocol = protocol;
-
neigh_release(neigh);
out:
@@ -2050,8 +2047,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
goto nla_put_failure;
{
unsigned long now = jiffies;
- unsigned int flush_delta = now - tbl->last_flush;
- unsigned int rand_delta = now - tbl->last_rand;
+ long flush_delta = now - tbl->last_flush;
+ long rand_delta = now - tbl->last_rand;
struct neigh_hash_table *nht;
struct ndt_config ndc = {
.ndtc_key_len = tbl->key_len,
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 865ba6ca16eb..229ecf256f4e 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1031,7 +1031,7 @@ static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
trans_timeout = queue->trans_timeout;
spin_unlock_irq(&queue->_xmit_lock);
- return sprintf(buf, "%lu", trans_timeout);
+ return sprintf(buf, fmt_ulong, trans_timeout);
}
static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 15f68842ac6b..e31ab1f9c91d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -145,6 +145,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
}
}
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ if (ops->pre_exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->pre_exit(net);
+ }
+}
+
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
@@ -227,11 +238,11 @@ static int __peernet2id(struct net *net, struct net *peer)
return __peernet2id_alloc(net, peer, &no);
}
-static void rtnl_net_notifyid(struct net *net, int cmd, int id);
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, gfp_t gfp);
/* This function returns the id of a peer netns. If no id is assigned, one will
* be allocated and returned.
*/
-int peernet2id_alloc(struct net *net, struct net *peer)
+int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
bool alloc = false, alive = false;
int id;
@@ -250,7 +261,7 @@ int peernet2id_alloc(struct net *net, struct net *peer)
id = __peernet2id_alloc(net, peer, &alloc);
spin_unlock_bh(&net->nsid_lock);
if (alloc && id >= 0)
- rtnl_net_notifyid(net, RTM_NEWNSID, id);
+ rtnl_net_notifyid(net, RTM_NEWNSID, id, gfp);
if (alive)
put_net(peer);
return id;
@@ -330,6 +341,12 @@ out_undo:
list_add(&net->exit_list, &net_exit_list);
saved_ops = ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
+ synchronize_rcu();
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
ops = saved_ops;
@@ -496,7 +513,8 @@ static void unhash_nsid(struct net *net, struct net *last)
idr_remove(&tmp->netns_ids, id);
spin_unlock_bh(&tmp->nsid_lock);
if (id >= 0)
- rtnl_net_notifyid(tmp, RTM_DELNSID, id);
+ rtnl_net_notifyid(tmp, RTM_DELNSID, id,
+ GFP_KERNEL);
if (tmp == last)
break;
}
@@ -541,10 +559,15 @@ static void cleanup_net(struct work_struct *work)
list_add_tail(&net->exit_list, &net_exit_list);
}
+ /* Run all of the network namespace pre_exit methods */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
/*
* Another CPU might be rcu-iterating the list, wait for it.
* This needs to be before calling the exit() notifiers, so
* the rcu_barrier() below isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
*/
synchronize_rcu();
@@ -722,7 +745,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
err = alloc_netid(net, peer, nsid);
spin_unlock_bh(&net->nsid_lock);
if (err >= 0) {
- rtnl_net_notifyid(net, RTM_NEWNSID, err);
+ rtnl_net_notifyid(net, RTM_NEWNSID, err, GFP_KERNEL);
err = 0;
} else if (err == -ENOSPC && nsid >= 0) {
err = -EEXIST;
@@ -1009,7 +1032,7 @@ end:
return err < 0 ? err : skb->len;
}
-static void rtnl_net_notifyid(struct net *net, int cmd, int id)
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, gfp_t gfp)
{
struct net_fill_args fillargs = {
.cmd = cmd,
@@ -1018,7 +1041,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id)
struct sk_buff *msg;
int err = -ENOMEM;
- msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+ msg = nlmsg_new(rtnl_net_get_size(), gfp);
if (!msg)
goto out;
@@ -1026,7 +1049,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id)
if (err < 0)
goto err_out;
- rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0);
+ rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, gfp);
return;
err_out:
@@ -1101,6 +1124,8 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
return error;
@@ -1115,6 +1140,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
@@ -1139,6 +1166,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
} else {
LIST_HEAD(net_exit_list);
list_add(&init_net.exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 0642f91c4038..41b24cd31562 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -53,30 +53,60 @@ static void cgrp_css_free(struct cgroup_subsys_state *css)
kfree(css_cls_state(css));
}
+/*
+ * To avoid freezing of sockets creation for tasks with big number of threads
+ * and opened sockets lets release file_lock every 1000 iterated descriptors.
+ * New sockets will already have been created with new classid.
+ */
+
+struct update_classid_context {
+ u32 classid;
+ unsigned int batch;
+};
+
+#define UPDATE_CLASSID_BATCH 1000
+
static int update_classid_sock(const void *v, struct file *file, unsigned n)
{
int err;
+ struct update_classid_context *ctx = (void *)v;
struct socket *sock = sock_from_file(file, &err);
if (sock) {
spin_lock(&cgroup_sk_update_lock);
- sock_cgroup_set_classid(&sock->sk->sk_cgrp_data,
- (unsigned long)v);
+ sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
spin_unlock(&cgroup_sk_update_lock);
}
+ if (--ctx->batch == 0) {
+ ctx->batch = UPDATE_CLASSID_BATCH;
+ return n + 1;
+ }
return 0;
}
+static void update_classid_task(struct task_struct *p, u32 classid)
+{
+ struct update_classid_context ctx = {
+ .classid = classid,
+ .batch = UPDATE_CLASSID_BATCH
+ };
+ unsigned int fd = 0;
+
+ do {
+ task_lock(p);
+ fd = iterate_fd(p->files, fd, update_classid_sock, &ctx);
+ task_unlock(p);
+ cond_resched();
+ } while (fd);
+}
+
static void cgrp_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
struct task_struct *p;
cgroup_taskset_for_each(p, css, tset) {
- task_lock(p);
- iterate_fd(p->files, 0, update_classid_sock,
- (void *)(unsigned long)css_cls_state(css)->classid);
- task_unlock(p);
+ update_classid_task(p, css_cls_state(css)->classid);
}
}
@@ -97,13 +127,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);
- while ((p = css_task_iter_next(&it))) {
- task_lock(p);
- iterate_fd(p->files, 0, update_classid_sock,
- (void *)(unsigned long)cs->classid);
- task_unlock(p);
- cond_resched();
- }
+ while ((p = css_task_iter_next(&it)))
+ update_classid_task(p, cs->classid);
css_task_iter_end(&it);
return 0;
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 256b7954b720..8618242c677a 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -236,6 +236,8 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct task_struct *p;
struct cgroup_subsys_state *css;
+ cgroup_sk_alloc_disable();
+
cgroup_taskset_for_each(p, css, tset) {
void *v = (void *)(unsigned long)css->cgroup->id;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f975c5e2a369..f243b3988889 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2154,7 +2154,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
s64 remaining;
struct hrtimer_sleeper t;
- hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS,
+ current);
hrtimer_set_expires(&t.timer, spin_until);
remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
@@ -2169,7 +2170,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
} while (ktime_compare(end_time, spin_until) < 0);
} else {
/* see do_nanosleep */
- hrtimer_init_sleeper(&t, current);
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
@@ -3362,7 +3362,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* skb was 'freed' by stack, so clean few
* bits and reuse it
*/
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
} while (--burst > 0);
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index c9bb00008528..f35c2e998406 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -96,7 +96,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
- tcp_sk(sk)->fastopen_rsk = NULL;
+ RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
spin_lock_bh(&fastopenq->lock);
fastopenq->qlen--;
tcp_rsk(req)->tfo_listener = false;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index cec60583931f..9a504779c4f5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1515,7 +1515,7 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
static int rtnl_fill_link_netnsid(struct sk_buff *skb,
const struct net_device *dev,
- struct net *src_net)
+ struct net *src_net, gfp_t gfp)
{
bool put_iflink = false;
@@ -1523,7 +1523,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb,
struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
if (!net_eq(dev_net(dev), link_net)) {
- int id = peernet2id_alloc(src_net, link_net);
+ int id = peernet2id_alloc(src_net, link_net, gfp);
if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
return -EMSGSIZE;
@@ -1581,7 +1581,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, u32 ext_filter_mask,
u32 event, int *new_nsid, int new_ifindex,
- int tgt_netnsid)
+ int tgt_netnsid, gfp_t gfp)
{
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
@@ -1673,7 +1673,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
}
- if (rtnl_fill_link_netnsid(skb, dev, src_net))
+ if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp))
goto nla_put_failure;
if (new_nsid &&
@@ -1992,7 +1992,7 @@ walk_entries:
NETLINK_CB(cb->skb).portid,
nlh->nlmsg_seq, 0, flags,
ext_filter_mask, 0, NULL, 0,
- netnsid);
+ netnsid, GFP_KERNEL);
if (err < 0) {
if (likely(skb->len))
@@ -2186,6 +2186,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_MAC]) {
struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);
+ if (ivm->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_mac)
err = ops->ndo_set_vf_mac(dev, ivm->vf,
@@ -2197,6 +2199,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_VLAN]) {
struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);
+ if (ivv->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_vlan)
err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
@@ -2229,6 +2233,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (len == 0)
return -EINVAL;
+ if (ivvl[0]->vf >= INT_MAX)
+ return -EINVAL;
err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
ivvl[0]->qos, ivvl[0]->vlan_proto);
if (err < 0)
@@ -2239,6 +2245,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
struct ifla_vf_info ivf;
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_get_vf_config)
err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
@@ -2257,6 +2265,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_RATE]) {
struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_rate)
err = ops->ndo_set_vf_rate(dev, ivt->vf,
@@ -2269,6 +2279,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_SPOOFCHK]) {
struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);
+ if (ivs->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_spoofchk)
err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
@@ -2280,6 +2292,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_LINK_STATE]) {
struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);
+ if (ivl->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_link_state)
err = ops->ndo_set_vf_link_state(dev, ivl->vf,
@@ -2293,6 +2307,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
err = -EOPNOTSUPP;
ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
+ if (ivrssq_en->vf >= INT_MAX)
+ return -EINVAL;
if (ops->ndo_set_vf_rss_query_en)
err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
ivrssq_en->setting);
@@ -2303,6 +2319,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_TRUST]) {
struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_trust)
err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting);
@@ -2313,15 +2331,18 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_IB_NODE_GUID]) {
struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
if (!ops->ndo_set_vf_guid)
return -EOPNOTSUPP;
-
return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
}
if (tb[IFLA_VF_IB_PORT_GUID]) {
struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
if (!ops->ndo_set_vf_guid)
return -EOPNOTSUPP;
@@ -2928,8 +2949,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
dev->rtnl_link_ops = ops;
dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
- if (tb[IFLA_MTU])
- dev->mtu = nla_get_u32(tb[IFLA_MTU]);
+ if (tb[IFLA_MTU]) {
+ u32 mtu = nla_get_u32(tb[IFLA_MTU]);
+ int err;
+
+ err = dev_validate_mtu(dev, mtu, extack);
+ if (err) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+ dev->mtu = mtu;
+ }
if (tb[IFLA_ADDRESS]) {
memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
nla_len(tb[IFLA_ADDRESS]));
@@ -3191,7 +3221,8 @@ replay:
*/
if (err < 0) {
/* If device is not registered at all, free it now */
- if (dev->reg_state == NETREG_UNINITIALIZED)
+ if (dev->reg_state == NETREG_UNINITIALIZED ||
+ dev->reg_state == NETREG_UNREGISTERED)
free_netdev(dev);
goto out;
}
@@ -3350,7 +3381,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
err = rtnl_fill_ifinfo(nskb, dev, net,
RTM_NEWLINK, NETLINK_CB(skb).portid,
nlh->nlmsg_seq, 0, 0, ext_filter_mask,
- 0, NULL, 0, netnsid);
+ 0, NULL, 0, netnsid, GFP_KERNEL);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size */
WARN_ON(err == -EMSGSIZE);
@@ -3462,7 +3493,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
type, 0, 0, change, 0, 0, event,
- new_nsid, new_ifindex, -1);
+ new_nsid, new_ifindex, -1, flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -3907,7 +3938,7 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_flags || ndm->ndm_type) {
- NL_SET_ERR_MSG(extack, "Invalid values in header for fbd dump request");
+ NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
return -EINVAL;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 74efd63f15e2..ee080e44221b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -59,6 +59,7 @@
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
+#include <linux/locallock.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -364,6 +365,8 @@ struct napi_alloc_cache {
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
+static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
@@ -371,10 +374,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
unsigned long flags;
void *data;
- local_irq_save(flags);
+ local_lock_irqsave(netdev_alloc_lock, flags);
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, fragsz, gfp_mask);
- local_irq_restore(flags);
+ local_unlock_irqrestore(netdev_alloc_lock, flags);
return data;
}
@@ -395,9 +398,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc;
+ void *data;
- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+ data = page_frag_alloc(&nc->page, fragsz, gfp_mask);
+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+ return data;
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -446,13 +453,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- local_irq_save(flags);
+ local_lock_irqsave(netdev_alloc_lock, flags);
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
- local_irq_restore(flags);
+ local_unlock_irqrestore(netdev_alloc_lock, flags);
if (unlikely(!data))
return NULL;
@@ -493,9 +500,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
gfp_t gfp_mask)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc;
struct sk_buff *skb;
void *data;
+ bool pfmemalloc;
len += NET_SKB_PAD + NET_IP_ALIGN;
@@ -513,7 +521,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = nc->page.pfmemalloc;
+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
if (unlikely(!data))
return NULL;
@@ -524,7 +535,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
}
/* use OR instead of assignment to avoid clearing of bits in mask */
- if (nc->page.pfmemalloc)
+ if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -753,23 +764,26 @@ void __consume_stateless_skb(struct sk_buff *skb)
void __kfree_skb_flush(void)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc;
+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
/* flush skb_cache if containing objects */
if (nc->skb_count) {
kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
nc->skb_cache);
nc->skb_count = 0;
}
+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
}
static inline void _kfree_skb_defer(struct sk_buff *skb)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc;
/* drop skb->head and call any destructors for packet */
skb_release_all(skb);
+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
/* record skb to CPU local list */
nc->skb_cache[nc->skb_count++] = skb;
@@ -784,6 +798,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
nc->skb_cache);
nc->skb_count = 0;
}
+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
}
void __kfree_skb_defer(struct sk_buff *skb)
{
@@ -4270,7 +4285,7 @@ static void skb_set_err_queue(struct sk_buff *skb)
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned int)sk->sk_rcvbuf)
+ (unsigned int)READ_ONCE(sk->sk_rcvbuf))
return -ENOMEM;
skb_orphan(skb);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 93bffaad2135..e16f985631ae 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -7,6 +7,7 @@
#include <net/sock.h>
#include <net/tcp.h>
+#include <net/tls.h>
static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
{
@@ -271,18 +272,28 @@ void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
msg->sg.data[i].length -= trim;
sk_mem_uncharge(sk, trim);
+ /* Adjust copybreak if it falls into the trimmed part of last buf */
+ if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length)
+ msg->sg.copybreak = msg->sg.data[i].length;
out:
- /* If we trim data before curr pointer update copybreak and current
- * so that any future copy operations start at new copy location.
+ sk_msg_iter_var_next(i);
+ msg->sg.end = i;
+
+ /* If we trim data a full sg elem before curr pointer update
+ * copybreak and current so that any future copy operations
+ * start at new copy location.
* However trimed data that has not yet been used in a copy op
* does not require an update.
*/
- if (msg->sg.curr >= i) {
+ if (!msg->sg.size) {
+ msg->sg.curr = msg->sg.start;
+ msg->sg.copybreak = 0;
+ } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >=
+ sk_msg_iter_dist(msg->sg.start, msg->sg.end)) {
+ sk_msg_iter_var_prev(i);
msg->sg.curr = i;
msg->sg.copybreak = msg->sg.data[i].length;
}
- sk_msg_iter_var_next(i);
- msg->sg.end = i;
}
EXPORT_SYMBOL_GPL(sk_msg_trim);
@@ -412,7 +423,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
copied = skb->len;
msg->sg.start = 0;
msg->sg.size = copied;
- msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
+ msg->sg.end = num_sge;
msg->skb = skb;
sk_psock_queue_msg(psock, msg);
@@ -677,13 +688,74 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
return container_of(parser, struct sk_psock, parser);
}
-static void sk_psock_verdict_apply(struct sk_psock *psock,
- struct sk_buff *skb, int verdict)
+static void sk_psock_skb_redirect(struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
bool ingress;
+ sk_other = tcp_skb_bpf_redirect_fetch(skb);
+ if (unlikely(!sk_other)) {
+ kfree_skb(skb);
+ return;
+ }
+ psock_other = sk_psock(sk_other);
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
+ !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ ingress = tcp_skb_bpf_ingress(skb);
+ if ((!ingress && sock_writeable(sk_other)) ||
+ (ingress &&
+ atomic_read(&sk_other->sk_rmem_alloc) <=
+ sk_other->sk_rcvbuf)) {
+ if (!ingress)
+ skb_set_owner_w(skb, sk_other);
+ skb_queue_tail(&psock_other->ingress_skb, skb);
+ schedule_work(&psock_other->work);
+ } else {
+ kfree_skb(skb);
+ }
+}
+
+static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict)
+{
+ switch (verdict) {
+ case __SK_REDIRECT:
+ sk_psock_skb_redirect(skb);
+ break;
+ case __SK_PASS:
+ case __SK_DROP:
+ default:
+ break;
+ }
+}
+
+int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct bpf_prog *prog;
+ int ret = __SK_PASS;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.skb_verdict);
+ if (likely(prog)) {
+ tcp_skb_bpf_redirect_clear(skb);
+ ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ }
+ sk_psock_tls_verdict_apply(skb, ret);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
+
+static void sk_psock_verdict_apply(struct sk_psock *psock,
+ struct sk_buff *skb, int verdict)
+{
+ struct sock *sk_other;
+
switch (verdict) {
case __SK_PASS:
sk_other = psock->sk;
@@ -702,25 +774,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
}
goto out_free;
case __SK_REDIRECT:
- sk_other = tcp_skb_bpf_redirect_fetch(skb);
- if (unlikely(!sk_other))
- goto out_free;
- psock_other = sk_psock(sk_other);
- if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
- !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
- goto out_free;
- ingress = tcp_skb_bpf_ingress(skb);
- if ((!ingress && sock_writeable(sk_other)) ||
- (ingress &&
- atomic_read(&sk_other->sk_rmem_alloc) <=
- sk_other->sk_rcvbuf)) {
- if (!ingress)
- skb_set_owner_w(skb, sk_other);
- skb_queue_tail(&psock_other->ingress_skb, skb);
- schedule_work(&psock_other->work);
- break;
- }
- /* fall-through */
+ sk_psock_skb_redirect(skb);
+ break;
case __SK_DROP:
/* fall-through */
default:
@@ -731,11 +786,18 @@ out_free:
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
- struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct sk_psock *psock;
struct bpf_prog *prog;
int ret = __SK_DROP;
+ struct sock *sk;
rcu_read_lock();
+ sk = strp->sk;
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ kfree_skb(skb);
+ goto out;
+ }
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
skb_orphan(skb);
@@ -743,8 +805,9 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
}
- rcu_read_unlock();
sk_psock_verdict_apply(psock, skb, ret);
+out:
+ rcu_read_unlock();
}
static int sk_psock_strp_read_done(struct strparser *strp, int err)
@@ -774,9 +837,13 @@ static void sk_psock_strp_data_ready(struct sock *sk)
rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock)) {
- write_lock_bh(&sk->sk_callback_lock);
- strp_data_ready(&psock->parser.strp);
- write_unlock_bh(&sk->sk_callback_lock);
+ if (tls_sw_has_ctx_rx(sk)) {
+ psock->parser.saved_data_ready(sk);
+ } else {
+ write_lock_bh(&sk->sk_callback_lock);
+ strp_data_ready(&psock->parser.strp);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
}
rcu_read_unlock();
}
@@ -784,15 +851,18 @@ static void sk_psock_strp_data_ready(struct sock *sk)
static void sk_psock_write_space(struct sock *sk)
{
struct sk_psock *psock;
- void (*write_space)(struct sock *sk);
+ void (*write_space)(struct sock *sk) = NULL;
rcu_read_lock();
psock = sk_psock(sk);
- if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)))
- schedule_work(&psock->work);
- write_space = psock->saved_write_space;
+ if (likely(psock)) {
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_work(&psock->work);
+ write_space = psock->saved_write_space;
+ }
rcu_read_unlock();
- write_space(sk);
+ if (write_space)
+ write_space(sk);
}
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
diff --git a/net/core/sock.c b/net/core/sock.c
index 3d54153b8325..fb83335a5abb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -709,7 +709,7 @@ bool sk_mc_loop(struct sock *sk)
return inet6_sk(sk)->mc_loop;
#endif
}
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return true;
}
EXPORT_SYMBOL(sk_mc_loop);
@@ -785,7 +785,8 @@ set_sndbuf:
*/
val = min_t(int, val, INT_MAX / 2);
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ WRITE_ONCE(sk->sk_sndbuf,
+ max_t(int, val * 2, SOCK_MIN_SNDBUF));
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
@@ -831,7 +832,8 @@ set_rcvbuf:
* returning the value we actually used in getsockopt
* is the most desirable behavior.
*/
- sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ WRITE_ONCE(sk->sk_rcvbuf,
+ max_t(int, val * 2, SOCK_MIN_RCVBUF));
break;
case SO_RCVBUFFORCE:
@@ -1121,7 +1123,7 @@ set_rcvbuf:
break;
}
case SO_INCOMING_CPU:
- sk->sk_incoming_cpu = val;
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
break;
case SO_CNX_ADVICE:
@@ -1470,7 +1472,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_INCOMING_CPU:
- v.val = sk->sk_incoming_cpu;
+ v.val = READ_ONCE(sk->sk_incoming_cpu);
break;
case SO_MEMINFO:
@@ -1673,6 +1675,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
+ sk_tx_queue_clear(sk);
}
return sk;
@@ -1826,8 +1829,11 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
atomic_set(&newsk->sk_zckey, 0);
sock_reset_flag(newsk, SOCK_DONE);
- mem_cgroup_sk_alloc(newsk);
- cgroup_sk_alloc(&newsk->sk_cgrp_data);
+
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
+
+ cgroup_sk_clone(&newsk->sk_cgrp_data);
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
@@ -1883,6 +1889,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
*/
sk_refcnt_debug_inc(newsk);
sk_set_socket(newsk, NULL);
+ sk_tx_queue_clear(newsk);
RCU_INIT_POINTER(newsk->sk_wq, NULL);
if (newsk->sk_prot->sockets_allocated)
@@ -2081,8 +2088,10 @@ EXPORT_SYMBOL(sock_i_ino);
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
- if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ if (force ||
+ refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
struct sk_buff *skb = alloc_skb(size, priority);
+
if (skb) {
skb_set_owner_w(skb, sk);
return skb;
@@ -2183,7 +2192,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
break;
if (sk->sk_shutdown & SEND_SHUTDOWN)
break;
@@ -2218,7 +2227,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+ if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
break;
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -2720,6 +2729,27 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *
}
EXPORT_SYMBOL(sock_no_mmap);
+/*
+ * When a file is received (via SCM_RIGHTS, etc), we must bump the
+ * various sock-based usage counts.
+ */
+void __receive_sock(struct file *file)
+{
+ struct socket *sock;
+ int error;
+
+ /*
+ * The resulting value of "error" is ignored here since we only
+ * need to take action when the file is a socket and testing
+ * "sock" for NULL is sufficient.
+ */
+ sock = sock_from_file(file, &error);
+ if (sock) {
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
+ }
+}
+
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
ssize_t res;
@@ -2799,7 +2829,7 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
@@ -2907,7 +2937,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_max_pacing_rate = ~0UL;
sk->sk_pacing_rate = ~0UL;
- sk->sk_pacing_shift = 10;
+ WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
sk_rx_queue_clear(sk);
@@ -3197,13 +3227,13 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
- mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
- mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
- mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
- mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
@@ -3578,7 +3608,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
{
struct sock *sk = p;
- return !skb_queue_empty(&sk->sk_receive_queue) ||
+ return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 3b14de0e36d2..285976f79cba 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -238,20 +238,26 @@ static void sock_map_free(struct bpf_map *map)
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
int i;
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
synchronize_rcu();
- rcu_read_lock();
- raw_spin_lock_bh(&stab->lock);
for (i = 0; i < stab->map.max_entries; i++) {
struct sock **psk = &stab->sks[i];
struct sock *sk;
sk = xchg(psk, NULL);
- if (sk)
+ if (sk) {
+ lock_sock(sk);
+ rcu_read_lock();
sock_map_unref(sk, psk);
+ release_sock(sk);
+ rcu_read_unlock();
+ }
}
- raw_spin_unlock_bh(&stab->lock);
- rcu_read_unlock();
+ /* wait for psock readers accessing its map link */
synchronize_rcu();
bpf_map_area_free(stab->sks);
@@ -418,14 +424,16 @@ static int sock_map_update_elem(struct bpf_map *map, void *key,
ret = -EINVAL;
goto out;
}
- if (!sock_map_sk_is_suitable(sk) ||
- sk->sk_state != TCP_ESTABLISHED) {
+ if (!sock_map_sk_is_suitable(sk)) {
ret = -EOPNOTSUPP;
goto out;
}
sock_map_sk_acquire(sk);
- ret = sock_map_update_common(map, idx, sk, flags);
+ if (sk->sk_state != TCP_ESTABLISHED)
+ ret = -EOPNOTSUPP;
+ else
+ ret = sock_map_update_common(map, idx, sk, flags);
sock_map_sk_release(sk);
out:
fput(sock->file);
@@ -741,14 +749,16 @@ static int sock_hash_update_elem(struct bpf_map *map, void *key,
ret = -EINVAL;
goto out;
}
- if (!sock_map_sk_is_suitable(sk) ||
- sk->sk_state != TCP_ESTABLISHED) {
+ if (!sock_map_sk_is_suitable(sk)) {
ret = -EOPNOTSUPP;
goto out;
}
sock_map_sk_acquire(sk);
- ret = sock_hash_update_common(map, key, sk, flags);
+ if (sk->sk_state != TCP_ESTABLISHED)
+ ret = -EOPNOTSUPP;
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
sock_map_sk_release(sk);
out:
fput(sock->file);
@@ -856,22 +866,49 @@ static void sock_hash_free(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct bpf_htab_bucket *bucket;
+ struct hlist_head unlink_list;
struct bpf_htab_elem *elem;
struct hlist_node *node;
int i;
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
synchronize_rcu();
- rcu_read_lock();
for (i = 0; i < htab->buckets_num; i++) {
bucket = sock_hash_select_bucket(htab, i);
+
+ /* We are racing with sock_hash_delete_from_link to
+ * enter the spin-lock critical section. Every socket on
+ * the list is still linked to sockhash. Since link
+ * exists, psock exists and holds a ref to socket. That
+ * lets us to grab a socket ref too.
+ */
raw_spin_lock_bh(&bucket->lock);
- hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
- hlist_del_rcu(&elem->node);
+ hlist_for_each_entry(elem, &bucket->head, node)
+ sock_hold(elem->sk);
+ hlist_move_list(&bucket->head, &unlink_list);
+ raw_spin_unlock_bh(&bucket->lock);
+
+ /* Process removed entries out of atomic context to
+ * block for socket lock before deleting the psock's
+ * link to sockhash.
+ */
+ hlist_for_each_entry_safe(elem, node, &unlink_list, node) {
+ hlist_del(&elem->node);
+ rcu_read_lock();
+ lock_sock(elem->sk);
sock_map_unref(elem->sk, elem);
+ release_sock(elem->sk);
+ sock_put(elem->sk);
+ sock_hash_free_elem(htab, elem);
+ rcu_read_unlock();
}
- raw_spin_unlock_bh(&bucket->lock);
}
- rcu_read_unlock();
+
+ /* wait for psock readers accessing its map link */
+ synchronize_rcu();
bpf_map_area_free(htab->buckets);
kfree(htab);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 2f89777763ad..3f1fe0958ff6 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -112,6 +112,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
+ more_reuse->has_conns = reuse->has_conns;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index f9204719aeee..50eeb0ef320e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -290,6 +290,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
return ret;
}
+# ifdef CONFIG_HAVE_EBPF_JIT
static int
proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -300,6 +301,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
+# endif /* CONFIG_HAVE_EBPF_JIT */
static int
proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
diff --git a/net/core/utils.c b/net/core/utils.c
index 6b6e51db9f3b..1f31a39236d5 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -438,6 +438,23 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
}
EXPORT_SYMBOL(inet_proto_csum_replace4);
+/**
+ * inet_proto_csum_replace16 - update layer 4 header checksum field
+ * @sum: Layer 4 header checksum field
+ * @skb: sk_buff for the packet
+ * @from: old IPv6 address
+ * @to: new IPv6 address
+ * @pseudohdr: True if layer 4 header checksum includes pseudoheader
+ *
+ * Update layer 4 header as per the update in IPv6 src/dst address.
+ *
+ * There is no need to update skb->csum in this function, because update in two
+ * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other
+ * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to
+ * update skb->csum, because update in 3 fields a.) IPv4 src/dst address,
+ * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as
+ * L4 Header checksum for skb->csum calculation.
+ */
void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
const __be32 *from, const __be32 *to,
bool pseudohdr)
@@ -449,9 +466,6 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_PARTIAL) {
*sum = csum_fold(csum_partial(diff, sizeof(diff),
~csum_unfold(*sum)));
- if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_partial(diff, sizeof(diff),
- ~skb->csum);
} else if (pseudohdr)
*sum = ~csum_fold(csum_partial(diff, sizeof(diff),
csum_unfold(*sum)));