diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 2 | ||||
-rw-r--r-- | net/core/bpf_sk_storage.c | 5 | ||||
-rw-r--r-- | net/core/dev.c | 158 | ||||
-rw-r--r-- | net/core/filter.c | 220 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 43 | ||||
-rw-r--r-- | net/core/gro.c | 114 | ||||
-rw-r--r-- | net/core/gso.c | 273 | ||||
-rw-r--r-- | net/core/lwt_bpf.c | 7 | ||||
-rw-r--r-- | net/core/net-traces.c | 2 | ||||
-rw-r--r-- | net/core/net_namespace.c | 4 | ||||
-rw-r--r-- | net/core/netdev-genl-gen.c | 2 | ||||
-rw-r--r-- | net/core/netdev-genl-gen.h | 2 | ||||
-rw-r--r-- | net/core/netpoll.c | 5 | ||||
-rw-r--r-- | net/core/pktgen.c | 13 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 220 | ||||
-rw-r--r-- | net/core/skbuff.c | 367 | ||||
-rw-r--r-- | net/core/skmsg.c | 22 | ||||
-rw-r--r-- | net/core/sock.c | 282 | ||||
-rw-r--r-- | net/core/sock_map.c | 16 | ||||
-rw-r--r-- | net/core/xdp.c | 2 |
20 files changed, 1117 insertions, 642 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 8f367813bc68..731db2eaa610 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -13,7 +13,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ fib_notifier.o xdp.o flow_offload.o gro.o \ - netdev-genl.o netdev-genl-gen.o + netdev-genl.o netdev-genl-gen.o gso.o obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d4172534dfa8..cca7594be92e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -496,8 +496,11 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { - if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); nr_maps++; + } } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); diff --git a/net/core/dev.c b/net/core/dev.c index c29f3e1db3ca..69a3e544676c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -758,29 +758,43 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) } EXPORT_SYMBOL(dev_get_by_name_rcu); +/* Deprecated for new users, call netdev_get_by_name() instead */ +struct net_device *dev_get_by_name(struct net *net, const char *name) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_name); + /** - * dev_get_by_name - find a device by its name + * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find + * @tracker: tracking object for the acquired reference + * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has - * the usage count incremented and the caller must use dev_put() to + * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ - -struct net_device *dev_get_by_name(struct net *net, const char *name) +struct net_device *netdev_get_by_name(struct net *net, const char *name, + netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; - rcu_read_lock(); - dev = dev_get_by_name_rcu(net, name); - dev_hold(dev); - rcu_read_unlock(); + dev = dev_get_by_name(net, name); + if (dev) + netdev_tracker_alloc(dev, tracker, gfp); return dev; } -EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex @@ -831,29 +845,42 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) } EXPORT_SYMBOL(dev_get_by_index_rcu); +/* Deprecated for new users, call netdev_get_by_index() instead */ +struct net_device *dev_get_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_index); /** - * dev_get_by_index - find a device by its ifindex + * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device + * @tracker: tracking object for the acquired reference + * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. + * netdev_put() to indicate they have finished with it. */ - -struct net_device *dev_get_by_index(struct net *net, int ifindex) +struct net_device *netdev_get_by_index(struct net *net, int ifindex, + netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, ifindex); - dev_hold(dev); - rcu_read_unlock(); + dev = dev_get_by_index(net, ifindex); + if (dev) + netdev_tracker_alloc(dev, tracker, gfp); return dev; } -EXPORT_SYMBOL(dev_get_by_index); +EXPORT_SYMBOL(netdev_get_by_index); /** * dev_get_by_napi_id - find a device by napi_id @@ -3209,7 +3236,7 @@ static u16 skb_tx_hash(const struct net_device *dev, return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } -static void skb_warn_bad_offload(const struct sk_buff *skb) +void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; @@ -3338,74 +3365,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) return vlan_get_protocol_and_depth(skb, type, depth); } -/* openvswitch calls this on rx path, so we need a different check. - */ -static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) -{ - if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_UNNECESSARY; - - return skb->ip_summed == CHECKSUM_NONE; -} - -/** - * __skb_gso_segment - Perform segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * @tx_path: whether it is called in TX path - * - * This function segments the given skb and returns a list of segments. - * - * It may return NULL if the skb requires no segmentation. This is - * only possible when GSO is used for verifying header integrity. - * - * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. - */ -struct sk_buff *__skb_gso_segment(struct sk_buff *skb, - netdev_features_t features, bool tx_path) -{ - struct sk_buff *segs; - - if (unlikely(skb_needs_check(skb, tx_path))) { - int err; - - /* We're going to init ->check field in TCP or UDP header */ - err = skb_cow_head(skb, 0); - if (err < 0) - return ERR_PTR(err); - } - - /* Only report GSO partial support if it will enable us to - * support segmentation on this frame without needing additional - * work. - */ - if (features & NETIF_F_GSO_PARTIAL) { - netdev_features_t partial_features = NETIF_F_GSO_ROBUST; - struct net_device *dev = skb->dev; - - partial_features |= dev->features & dev->gso_partial_features; - if (!skb_gso_ok(skb, features | partial_features)) - features &= ~NETIF_F_GSO_PARTIAL; - } - - BUILD_BUG_ON(SKB_GSO_CB_OFFSET + - sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); - - SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); - SKB_GSO_CB(skb)->encap_level = 0; - - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - segs = skb_mac_gso_segment(skb, features); - - if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) - skb_warn_bad_offload(skb); - - return segs; -} -EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG @@ -6199,7 +6158,8 @@ restart: if (!napi) goto out; - preempt_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); for (;;) { int work = 0; @@ -6241,7 +6201,8 @@ count: if (unlikely(need_resched())) { if (napi_poll) busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); rcu_read_unlock(); cond_resched(); if (loop_end(loop_end_arg, start_time)) @@ -6252,7 +6213,8 @@ count: } if (napi_poll) busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); out: rcu_read_unlock(); } @@ -8822,9 +8784,11 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); if (err) return err; - err = ops->ndo_set_mac_address(dev, sa); - if (err) - return err; + if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { + err = ops->ndo_set_mac_address(dev, sa); + if (err) + return err; + } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -10570,8 +10534,10 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) { WARN_ON(dev->reg_state == NETREG_REGISTERED); - dev->gro_flush_timeout = 20000; - dev->napi_defer_hard_irqs = 1; + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + dev->gro_flush_timeout = 20000; + dev->napi_defer_hard_irqs = 1; + } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); @@ -10632,7 +10598,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; - ref_tracker_dir_init(&dev->refcnt_tracker, 128); + ref_tracker_dir_init(&dev->refcnt_tracker, 128, name); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) diff --git a/net/core/filter.c b/net/core/filter.c index d9ce04ca22ce..f1a577540065 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3948,20 +3948,21 @@ void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { - struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); u32 size = xdp->data_end - xdp->data; + struct skb_shared_info *sinfo; void *addr = xdp->data; int i; if (unlikely(offset > 0xffff || len > 0xffff)) return ERR_PTR(-EFAULT); - if (offset + len > xdp_get_buff_len(xdp)) + if (unlikely(offset + len > xdp_get_buff_len(xdp))) return ERR_PTR(-EINVAL); - if (offset < size) /* linear area */ + if (likely(offset < size)) /* linear area */ goto out; + sinfo = xdp_get_shared_info_from_buff(xdp); offset -= size; for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ u32 frag_size = skb_frag_size(&sinfo->frags[i]); @@ -4115,12 +4116,6 @@ BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) if (unlikely(data_end > data_hard_end)) return -EINVAL; - /* ALL drivers MUST init xdp->frame_sz, chicken check below */ - if (unlikely(xdp->frame_sz > PAGE_SIZE)) { - WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); - return -EINVAL; - } - if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; @@ -5803,6 +5798,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib_table *tb; + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; @@ -5936,6 +5937,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib6_table *tb; + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; @@ -6008,7 +6015,7 @@ set_fwd_params: #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ - BPF_FIB_LOOKUP_SKIP_NEIGH) + BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) @@ -6555,12 +6562,11 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, static struct sock * __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) + u64 flags, int sdif) { struct sock *sk = NULL; struct net *net; u8 family; - int sdif; if (len == sizeof(tuple->ipv4)) family = AF_INET; @@ -6572,10 +6578,12 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; - if (family == AF_INET) - sdif = inet_sdif(skb); - else - sdif = inet6_sdif(skb); + if (sdif < 0) { + if (family == AF_INET) + sdif = inet_sdif(skb); + else + sdif = inet6_sdif(skb); + } if ((s32)netns_id < 0) { net = caller_net; @@ -6595,10 +6603,11 @@ out: static struct sock * __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) + u64 flags, int sdif) { struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, - ifindex, proto, netns_id, flags); + ifindex, proto, netns_id, flags, + sdif); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); @@ -6638,7 +6647,7 @@ bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, } return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, - netns_id, flags); + netns_id, flags, -1); } static struct sock * @@ -6727,6 +6736,78 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { + .func = bpf_tc_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { + .func = bpf_tc_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { + .func = bpf_tc_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_sk_release, struct sock *, sk) { if (sk && sk_is_refcounted(sk)) @@ -6744,12 +6825,13 @@ static const struct bpf_func_proto bpf_sk_release_proto = { BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { - struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, - flags); + flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { @@ -6767,12 +6849,13 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { - struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, - flags); + flags, sdif); } static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { @@ -6790,12 +6873,13 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { - struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, - flags); + flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { @@ -6815,7 +6899,8 @@ BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, { return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_TCP, netns_id, flags); + IPPROTO_TCP, netns_id, flags, + -1); } static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { @@ -6834,7 +6919,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, - netns_id, flags); + netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { @@ -6853,7 +6938,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_UDP, - netns_id, flags); + netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { @@ -6916,6 +7001,8 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, FIELD)); \ } while (0) + BTF_TYPE_EMIT(struct bpf_tcp_sock); + switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != @@ -7265,6 +7352,8 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) return -ENETUNREACH; if (unlikely(sk_fullsock(sk) && sk->sk_reuseport)) return -ESOCKTNOSUPPORT; + if (sk_unhashed(sk)) + return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; @@ -7980,9 +8069,9 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: - return &bpf_sk_lookup_tcp_proto; + return &bpf_tc_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: - return &bpf_sk_lookup_udp_proto; + return &bpf_tc_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: @@ -7990,7 +8079,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: - return &bpf_skc_lookup_tcp_proto; + return &bpf_tc_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: @@ -11721,3 +11810,66 @@ static int __init bpf_kfunc_init(void) return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); } late_initcall(bpf_kfunc_init); + +/* Disables missing prototype warnings */ +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. + * + * The function expects a non-NULL pointer to a socket, and invokes the + * protocol specific socket destroy handlers. + * + * The helper can only be called from BPF contexts that have acquired the socket + * locks. + * + * Parameters: + * @sock: Pointer to socket to be destroyed + * + * Return: + * On error, may return EPROTONOSUPPORT, EINVAL. + * EPROTONOSUPPORT if protocol specific destroy handler is not supported. + * 0 otherwise + */ +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) +{ + struct sock *sk = (struct sock *)sock; + + /* The locking semantics that allow for synchronous execution of the + * destroy handlers are only supported for TCP and UDP. + * Supporting protocols will need to acquire sock lock in the BPF context + * prior to invoking this kfunc. + */ + if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && + sk->sk_protocol != IPPROTO_UDP)) + return -EOPNOTSUPP; + + return sk->sk_prot->diag_destroy(sk, ECONNABORTED); +} + +__diag_pop() + +BTF_SET8_START(bpf_sk_iter_kfunc_ids) +BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_SET8_END(bpf_sk_iter_kfunc_ids) + +static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && + prog->expected_attach_type != BPF_TRACE_ITER) + return -EACCES; + return 0; +} + +static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_sk_iter_kfunc_ids, + .filter = tracing_iter_filter, +}; + +static int init_subsystem(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); +} +late_initcall(init_subsystem); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 25fb0bbc310f..6bed3992df81 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -27,6 +27,7 @@ #include <linux/tcp.h> #include <linux/ptp_classify.h> #include <net/flow_dissector.h> +#include <net/pkt_cls.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> #include <linux/bpf.h> @@ -241,6 +242,15 @@ void skb_flow_dissect_meta(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_META, target_container); meta->ingress_ifindex = skb->skb_iif; +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (tc_skb_ext_tc_enabled()) { + struct tc_skb_ext *ext; + + ext = skb_ext_find(skb, TC_SKB_EXT); + if (ext) + meta->l2_miss = ext->l2_miss; + } +#endif } EXPORT_SYMBOL(skb_flow_dissect_meta); @@ -548,6 +558,30 @@ __skb_flow_dissect_arp(const struct sk_buff *skb, } static enum flow_dissect_ret +__skb_flow_dissect_cfm(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_cfm *key, *hdr, _hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM)) + return FLOW_DISSECT_RET_OUT_GOOD; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM, + target_container); + + key->mdl_ver = hdr->mdl_ver; + key->opcode = hdr->opcode; + + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, @@ -1390,6 +1424,12 @@ proto_again: break; } + case htons(ETH_P_CFM): + fdret = __skb_flow_dissect_cfm(skb, flow_dissector, + target_container, data, + nhoff, hlen); + break; + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; @@ -1740,8 +1780,7 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb) memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, - &keys, NULL, 0, 0, 0, - FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } diff --git a/net/core/gro.c b/net/core/gro.c index 2d84165cb4f1..0759277dc14e 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -10,7 +10,7 @@ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); -static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); +struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); /* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ int gro_normal_batch __read_mostly = 8; @@ -92,63 +92,6 @@ void dev_remove_offload(struct packet_offload *po) } EXPORT_SYMBOL(dev_remove_offload); -/** - * skb_eth_gso_segment - segmentation handler for ethernet protocols. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * @type: Ethernet Protocol ID - */ -struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, - netdev_features_t features, __be16 type) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - segs = ptype->callbacks.gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - return segs; -} -EXPORT_SYMBOL(skb_eth_gso_segment); - -/** - * skb_mac_gso_segment - mac layer segmentation handler. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - int vlan_depth = skb->mac_len; - __be16 type = skb_network_protocol(skb, &vlan_depth); - - if (unlikely(!type)) - return ERR_PTR(-EINVAL); - - __skb_pull(skb, vlan_depth); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - segs = ptype->callbacks.gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - __skb_push(skb, skb->data - skb_mac_header(skb)); - - return segs; -} -EXPORT_SYMBOL(skb_mac_gso_segment); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { @@ -239,9 +182,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; - __skb_frag_set_page(frag, page); - skb_frag_off_set(frag, first_offset); - skb_frag_size_set(frag, first_size); + skb_frag_fill_page_desc(frag, page, first_offset, first_size); memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); /* We dont need to clear skbinfo->nr_frags here */ @@ -363,6 +304,24 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) } EXPORT_SYMBOL(napi_gro_flush); +static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, + const struct sk_buff *p, + unsigned long diffs) +{ +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; + + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); + + diffs |= (!!p_ext) ^ (!!skb_ext); + if (!diffs && unlikely(skb_ext)) + diffs |= p_ext->chain ^ skb_ext->chain; +#endif + return diffs; +} + static void gro_list_prepare(const struct list_head *head, const struct sk_buff *skb) { @@ -397,23 +356,11 @@ static void gro_list_prepare(const struct list_head *head, * avoid trying too hard to skip each of them individually */ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - struct tc_skb_ext *skb_ext; - struct tc_skb_ext *p_ext; -#endif - diffs |= p->sk != skb->sk; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext = skb_ext_find(skb, TC_SKB_EXT); - p_ext = skb_ext_find(p, TC_SKB_EXT); - - diffs |= (!!p_ext) ^ (!!skb_ext); - if (!diffs && unlikely(skb_ext)) - diffs |= p_ext->chain ^ skb_ext->chain; -#endif + diffs |= gro_list_prepare_tc_ext(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; @@ -460,6 +407,14 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } +static void gro_try_pull_from_frag0(struct sk_buff *skb) +{ + int grow = skb_gro_offset(skb) - skb_headlen(skb); + + if (grow > 0) + gro_pull_from_frag0(skb, grow); +} + static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) { struct sk_buff *oldest; @@ -489,7 +444,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; - int grow; if (netif_elide_gro(skb->dev)) goto normal; @@ -564,17 +518,14 @@ found_ptype: else gro_list->count++; + /* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */ + gro_try_pull_from_frag0(skb); NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; if (!skb_is_gso(skb)) skb_shinfo(skb)->gso_size = skb_gro_len(skb); list_add(&skb->list, &gro_list->list); ret = GRO_HELD; - -pull: - grow = skb_gro_offset(skb) - skb_headlen(skb); - if (grow > 0) - gro_pull_from_frag0(skb, grow); ok: if (gro_list->count) { if (!test_bit(bucket, &napi->gro_bitmask)) @@ -587,7 +538,8 @@ ok: normal: ret = GRO_NORMAL; - goto pull; + gro_try_pull_from_frag0(skb); + goto ok; } struct packet_offload *gro_find_receive_by_type(__be16 type) diff --git a/net/core/gso.c b/net/core/gso.c new file mode 100644 index 000000000000..9e1803bfc9c6 --- /dev/null +++ b/net/core/gso.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/skbuff.h> +#include <linux/sctp.h> +#include <net/gso.h> +#include <net/gro.h> + +/** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, vlan_depth); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_mac_gso_segment); +/* openvswitch calls this on rx path, so we need a different check. + */ +static bool skb_needs_check(const struct sk_buff *skb, bool tx_path) +{ + if (tx_path) + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; + + return skb->ip_summed == CHECKSUM_NONE; +} + +/** + * __skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @tx_path: whether it is called in TX path + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. + */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path) +{ + struct sk_buff *segs; + + if (unlikely(skb_needs_check(skb, tx_path))) { + int err; + + /* We're going to init ->check field in TCP or UDP header */ + err = skb_cow_head(skb, 0); + if (err < 0) + return ERR_PTR(err); + } + + /* Only report GSO partial support if it will enable us to + * support segmentation on this frame without needing additional + * work. + */ + if (features & NETIF_F_GSO_PARTIAL) { + netdev_features_t partial_features = NETIF_F_GSO_ROBUST; + struct net_device *dev = skb->dev; + + partial_features |= dev->features & dev->gso_partial_features; + if (!skb_gso_ok(skb, features | partial_features)) + features &= ~NETIF_F_GSO_PARTIAL; + } + + BUILD_BUG_ON(SKB_GSO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + segs = skb_mac_gso_segment(skb, features); + + if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) + skb_warn_bad_offload(skb); + + return segs; +} +EXPORT_SYMBOL(__skb_gso_segment); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. + */ +static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int thlen = 0; + + if (skb->encapsulation) { + thlen = skb_inner_transport_header(skb) - + skb_transport_header(skb); + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + thlen += inner_tcp_hdrlen(skb); + } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + thlen = tcp_hdrlen(skb); + } else if (unlikely(skb_is_gso_sctp(skb))) { + thlen = sizeof(struct sctphdr); + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { + thlen = sizeof(struct udphdr); + } + /* UFO sets gso_size to the size of the fragmentation + * payload, i.e. the size of the L4 (UDP) header is already + * accounted for. + */ + return thlen + shinfo->gso_size; +} + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. + */ +static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_mac_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_mac_seglen is used to determine the real size of the + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 + * headers (TCP/UDP). + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS + * + * There are a couple of instances where we have a GSO skb, and we + * want to determine what size it would be after it is segmented. + * + * We might want to check: + * - L3+L4+payload size (e.g. IP forwarding) + * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) + * + * This is a helper to do that correctly considering GSO_BY_FRAGS. + * + * @skb: GSO skb + * + * @seg_len: The segmented length (from skb_gso_*_seglen). In the + * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. + * + * @max_len: The maximum permissible length. + * + * Returns true if the segmented length <= max length. + */ +static inline bool skb_gso_size_check(const struct sk_buff *skb, + unsigned int seg_len, + unsigned int max_len) { + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const struct sk_buff *iter; + + if (shinfo->gso_size != GSO_BY_FRAGS) + return seg_len <= max_len; + + /* Undo this so we can re-use header sizes */ + seg_len -= GSO_BY_FRAGS; + + skb_walk_frags(skb, iter) { + if (seg_len + skb_headlen(iter) > max_len) + return false; + } + + return true; +} + +/** + * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? + * + * @skb: GSO skb + * @mtu: MTU to validate against + * + * skb_gso_validate_network_len validates if a given skb will fit a + * wanted MTU once split. It considers L3 headers, L4 headers, and the + * payload. + */ +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) +{ + return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); + +/** + * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? + * + * @skb: GSO skb + * @len: length to validate against + * + * skb_gso_validate_mac_len validates if a given skb will fit a wanted + * length once split, including L2, L3 and L4 headers and the payload. + */ +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) +{ + return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); + diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 8b6b5e72b217..4a0797f0a154 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -60,9 +60,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, ret = BPF_OK; } else { skb_reset_mac_header(skb); - ret = skb_do_redirect(skb); - if (ret == 0) - ret = BPF_REDIRECT; + skb_do_redirect(skb); + ret = BPF_REDIRECT; } break; @@ -255,7 +254,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) - return err; + return net_xmit_errno(err); /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 805b7385dd8d..6aef976bc1da 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -63,4 +63,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); +EXPORT_TRACEPOINT_SYMBOL_GPL(udp_fail_queue_rcv_skb); + EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3e3598cd49f2..f4183c4c1ec8 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(get_net_ns_by_id); /* init code that must occur even if setup_net() is not called. */ static __net_init void preinit_net(struct net *net) { - ref_tracker_dir_init(&net->notrefcnt_tracker, 128); + ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt"); } /* @@ -322,7 +322,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) LIST_HEAD(net_exit_list); refcount_set(&net->ns.count, 1); - ref_tracker_dir_init(&net->refcnt_tracker, 128); + ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index de17ca2f7dbf..ea9231378aa6 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -8,7 +8,7 @@ #include "netdev-genl-gen.h" -#include <linux/netdev.h> +#include <uapi/linux/netdev.h> /* NETDEV_CMD_DEV_GET - do */ static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 74d74fc23167..7b370c073e7d 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -9,7 +9,7 @@ #include <net/netlink.h> #include <net/genetlink.h> -#include <linux/netdev.h> +#include <uapi/linux/netdev.h> int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e6a739b1afa9..543007f159f9 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -690,7 +690,7 @@ int netpoll_setup(struct netpoll *np) err = -ENODEV; goto unlock; } - dev_hold(ndev); + netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL); if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); @@ -783,12 +783,11 @@ put_noaddr: err = __netpoll_setup(np, ndev); if (err) goto put; - netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL); rtnl_unlock(); return 0; put: - dev_put(ndev); + netdev_put(ndev, &np->dev_tracker); unlock: rtnl_unlock(); return err; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 760238196db1..f56b8d697014 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2785,14 +2785,17 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, break; } get_page(pkt_dev->page); - skb_frag_set_page(skb, i, pkt_dev->page); - skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0); + /*last fragment, fill rest of data*/ if (i == (frags - 1)) - skb_frag_size_set(&skb_shinfo(skb)->frags[i], - (datalen < PAGE_SIZE ? datalen : PAGE_SIZE)); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], + pkt_dev->page, 0, + (datalen < PAGE_SIZE ? + datalen : PAGE_SIZE)); else - skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], + pkt_dev->page, 0, frag_len); + datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]); skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]); skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 41de3a2f29e1..00c94d9622b4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -961,24 +961,27 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + - nla_total_size(0) + /* nest IFLA_VF_STATS */ - /* IFLA_VF_STATS_RX_PACKETS */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_TX_PACKETS */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_RX_BYTES */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_TX_BYTES */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_BROADCAST */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_MULTICAST */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_RX_DROPPED */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_TX_DROPPED */ - nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + size += num_vfs * + (nla_total_size(0) + /* nest IFLA_VF_STATS */ + /* IFLA_VF_STATS_RX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_BROADCAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_MULTICAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64))); + } return size; } else return 0; @@ -1270,7 +1273,8 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, - struct nlattr *vfinfo) + struct nlattr *vfinfo, + u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; @@ -1376,33 +1380,35 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); - memset(&vf_stats, 0, sizeof(vf_stats)); - if (dev->netdev_ops->ndo_get_vf_stats) - dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, - &vf_stats); - vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); - if (!vfstats) - goto nla_put_vf_failure; - if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, - vf_stats.rx_packets, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, - vf_stats.tx_packets, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, - vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, - vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, - vf_stats.broadcast, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, - vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, - vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { - nla_nest_cancel(skb, vfstats); - goto nla_put_vf_failure; + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, + &vf_stats); + vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); + if (!vfstats) + goto nla_put_vf_failure; + if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfstats); } - nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; @@ -1435,7 +1441,7 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) return -EMSGSIZE; } @@ -2262,13 +2268,27 @@ out_err: return err; } -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, - struct netlink_ext_ack *exterr) +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr) { - return nla_parse_deprecated(tb, IFLA_MAX, head, len, ifla_policy, + const struct ifinfomsg *ifmp; + const struct nlattr *attrs; + size_t len; + + ifmp = nla_data(nla_peer); + attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); + len = nla_len(nla_peer) - sizeof(struct ifinfomsg); + + if (ifmp->ifi_index < 0) { + NL_SET_ERR_MSG_ATTR(exterr, nla_peer, + "ifindex can't be negative"); + return -EINVAL; + } + + return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } -EXPORT_SYMBOL(rtnl_nla_parse_ifla); +EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { @@ -2377,45 +2397,43 @@ static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { - if (dev) { - if (tb[IFLA_ADDRESS] && - nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) - return -EINVAL; + if (tb[IFLA_ADDRESS] && + nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) + return -EINVAL; - if (tb[IFLA_BROADCAST] && - nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) - return -EINVAL; + if (tb[IFLA_BROADCAST] && + nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) + return -EINVAL; - if (tb[IFLA_GSO_MAX_SIZE] && - nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { - NL_SET_ERR_MSG(extack, "too big gso_max_size"); - return -EINVAL; - } + if (tb[IFLA_GSO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_max_size"); + return -EINVAL; + } - if (tb[IFLA_GSO_MAX_SEGS] && - (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || - nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { - NL_SET_ERR_MSG(extack, "too big gso_max_segs"); - return -EINVAL; - } + if (tb[IFLA_GSO_MAX_SEGS] && + (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || + nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { + NL_SET_ERR_MSG(extack, "too big gso_max_segs"); + return -EINVAL; + } - if (tb[IFLA_GRO_MAX_SIZE] && - nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "too big gro_max_size"); - return -EINVAL; - } + if (tb[IFLA_GRO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_max_size"); + return -EINVAL; + } - if (tb[IFLA_GSO_IPV4_MAX_SIZE] && - nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { - NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); - return -EINVAL; - } + if (tb[IFLA_GSO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); + return -EINVAL; + } - if (tb[IFLA_GRO_IPV4_MAX_SIZE] && - nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); - return -EINVAL; - } + if (tb[IFLA_GRO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); + return -EINVAL; } if (tb[IFLA_AF_SPEC]) { @@ -2736,10 +2754,6 @@ static int do_setlink(const struct sk_buff *skb, char ifname[IFNAMSIZ]; int err; - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - return err; - if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -3156,6 +3170,10 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } + err = validate_linkmsg(dev, tb, extack); + if (err < 0) + goto errout; + err = do_setlink(skb, dev, ifm, extack, tb, 0); errout: return err; @@ -3399,6 +3417,9 @@ static int rtnl_group_changelink(const struct sk_buff *skb, for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { + err = validate_linkmsg(dev, tb, extack); + if (err < 0) + return err; err = do_setlink(skb, dev, ifm, extack, tb, 0); if (err < 0) return err; @@ -3540,6 +3561,9 @@ replay: if (ifm->ifi_index > 0) { link_specified = true; dev = __dev_get_by_index(net, ifm->ifi_index); + } else if (ifm->ifi_index < 0) { + NL_SET_ERR_MSG(extack, "ifindex can't be negative"); + return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; dev = rtnl_dev_get(net, tb); @@ -3556,10 +3580,6 @@ replay: m_ops = master_dev->rtnl_link_ops; } - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - return err; - if (tb[IFLA_LINKINFO]) { err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], @@ -3623,6 +3643,10 @@ replay: if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + err = validate_linkmsg(dev, tb, extack); + if (err < 0) + return err; + if (linkinfo[IFLA_INFO_DATA]) { if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) @@ -4090,7 +4114,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) + if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) @@ -4104,10 +4128,10 @@ nla_put_failure: return -EMSGSIZE; } -static inline size_t rtnl_fdb_nlmsg_size(void) +static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + - nla_total_size(ETH_ALEN) + /* NDA_LLADDR */ + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } @@ -4119,7 +4143,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC); + skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; @@ -5133,13 +5157,17 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !have_flags) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; have_flags = true; flags = nla_get_u16(attr); - break; + } + + if (nla_type(attr) == IFLA_BRIDGE_MODE) { + if (nla_len(attr) < sizeof(u16)) + return -EINVAL; } } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cea28d30abb5..7dfae58055c2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -67,6 +67,7 @@ #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/gso.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/mpls.h> @@ -92,15 +93,7 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init; static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -/* skb_small_head_cache and related code is only supported - * for CONFIG_SLAB and CONFIG_SLUB. - * As soon as SLOB is removed from the kernel, we can clean up this. - */ -#if !defined(CONFIG_SLOB) -# define HAVE_SKB_SMALL_HEAD_CACHE 1 -#endif -#ifdef HAVE_SKB_SMALL_HEAD_CACHE static struct kmem_cache *skb_small_head_cache __ro_after_init; #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) @@ -117,7 +110,6 @@ static struct kmem_cache *skb_small_head_cache __ro_after_init; #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -#endif /* HAVE_SKB_SMALL_HEAD_CACHE */ int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -558,11 +550,10 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, bool *pfmemalloc) { bool ret_pfmemalloc = false; - unsigned int obj_size; + size_t obj_size; void *obj; obj_size = SKB_HEAD_ALIGN(*size); -#ifdef HAVE_SKB_SMALL_HEAD_CACHE if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && !(flags & KMALLOC_NOT_NORMAL_BITS)) { obj = kmem_cache_alloc_node(skb_small_head_cache, @@ -576,8 +567,13 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node); goto out; } -#endif - *size = obj_size = kmalloc_size_roundup(obj_size); + + obj_size = kmalloc_size_roundup(obj_size); + /* The following cast might truncate high-order bits of obj_size, this + * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. + */ + *size = (unsigned int)obj_size; + /* * Try a regular allocation, when that fails and we're not entitled * to the reserves, fail. @@ -898,11 +894,9 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) static void skb_kfree_head(void *head, unsigned int end_offset) { -#ifdef HAVE_SKB_SMALL_HEAD_CACHE if (end_offset == SKB_SMALL_HEAD_HEADROOM) kmem_cache_free(skb_small_head_cache, head); else -#endif kfree(head); } @@ -2160,7 +2154,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) if (likely(skb_end_offset(skb) == saved_end_offset)) return 0; -#ifdef HAVE_SKB_SMALL_HEAD_CACHE /* We can not change skb->end if the original or new value * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). */ @@ -2174,7 +2167,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) WARN_ON_ONCE(1); return 0; } -#endif shinfo = skb_shinfo(skb); @@ -3003,32 +2995,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, } EXPORT_SYMBOL_GPL(skb_splice_bits); -static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, - struct kvec *vec, size_t num, size_t size) +static int sendmsg_locked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; + size_t size = msg_data_left(msg); if (!sock) return -EINVAL; - return kernel_sendmsg(sock, msg, vec, num, size); + + if (!sock->ops->sendmsg_locked) + return sock_no_sendmsg_locked(sk, msg, size); + + return sock->ops->sendmsg_locked(sk, msg, size); } -static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; if (!sock) return -EINVAL; - return kernel_sendpage(sock, page, offset, size, flags); + return sock_sendmsg(sock, msg); } -typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, - struct kvec *vec, size_t num, size_t size); -typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, - size_t size, int flags); +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, - int len, sendmsg_func sendmsg, sendpage_func sendpage) + int len, sendmsg_func sendmsg) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -3048,8 +3040,9 @@ do_frag_list: memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; - ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, - sendmsg_unlocked, sk, &msg, &kv, 1, slen); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); + ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, + sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; @@ -3080,11 +3073,18 @@ do_frag_list: slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { - ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, - sendpage_unlocked, sk, - skb_frag_page(frag), - skb_frag_off(frag) + offset, - slen, MSG_DONTWAIT); + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, + }; + + bvec_set_page(&bvec, skb_frag_page(frag), slen, + skb_frag_off(frag) + offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, + slen); + + ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, + sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; @@ -3121,16 +3121,14 @@ error: int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, - kernel_sendpage_locked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, - sendpage_unlocked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked); } /** @@ -4203,13 +4201,13 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, EXPORT_SYMBOL(skb_find_text); int skb_append_pagefrags(struct sk_buff *skb, struct page *page, - int offset, size_t size) + int offset, size_t size, size_t max_frags) { int i = skb_shinfo(skb)->nr_frags; if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); - } else if (i < MAX_SKB_FRAGS) { + } else if (i < max_frags) { skb_zcopy_downgrade_managed(skb); get_page(page); skb_fill_page_desc_noacc(skb, i, page, offset, size); @@ -4249,10 +4247,9 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) struct page *page; page = virt_to_head_page(frag_skb->head); - __skb_frag_set_page(&head_frag, page); - skb_frag_off_set(&head_frag, frag_skb->data - - (unsigned char *)page_address(page)); - skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); + skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - + (unsigned char *)page_address(page), + skb_headlen(frag_skb)); return head_frag; } @@ -4270,6 +4267,11 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(skb, -skb_network_offset(skb) + offset); + /* Ensure the head is writeable before touching the shared info */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) + goto err_linearize; + skb_shinfo(skb)->frag_list = NULL; while (list_skb) { @@ -4358,21 +4360,20 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; - skb_frag_t *frag = skb_shinfo(head_skb)->frags; unsigned int mss = skb_shinfo(head_skb)->gso_size; unsigned int doffset = head_skb->data - skb_mac_header(head_skb); - struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int partial_segs = 0; unsigned int headroom; unsigned int len = head_skb->len; + struct sk_buff *frag_skb; + skb_frag_t *frag; __be16 proto; bool csum, sg; - int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int i = 0; - int pos; + int nfrags, pos; if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { @@ -4449,6 +4450,13 @@ normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); + if (skb_orphan_frags(head_skb, GFP_ATOMIC)) + return ERR_PTR(-ENOMEM); + + nfrags = skb_shinfo(head_skb)->nr_frags; + frag = skb_shinfo(head_skb)->frags; + frag_skb = head_skb; + do { struct sk_buff *nskb; skb_frag_t *nskb_frag; @@ -4469,6 +4477,10 @@ normal: (skb_headlen(list_skb) == len || sg)) { BUG_ON(skb_headlen(list_skb) > len); + nskb = skb_clone(list_skb, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; @@ -4487,12 +4499,8 @@ normal: frag++; } - nskb = skb_clone(list_skb, GFP_ATOMIC); list_skb = list_skb->next; - if (unlikely(!nskb)) - goto err; - if (unlikely(pskb_trim(nskb, len))) { kfree_skb(nskb); goto err; @@ -4568,12 +4576,16 @@ normal: skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & SKBFL_SHARED_FRAG; - if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || - skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) goto err; while (pos < offset + len) { if (i >= nfrags) { + if (skb_orphan_frags(list_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, list_skb, + GFP_ATOMIC)) + goto err; + i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; @@ -4587,10 +4599,6 @@ normal: i--; frag--; } - if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || - skb_zerocopy_clone(nskb, frag_skb, - GFP_ATOMIC)) - goto err; list_skb = list_skb->next; } @@ -4768,7 +4776,6 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -#ifdef HAVE_SKB_SMALL_HEAD_CACHE /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. * struct skb_shared_info is located at the end of skb->head, * and should not be copied to/from user. @@ -4780,7 +4787,6 @@ void __init skb_init(void) 0, SKB_SMALL_HEAD_HEADROOM, NULL); -#endif skb_extensions_init(); } @@ -5143,7 +5149,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) serr->ee.ee_data -= atomic_read(&sk->sk_tskey); @@ -5199,21 +5205,23 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, { struct sk_buff *skb; bool tsonly, opt_stats = false; + u32 tsflags; if (!sk) return; - if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + tsflags = READ_ONCE(sk->sk_tsflags); + if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) return; - tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) { #ifdef CONFIG_INET - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); @@ -5784,147 +5792,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) } EXPORT_SYMBOL_GPL(skb_scrub_packet); -/** - * skb_gso_transport_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_transport_seglen is used to determine the real size of the - * individual segments, including Layer4 headers (TCP/UDP). - * - * The MAC/L2 or network (IP, IPv6) headers are not accounted for. - */ -static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) -{ - const struct skb_shared_info *shinfo = skb_shinfo(skb); - unsigned int thlen = 0; - - if (skb->encapsulation) { - thlen = skb_inner_transport_header(skb) - - skb_transport_header(skb); - - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - thlen += inner_tcp_hdrlen(skb); - } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { - thlen = tcp_hdrlen(skb); - } else if (unlikely(skb_is_gso_sctp(skb))) { - thlen = sizeof(struct sctphdr); - } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { - thlen = sizeof(struct udphdr); - } - /* UFO sets gso_size to the size of the fragmentation - * payload, i.e. the size of the L4 (UDP) header is already - * accounted for. - */ - return thlen + shinfo->gso_size; -} - -/** - * skb_gso_network_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_network_seglen is used to determine the real size of the - * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). - * - * The MAC/L2 header is not accounted for. - */ -static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) -{ - unsigned int hdr_len = skb_transport_header(skb) - - skb_network_header(skb); - - return hdr_len + skb_gso_transport_seglen(skb); -} - -/** - * skb_gso_mac_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_mac_seglen is used to determine the real size of the - * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 - * headers (TCP/UDP). - */ -static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) -{ - unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - - return hdr_len + skb_gso_transport_seglen(skb); -} - -/** - * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS - * - * There are a couple of instances where we have a GSO skb, and we - * want to determine what size it would be after it is segmented. - * - * We might want to check: - * - L3+L4+payload size (e.g. IP forwarding) - * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) - * - * This is a helper to do that correctly considering GSO_BY_FRAGS. - * - * @skb: GSO skb - * - * @seg_len: The segmented length (from skb_gso_*_seglen). In the - * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. - * - * @max_len: The maximum permissible length. - * - * Returns true if the segmented length <= max length. - */ -static inline bool skb_gso_size_check(const struct sk_buff *skb, - unsigned int seg_len, - unsigned int max_len) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); - const struct sk_buff *iter; - - if (shinfo->gso_size != GSO_BY_FRAGS) - return seg_len <= max_len; - - /* Undo this so we can re-use header sizes */ - seg_len -= GSO_BY_FRAGS; - - skb_walk_frags(skb, iter) { - if (seg_len + skb_headlen(iter) > max_len) - return false; - } - - return true; -} - -/** - * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? - * - * @skb: GSO skb - * @mtu: MTU to validate against - * - * skb_gso_validate_network_len validates if a given skb will fit a - * wanted MTU once split. It considers L3 headers, L4 headers, and the - * payload. - */ -bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) -{ - return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); -} -EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); - -/** - * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? - * - * @skb: GSO skb - * @len: length to validate against - * - * skb_gso_validate_mac_len validates if a given skb will fit a wanted - * length once split, including L2, L3 and L4 headers and the payload. - */ -bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) -{ - return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); -} -EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); - static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { int mac_len, meta_len; @@ -6912,3 +6779,91 @@ nodefer: __kfree_skb(skb); if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) smp_call_function_single_async(cpu, &sd->defer_csd); } + +static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, + size_t offset, size_t len) +{ + const char *kaddr; + __wsum csum; + + kaddr = kmap_local_page(page); + csum = csum_partial(kaddr + offset, len, 0); + kunmap_local(kaddr); + skb->csum = csum_block_add(skb->csum, csum, skb->len); +} + +/** + * skb_splice_from_iter - Splice (or copy) pages to skbuff + * @skb: The buffer to add pages to + * @iter: Iterator representing the pages to be added + * @maxsize: Maximum amount of pages to be added + * @gfp: Allocation flags + * + * This is a common helper function for supporting MSG_SPLICE_PAGES. It + * extracts pages from an iterator and adds them to the socket buffer if + * possible, copying them to fragments if not possible (such as if they're slab + * pages). + * + * Returns the amount of data spliced/copied or -EMSGSIZE if there's + * insufficient space in the buffer to transfer anything. + */ +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, + ssize_t maxsize, gfp_t gfp) +{ + size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + struct page *pages[8], **ppages = pages; + ssize_t spliced = 0, ret = 0; + unsigned int i; + + while (iter->count > 0) { + ssize_t space, nr, len; + size_t off; + + ret = -EMSGSIZE; + space = frag_limit - skb_shinfo(skb)->nr_frags; + if (space < 0) + break; + + /* We might be able to coalesce without increasing nr_frags */ + nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); + + len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); + if (len <= 0) { + ret = len ?: -EIO; + break; + } + + i = 0; + do { + struct page *page = pages[i++]; + size_t part = min_t(size_t, PAGE_SIZE - off, len); + + ret = -EIO; + if (WARN_ON_ONCE(!sendpage_ok(page))) + goto out; + + ret = skb_append_pagefrags(skb, page, off, part, + frag_limit); + if (ret < 0) { + iov_iter_revert(iter, len); + goto out; + } + + if (skb->ip_summed == CHECKSUM_NONE) + skb_splice_csum_page(skb, page, off, part); + + off = 0; + spliced += part; + maxsize -= part; + len -= part; + } while (len > 0); + + if (maxsize <= 0) + break; + } + +out: + skb_len_add(skb, spliced); + return spliced ?: ret; +} +EXPORT_SYMBOL(skb_splice_from_iter); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a29508e1ff35..a746dbc2f887 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -612,12 +612,18 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { + int err = 0; + if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } - return sk_psock_skb_ingress(psock, skb, off, len); + skb_get(skb); + err = sk_psock_skb_ingress(psock, skb, off, len); + if (err < 0) + kfree_skb(skb); + return err; } static void sk_psock_skb_state(struct sk_psock *psock, @@ -685,9 +691,7 @@ static void sk_psock_backlog(struct work_struct *work) } while (len); skb = skb_dequeue(&psock->ingress_skb); - if (!ingress) { - kfree_skb(skb); - } + kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); @@ -1120,13 +1124,19 @@ static void sk_psock_strp_data_ready(struct sock *sk) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { + int ret; + static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; - return strp_init(&psock->strp, sk, &cb); + ret = strp_init(&psock->strp, sk, &cb); + if (!ret) + sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + + return ret; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) @@ -1154,7 +1164,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.stream_parser) + if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) strp_done(&psock->strp); } #else diff --git a/net/core/sock.c b/net/core/sock.c index 6e5662ca00fe..eef27812013a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -114,6 +114,9 @@ #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> +#include <linux/mroute.h> +#include <linux/mroute6.h> +#include <linux/icmpv6.h> #include <linux/uaccess.h> @@ -138,6 +141,7 @@ #include <net/tcp.h> #include <net/busy_poll.h> +#include <net/phonet/phonet.h> #include <linux/ethtool.h> @@ -425,6 +429,7 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, { struct __kernel_sock_timeval tv; int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); + long val; if (err) return err; @@ -435,7 +440,7 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, if (tv.tv_sec < 0) { static int warned __read_mostly; - *timeo_p = 0; + WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", @@ -443,11 +448,12 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, } return 0; } - *timeo_p = MAX_SCHEDULE_TIMEOUT; - if (tv.tv_sec == 0 && tv.tv_usec == 0) - return 0; - if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) - *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); + val = MAX_SCHEDULE_TIMEOUT; + if ((tv.tv_sec || tv.tv_usec) && + (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) + val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, + USEC_PER_SEC / HZ); + WRITE_ONCE(*timeo_p, val); return 0; } @@ -759,7 +765,8 @@ bool sk_mc_loop(struct sock *sk) return false; if (!sk) return true; - switch (sk->sk_family) { + /* IPV6_ADDRFORM can change sk->sk_family under us. */ + switch (READ_ONCE(sk->sk_family)) { case AF_INET: return inet_sk(sk)->mc_loop; #if IS_ENABLED(CONFIG_IPV6) @@ -791,7 +798,7 @@ EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); - sk->sk_lingertime = 0; + WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } @@ -800,7 +807,7 @@ EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { lock_sock(sk); - sk->sk_priority = priority; + WRITE_ONCE(sk->sk_priority, priority); release_sock(sk); } EXPORT_SYMBOL(sock_set_priority); @@ -809,9 +816,9 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs) { lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) - sk->sk_sndtimeo = secs * HZ; + WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else - sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); @@ -887,7 +894,7 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) if (!match) return -EINVAL; - sk->sk_bind_phc = phc_index; + WRITE_ONCE(sk->sk_bind_phc, phc_index); return 0; } @@ -930,7 +937,7 @@ int sock_set_timestamping(struct sock *sk, int optname, return ret; } - sk->sk_tsflags = val; + WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) @@ -984,7 +991,7 @@ EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { - sk->sk_mark = val; + WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } @@ -1003,7 +1010,7 @@ static void sock_release_reserved_memory(struct sock *sk, int bytes) bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); - sk->sk_reserved_mem -= bytes; + WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); sk_mem_reclaim(sk); } @@ -1038,9 +1045,10 @@ static int sock_reserve_memory(struct sock *sk, int bytes) mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } - sk->sk_forward_alloc += pages << PAGE_SHIFT; + sk_forward_alloc_add(sk, pages << PAGE_SHIFT); - sk->sk_reserved_mem += pages << PAGE_SHIFT; + WRITE_ONCE(sk->sk_reserved_mem, + sk->sk_reserved_mem + (pages << PAGE_SHIFT)); return 0; } @@ -1209,7 +1217,7 @@ set_sndbuf: if ((val >= 0 && val <= 6) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - sk->sk_priority = val; + WRITE_ONCE(sk->sk_priority, val); else ret = -EPERM; break; @@ -1223,15 +1231,15 @@ set_sndbuf: ret = -EFAULT; break; } - if (!ling.l_onoff) + if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); - else { -#if (BITS_PER_LONG == 32) - if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) - sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + } else { + unsigned long t_sec = ling.l_linger; + + if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) + WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else -#endif - sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; @@ -1246,6 +1254,13 @@ set_sndbuf: clear_bit(SOCK_PASSCRED, &sock->flags); break; + case SO_PASSPIDFD: + if (valbool) + set_bit(SOCK_PASSPIDFD, &sock->flags); + else + clear_bit(SOCK_PASSPIDFD, &sock->flags); + break; + case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: @@ -1427,7 +1442,8 @@ set_sndbuf: cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - sk->sk_max_pacing_rate = ulval; + /* Pairs with READ_ONCE() from sk_getsockopt() */ + WRITE_ONCE(sk->sk_max_pacing_rate, ulval); sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); break; } @@ -1522,7 +1538,9 @@ set_sndbuf: } if ((u8)val == SOCK_TXREHASH_DEFAULT) val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - /* Paired with READ_ONCE() in tcp_rtx_synack() */ + /* Paired with READ_ONCE() in tcp_rtx_synack() + * and sk_getsockopt(). + */ WRITE_ONCE(sk->sk_txrehash, (u8)val); break; @@ -1622,11 +1640,11 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_SNDBUF: - v.val = sk->sk_sndbuf; + v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: - v.val = sk->sk_rcvbuf; + v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: @@ -1668,13 +1686,13 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PRIORITY: - v.val = sk->sk_priority; + v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); - v.ling.l_linger = sk->sk_lingertime / HZ; + v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: @@ -1700,22 +1718,24 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_TIMESTAMPING_OLD: lv = sizeof(v.timestamping); - v.timestamping.flags = sk->sk_tsflags; - v.timestamping.bind_phc = sk->sk_bind_phc; + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); + v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: - lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); + lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, + SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: - lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); + lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, + SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: - v.val = sk->sk_rcvlowat; + v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: @@ -1726,6 +1746,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; + case SO_PASSPIDFD: + v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); + break; + case SO_PEERCRED: { struct ucred peercred; @@ -1741,6 +1765,39 @@ int sk_getsockopt(struct sock *sk, int level, int optname, goto lenout; } + case SO_PEERPIDFD: + { + struct pid *peer_pid; + struct file *pidfd_file = NULL; + int pidfd; + + if (len > sizeof(pidfd)) + len = sizeof(pidfd); + + spin_lock(&sk->sk_peer_lock); + peer_pid = get_pid(sk->sk_peer_pid); + spin_unlock(&sk->sk_peer_lock); + + if (!peer_pid) + return -ENODATA; + + pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); + put_pid(peer_pid); + if (pidfd < 0) + return pidfd; + + if (copy_to_sockptr(optval, &pidfd, len) || + copy_to_sockptr(optlen, &len, sizeof(int))) { + put_unused_fd(pidfd); + fput(pidfd_file); + + return -EFAULT; + } + + fd_install(pidfd, pidfd_file); + return 0; + } + case SO_PEERGROUPS: { const struct cred *cred; @@ -1767,14 +1824,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_PEERNAME: { - char address[128]; + struct sockaddr_storage address; - lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); + lv = sock->ops->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_sockptr(optval, address, len)) + if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } @@ -1795,7 +1852,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, optval, optlen, len); case SO_MARK: - v.val = sk->sk_mark; + v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: @@ -1814,7 +1871,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!sock->ops->set_peek_off) return -EOPNOTSUPP; - v.val = sk->sk_peek_off; + v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); @@ -1844,7 +1901,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: - v.val = sk->sk_ll_usec; + v.val = READ_ONCE(sk->sk_ll_usec); break; case SO_PREFER_BUSY_POLL: v.val = READ_ONCE(sk->sk_prefer_busy_poll); @@ -1852,12 +1909,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, #endif case SO_MAX_PACING_RATE: + /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); - v.ulval = sk->sk_max_pacing_rate; + v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ - v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); + v.val = min_t(unsigned long, ~0U, + READ_ONCE(sk->sk_max_pacing_rate)); } break; @@ -1925,11 +1984,12 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_RESERVE_MEM: - v.val = sk->sk_reserved_mem; + v.val = READ_ONCE(sk->sk_reserved_mem); break; case SO_TXREHASH: - v.val = sk->sk_txrehash; + /* Paired with WRITE_ONCE() in sk_setsockopt() */ + v.val = READ_ONCE(sk->sk_txrehash); break; default: @@ -2550,13 +2610,24 @@ kuid_t sock_i_uid(struct sock *sk) } EXPORT_SYMBOL(sock_i_uid); -unsigned long sock_i_ino(struct sock *sk) +unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; - read_lock_bh(&sk->sk_callback_lock); + read_lock(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock_bh(&sk->sk_callback_lock); + read_unlock(&sk->sk_callback_lock); + return ino; +} +EXPORT_SYMBOL(__sock_i_ino); + +unsigned long sock_i_ino(struct sock *sk) +{ + unsigned long ino; + + local_bh_disable(); + ino = __sock_i_ino(sk); + local_bh_enable(); return ino; } EXPORT_SYMBOL(sock_i_ino); @@ -2675,9 +2746,9 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; - if (sk->sk_err) + if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } @@ -2705,7 +2776,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, goto failure; err = -EPIPE; - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) @@ -3067,10 +3138,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) - sk->sk_forward_alloc -= amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -3089,7 +3160,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); - if (sk_under_memory_pressure(sk) && + if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } @@ -3102,14 +3173,14 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; - sk->sk_forward_alloc -= amount << PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { - sk->sk_peek_off = val; + WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); @@ -3213,36 +3284,6 @@ void __receive_sock(struct file *file) } } -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) -{ - ssize_t res; - struct msghdr msg = {.msg_flags = flags}; - struct kvec iov; - char *kaddr = kmap(page); - iov.iov_base = kaddr + offset; - iov.iov_len = size; - res = kernel_sendmsg(sock, &msg, &iov, 1, size); - kunmap(page); - return res; -} -EXPORT_SYMBOL(sock_no_sendpage); - -ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - ssize_t res; - struct msghdr msg = {.msg_flags = flags}; - struct kvec iov; - char *kaddr = kmap(page); - - iov.iov_base = kaddr + offset; - iov.iov_len = size; - res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); - kunmap(page); - return res; -} -EXPORT_SYMBOL(sock_no_sendpage_locked); - /* * Default Socket Callbacks */ @@ -3701,7 +3742,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); - mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); @@ -3998,7 +4039,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " - "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), @@ -4019,7 +4060,6 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), - proto_method_implemented(proto->sendpage), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), @@ -4040,7 +4080,7 @@ static int proto_seq_show(struct seq_file *seq, void *v) "maxhdr", "slab", "module", - "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; @@ -4100,3 +4140,63 @@ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); + +/* Copy 'size' bytes from userspace and return `size` back to userspace */ +int sock_ioctl_inout(struct sock *sk, unsigned int cmd, + void __user *arg, void *karg, size_t size) +{ + int ret; + + if (copy_from_user(karg, arg, size)) + return -EFAULT; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); + if (ret) + return ret; + + if (copy_to_user(arg, karg, size)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(sock_ioctl_inout); + +/* This is the most common ioctl prep function, where the result (4 bytes) is + * copied back to userspace if the ioctl() returns successfully. No input is + * copied from userspace as input argument. + */ +static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int ret, karg = 0; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); + if (ret) + return ret; + + return put_user(karg, (int __user *)arg); +} + +/* A wrapper around sock ioctls, which copies the data from userspace + * (depending on the protocol/ioctl), and copies back the result to userspace. + * The main motivation for this function is to pass kernel memory to the + * protocol ioctl callbacks, instead of userspace memory. + */ +int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int rc = 1; + + if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) + rc = ipmr_sk_ioctl(sk, cmd, arg); + else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) + rc = ip6mr_sk_ioctl(sk, cmd, arg); + else if (sk_is_phonet(sk)) + rc = phonet_sk_ioctl(sk, cmd, arg); + + /* If ioctl was processed, returns its value */ + if (rc <= 0) + return rc; + + /* Otherwise call the default handler */ + return sock_ioctl_out(sk, cmd, arg); +} +EXPORT_SYMBOL(sk_ioctl); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 00afb66cd095..8f07fea39d9e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -32,8 +32,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && @@ -117,7 +115,6 @@ static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); - preempt_disable(); rcu_read_lock(); } @@ -125,7 +122,6 @@ static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); - preempt_enable(); release_sock(sk); } @@ -150,13 +146,13 @@ static void sock_map_del_link(struct sock *sk, list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; - struct bpf_stab *stab = container_of(map, struct bpf_stab, - map); - if (psock->saved_data_ready && stab->progs.stream_parser) + struct sk_psock_progs *progs = sock_map_progs(map); + + if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.stream_verdict) + if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -1085,8 +1081,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) struct bpf_shtab *htab; int i, err; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); if (attr->max_entries == 0 || attr->key_size == 0 || (attr->value_size != sizeof(u32) && diff --git a/net/core/xdp.c b/net/core/xdp.c index 41e5ca8643ec..8362130bf085 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, __diag_pop(); BTF_SET8_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, 0) +#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_SET8_END(xdp_metadata_kfunc_ids) |