diff options
Diffstat (limited to 'net')
477 files changed, 7369 insertions, 19209 deletions
diff --git a/net/802/mrp.c b/net/802/mrp.c index 5b804dbe2d08..486becf6c78d 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -606,7 +606,10 @@ static void mrp_join_timer(struct timer_list *t) spin_unlock(&app->lock); mrp_queue_xmit(app); - mrp_join_timer_arm(app); + spin_lock(&app->lock); + if (likely(app->active)) + mrp_join_timer_arm(app); + spin_unlock(&app->lock); } static void mrp_periodic_timer_arm(struct mrp_applicant *app) @@ -620,11 +623,12 @@ static void mrp_periodic_timer(struct timer_list *t) struct mrp_applicant *app = from_timer(app, t, periodic_timer); spin_lock(&app->lock); - mrp_mad_event(app, MRP_EVENT_PERIODIC); - mrp_pdu_queue(app); + if (likely(app->active)) { + mrp_mad_event(app, MRP_EVENT_PERIODIC); + mrp_pdu_queue(app); + mrp_periodic_timer_arm(app); + } spin_unlock(&app->lock); - - mrp_periodic_timer_arm(app); } static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) @@ -872,6 +876,7 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) app->dev = dev; app->app = appl; app->mad = RB_ROOT; + app->active = true; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); @@ -900,6 +905,9 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) RCU_INIT_POINTER(port->applicants[appl->type], NULL); + spin_lock_bh(&app->lock); + app->active = false; + spin_unlock_bh(&app->lock); /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index a313165e7a67..4d2d501991b1 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -407,6 +407,8 @@ int vlan_vids_add_by_dev(struct net_device *dev, return 0; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { + if (!vlan_hw_filter_capable(by_dev, vid_info->proto)) + continue; err = vlan_vid_add(dev, vid_info->proto, vid_info->vid); if (err) goto unwind; @@ -417,6 +419,8 @@ unwind: list_for_each_entry_continue_reverse(vid_info, &vlan_info->vid_list, list) { + if (!vlan_hw_filter_capable(by_dev, vid_info->proto)) + continue; vlan_vid_del(dev, vid_info->proto, vid_info->vid); } @@ -436,8 +440,11 @@ void vlan_vids_del_by_dev(struct net_device *dev, if (!vlan_info) return; - list_for_each_entry(vid_info, &vlan_info->vid_list, list) + list_for_each_entry(vid_info, &vlan_info->vid_list, list) { + if (!vlan_hw_filter_capable(by_dev, vid_info->proto)) + continue; vlan_vid_del(dev, vid_info->proto, vid_info->vid); + } } EXPORT_SYMBOL(vlan_vids_del_by_dev); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 589615ec490b..0a3a16791621 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -109,8 +109,8 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ - if (veth->h_vlan_proto != vlan->vlan_proto || - vlan->flags & VLAN_FLAG_REORDER_HDR) { + if (vlan->flags & VLAN_FLAG_REORDER_HDR || + veth->h_vlan_proto != vlan->vlan_proto) { u16 vlan_tci; vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); @@ -366,7 +366,7 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCSHWTSTAMP: - if (!net_eq(dev_net(dev), &init_net)) + if (!net_eq(dev_net(dev), dev_net(real_dev))) break; /* fall through */ case SIOCGMIIPHY: diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 0db85aeb119b..99b277775257 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -118,12 +118,16 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_VLAN_INGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { + if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) + continue; m = nla_data(attr); vlan_dev_set_ingress_priority(dev, m->to, m->from); } } if (data[IFLA_VLAN_EGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { + if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) + continue; m = nla_data(attr); err = vlan_dev_set_egress_priority(dev, m->from, m->to); if (err) diff --git a/net/9p/client.c b/net/9p/client.c index b03fce66eb8d..9368a1c0196d 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -893,16 +893,13 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) struct p9_fid *fid; p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); - fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL); + fid = kzalloc(sizeof(struct p9_fid), GFP_KERNEL); if (!fid) return NULL; - memset(&fid->qid, 0, sizeof(struct p9_qid)); fid->mode = -1; fid->uid = current_fsuid(); fid->clnt = clnt; - fid->rdir = NULL; - fid->fid = 0; idr_preload(GFP_KERNEL); spin_lock_irq(&clnt->lock); diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 03593eb240d8..ef0a82dbbe04 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -228,6 +228,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, uint16_t *nwname = va_arg(ap, uint16_t *); char ***wnames = va_arg(ap, char ***); + *wnames = NULL; + errcode = p9pdu_readf(pdu, proto_version, "w", nwname); if (!errcode) { @@ -237,6 +239,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, GFP_NOFS); if (!*wnames) errcode = -ENOMEM; + else + (*wnames)[0] = NULL; } if (!errcode) { @@ -248,8 +252,10 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, proto_version, "s", &(*wnames)[i]); - if (errcode) + if (errcode) { + (*wnames)[i] = NULL; break; + } } } @@ -257,11 +263,14 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, if (*wnames) { int i; - for (i = 0; i < *nwname; i++) + for (i = 0; i < *nwname; i++) { + if (!(*wnames)[i]) + break; kfree((*wnames)[i]); + } + kfree(*wnames); + *wnames = NULL; } - kfree(*wnames); - *wnames = NULL; } } break; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 60eb9a2b209b..872568cca926 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -118,7 +118,7 @@ struct p9_conn { struct list_head unsent_req_list; struct p9_req_t *rreq; struct p9_req_t *wreq; - char tmp_buf[7]; + char tmp_buf[P9_HDRSZ]; struct p9_fcall rc; int wpos; int wsize; @@ -200,11 +200,15 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { list_move(&req->req_list, &cancel_list); + req->status = REQ_STATUS_ERROR; } list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { list_move(&req->req_list, &cancel_list); + req->status = REQ_STATUS_ERROR; } + spin_unlock(&m->client->lock); + list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); list_del(&req->req_list); @@ -212,7 +216,6 @@ static void p9_conn_cancel(struct p9_conn *m, int err) req->t_err = err; p9_client_cb(m->client, req, REQ_STATUS_ERROR); } - spin_unlock(&m->client->lock); } static __poll_t @@ -288,7 +291,7 @@ static void p9_read_work(struct work_struct *work) if (!m->rc.sdata) { m->rc.sdata = m->tmp_buf; m->rc.offset = 0; - m->rc.capacity = 7; /* start by reading header */ + m->rc.capacity = P9_HDRSZ; /* start by reading header */ } clear_bit(Rpending, &m->wsched); @@ -311,7 +314,7 @@ static void p9_read_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "got new header\n"); /* Header size */ - m->rc.size = 7; + m->rc.size = P9_HDRSZ; err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0); if (err) { p9_debug(P9_DEBUG_ERROR, @@ -820,11 +823,14 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd) goto out_free_ts; if (!(ts->rd->f_mode & FMODE_READ)) goto out_put_rd; + /* prevent workers from hanging on IO when fd is a pipe */ + ts->rd->f_flags |= O_NONBLOCK; ts->wr = fget(wfd); if (!ts->wr) goto out_put_rd; if (!(ts->wr->f_mode & FMODE_WRITE)) goto out_put_wr; + ts->wr->f_flags |= O_NONBLOCK; client->trans = ts; client->status = Connected; @@ -846,8 +852,10 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) struct file *file; p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); - if (!p) + if (!p) { + sock_release(csocket); return -ENOMEM; + } csocket->sk->sk_allocation = GFP_NOIO; file = sock_alloc_file(csocket, 0, NULL); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b21c3c209815..c00e965c082b 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -385,6 +385,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) struct p9_trans_rdma *rdma = client->trans; struct ib_recv_wr wr; struct ib_sge sge; + int ret; c->busa = ib_dma_map_single(rdma->cm_id->device, c->rc.sdata, client->msize, @@ -402,7 +403,12 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) wr.wr_cqe = &c->cqe; wr.sg_list = &sge; wr.num_sge = 1; - return ib_post_recv(rdma->qp, &wr, NULL); + + ret = ib_post_recv(rdma->qp, &wr, NULL); + if (ret) + ib_dma_unmap_single(rdma->cm_id->device, c->busa, + client->msize, DMA_FROM_DEVICE); + return ret; error: p9_debug(P9_DEBUG_ERROR, "EIO\n"); @@ -499,7 +505,7 @@ dont_need_post_recv: if (down_interruptible(&rdma->sq_sem)) { err = -EINTR; - goto send_error; + goto dma_unmap; } /* Mark request as `sent' *before* we actually send it, @@ -509,11 +515,14 @@ dont_need_post_recv: req->status = REQ_STATUS_SENT; err = ib_post_send(rdma->qp, &wr, NULL); if (err) - goto send_error; + goto dma_unmap; /* Success */ return 0; +dma_unmap: + ib_dma_unmap_single(rdma->cm_id->device, c->busa, + c->req->tc.size, DMA_TO_DEVICE); /* Handle errors that happened during or while preparing the send: */ send_error: req->status = REQ_STATUS_ERROR; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index f582351d84ec..36b5f72e2165 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -394,7 +394,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, struct page **in_pages = NULL, **out_pages = NULL; struct virtio_chan *chan = client->trans; struct scatterlist *sgs[4]; - size_t offs; + size_t offs = 0; int need_drop = 0; int kicked = 0; diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 2779ec1053a0..4a16c5dd1576 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -230,6 +230,14 @@ static void p9_xen_response(struct work_struct *work) continue; } + if (h.size > req->rc.capacity) { + dev_warn(&priv->dev->dev, + "requested packet size too big: %d for tag %d with capacity %zd\n", + h.size, h.tag, req->rc.capacity); + req->status = REQ_STATUS_ERROR; + goto recv_error; + } + memcpy(&req->rc, &h, sizeof(h)); req->rc.offset = 0; @@ -239,6 +247,7 @@ static void p9_xen_response(struct work_struct *work) masked_prod, &masked_cons, XEN_9PFS_RING_SIZE); +recv_error: virt_mb(); cons += h.size; ring->intf->in_cons = cons; @@ -290,6 +299,10 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) write_unlock(&xen_9pfs_lock); for (i = 0; i < priv->num_rings; i++) { + struct xen_9pfs_dataring *ring = &priv->rings[i]; + + cancel_work_sync(&ring->work); + if (!priv->rings[i].intf) break; if (priv->rings[i].irq > 0) @@ -380,19 +393,24 @@ out: return ret; } -static int xen_9pfs_front_probe(struct xenbus_device *dev, - const struct xenbus_device_id *id) +static int xen_9pfs_front_init(struct xenbus_device *dev) { int ret, i; struct xenbus_transaction xbt; - struct xen_9pfs_front_priv *priv = NULL; - char *versions; + struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); + char *versions, *v; unsigned int max_rings, max_ring_order, len = 0; versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); if (IS_ERR(versions)) return PTR_ERR(versions); - if (strcmp(versions, "1")) { + for (v = versions; *v; v++) { + if (simple_strtoul(v, &v, 10) == 1) { + v = NULL; + break; + } + } + if (v) { kfree(versions); return -EINVAL; } @@ -405,11 +423,6 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev, if (max_ring_order < XEN_9PFS_RING_ORDER) return -EINVAL; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; - - priv->dev = dev; priv->num_rings = XEN_9PFS_NUM_RINGS; priv->rings = kcalloc(priv->num_rings, sizeof(*priv->rings), GFP_KERNEL); @@ -467,23 +480,35 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev, goto error; } - write_lock(&xen_9pfs_lock); - list_add_tail(&priv->list, &xen_9pfs_devs); - write_unlock(&xen_9pfs_lock); - dev_set_drvdata(&dev->dev, priv); - xenbus_switch_state(dev, XenbusStateInitialised); - return 0; error_xenbus: xenbus_transaction_end(xbt, 1); xenbus_dev_fatal(dev, ret, "writing xenstore"); error: - dev_set_drvdata(&dev->dev, NULL); xen_9pfs_front_free(priv); return ret; } +static int xen_9pfs_front_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct xen_9pfs_front_priv *priv = NULL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = dev; + dev_set_drvdata(&dev->dev, priv); + + write_lock(&xen_9pfs_lock); + list_add_tail(&priv->list, &xen_9pfs_devs); + write_unlock(&xen_9pfs_lock); + + return 0; +} + static int xen_9pfs_front_resume(struct xenbus_device *dev) { dev_warn(&dev->dev, "suspend/resume unsupported\n"); @@ -502,6 +527,8 @@ static void xen_9pfs_front_changed(struct xenbus_device *dev, break; case XenbusStateInitWait: + if (!xen_9pfs_front_init(dev)) + xenbus_switch_state(dev, XenbusStateInitialised); break; case XenbusStateConnected: diff --git a/net/Kconfig b/net/Kconfig index 0b2fecc83452..48ed37cdd22f 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -203,7 +203,6 @@ config BRIDGE_NETFILTER source "net/netfilter/Kconfig" source "net/ipv4/netfilter/Kconfig" source "net/ipv6/netfilter/Kconfig" -source "net/decnet/netfilter/Kconfig" source "net/bridge/netfilter/Kconfig" endif @@ -220,7 +219,6 @@ source "net/802/Kconfig" source "net/bridge/Kconfig" source "net/dsa/Kconfig" source "net/8021q/Kconfig" -source "net/decnet/Kconfig" source "net/llc/Kconfig" source "drivers/net/appletalk/Kconfig" source "net/x25/Kconfig" diff --git a/net/Makefile b/net/Makefile index 449fc0b221f8..177b6fbac29c 100644 --- a/net/Makefile +++ b/net/Makefile @@ -39,7 +39,6 @@ obj-$(CONFIG_AF_KCM) += kcm/ obj-$(CONFIG_STREAM_PARSER) += strparser/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_L2TP) += l2tp/ -obj-$(CONFIG_DECNET) += decnet/ obj-$(CONFIG_PHONET) += phonet/ ifneq ($(CONFIG_VLAN_8021Q),) obj-y += 8021q/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 4610c352849b..70cd5f55628d 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1803,15 +1803,14 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } case TIOCINQ: { - /* - * These two are safe on a single CPU system as only - * user tasks fiddle here - */ - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + struct sk_buff *skb; long amount = 0; + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len - sizeof(struct ddpehdr); + spin_unlock_irq(&sk->sk_receive_queue.lock); rc = put_user(amount, (int __user *)argp); break; } diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index d955b683aa7c..3a2198aabc36 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -71,14 +71,17 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, case SIOCINQ: { struct sk_buff *skb; + int amount; if (sock->state != SS_CONNECTED) { error = -EINVAL; goto done; } + spin_lock_irq(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); - error = put_user(skb ? skb->len : 0, - (int __user *)argp) ? -EFAULT : 0; + amount = skb ? skb->len : 0; + spin_unlock_irq(&sk->sk_receive_queue.lock); + error = put_user(amount, (int __user *)argp) ? -EFAULT : 0; goto done; } case ATM_SETSC: diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index 46d6cd9a36ae..c4e9538ac144 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -222,11 +222,12 @@ static ssize_t proc_mpc_write(struct file *file, const char __user *buff, if (!page) return -ENOMEM; - for (p = page, len = 0; len < nbytes; p++, len++) { + for (p = page, len = 0; len < nbytes; p++) { if (get_user(*p, buff++)) { free_page((unsigned long)page); return -EFAULT; } + len += 1; if (*p == '\0' || *p == '\n') break; } diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 45d8e1d5d033..579b66da1d95 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -393,7 +393,7 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg) * Each PPPoATM instance has its own tasklet - this is just a * prototypical one used to initialize them */ - static const DECLARE_TASKLET(tasklet_proto, pppoatm_wakeup_sender, 0); + static const DECLARE_TASKLET_OLD(tasklet_proto, pppoatm_wakeup_sender); if (copy_from_user(&be, arg, sizeof be)) return -EFAULT; if (be.encaps != PPPOATM_ENCAPS_AUTODETECT && diff --git a/net/atm/resources.c b/net/atm/resources.c index 889349c6d90d..04b2235c5c26 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -443,6 +443,7 @@ done: return error; } +#ifdef CONFIG_PROC_FS void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos) { mutex_lock(&atm_dev_mutex); @@ -458,3 +459,4 @@ void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &atm_devs, pos); } +#endif diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index a39af0eefad3..aae73f94b2c8 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -501,7 +501,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_elp_packet *elp_packet; struct batadv_hard_iface *primary_if; - struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); + struct ethhdr *ethhdr; bool res; int ret = NET_RX_DROP; @@ -509,6 +509,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb, if (!res) goto free_skb; + ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto free_skb; diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 3165f6ff8ee7..f13a779b8656 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -122,8 +122,10 @@ static void batadv_v_ogm_send_to_if(struct sk_buff *skb, { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - if (hard_iface->if_status != BATADV_IF_ACTIVE) + if (hard_iface->if_status != BATADV_IF_ACTIVE) { + kfree_skb(skb); return; + } batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, @@ -994,7 +996,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb, { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_ogm2_packet *ogm_packet; - struct ethhdr *ethhdr = eth_hdr(skb); + struct ethhdr *ethhdr; int ogm_offset; u8 *packet_pos; int ret = NET_RX_DROP; @@ -1008,6 +1010,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb, if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) goto free_skb; + ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto free_skb; diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index ec7bf5a4a9fc..dda62dcd59c8 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -102,7 +102,6 @@ static void batadv_dat_purge(struct work_struct *work); */ static void batadv_dat_start_timer(struct batadv_priv *bat_priv) { - INIT_DELAYED_WORK(&bat_priv->dat.work, batadv_dat_purge); queue_delayed_work(batadv_event_workqueue, &bat_priv->dat.work, msecs_to_jiffies(10000)); } @@ -817,6 +816,7 @@ int batadv_dat_init(struct batadv_priv *bat_priv) if (!bat_priv->dat.hash) return -ENOMEM; + INIT_DELAYED_WORK(&bat_priv->dat.work, batadv_dat_purge); batadv_dat_start_timer(bat_priv); batadv_tvlv_handler_register(bat_priv, batadv_dat_tvlv_ogm_handler_v1, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 5f44c94ad707..073019f2e451 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -632,7 +632,19 @@ out: */ void batadv_update_min_mtu(struct net_device *soft_iface) { - soft_iface->mtu = batadv_hardif_min_mtu(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(soft_iface); + int limit_mtu; + int mtu; + + mtu = batadv_hardif_min_mtu(soft_iface); + + if (bat_priv->mtu_set_by_user) + limit_mtu = bat_priv->mtu_set_by_user; + else + limit_mtu = ETH_DATA_LEN; + + mtu = min(mtu, limit_mtu); + dev_set_mtu(soft_iface, mtu); /* Check if the local translate table should be cleaned up to match a * new (and smaller) MTU. diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index e59c5aa27ee0..f3e102a1201b 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -496,7 +496,10 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]; atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr)); + + rtnl_lock(); batadv_update_min_mtu(bat_priv->soft_iface); + rtnl_unlock(); } if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) { diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 504e3cb67bed..bd06d5b5314e 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -156,11 +156,14 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { + struct batadv_priv *bat_priv = netdev_priv(dev); + /* check ranges */ if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; + bat_priv->mtu_set_by_user = new_mtu; return 0; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 515205d7b650..38930eccd9df 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -780,7 +780,6 @@ check_roaming: if (roamed_back) { batadv_tt_global_free(bat_priv, tt_global, "Roaming canceled"); - tt_global = NULL; } else { /* The global entry has to be marked as ROAMING and * has to be kept for consistency purpose @@ -4191,7 +4190,7 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) spin_lock_bh(&bat_priv->tt.commit_lock); - while (true) { + while (timeout) { table_size = batadv_tt_local_table_transmit_size(bat_priv); if (packet_size_max >= table_size) break; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 4d7f1baee7b7..9fdf1be9b99b 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1564,6 +1564,12 @@ struct batadv_priv { struct net_device *soft_iface; /** + * @mtu_set_by_user: MTU was set once by user + * protected by rtnl_lock + */ + int mtu_set_by_user; + + /** * @bat_counters: mesh internal traffic statistic counters (see * batadv_counters) */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 52fb6d6d6d58..bccad8c048da 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -1002,6 +1002,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type); hci_dev_unlock(hdev); + hci_dev_put(hdev); if (!hcon) return -ENOENT; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 5f508c50649d..8031526eeeee 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -735,7 +735,7 @@ static int __init bt_init(void) err = bt_sysfs_init(); if (err < 0) - return err; + goto cleanup_led; err = sock_register(&bt_sock_family_ops); if (err) @@ -771,6 +771,8 @@ unregister_socket: sock_unregister(PF_BLUETOOTH); cleanup_sysfs: bt_sysfs_cleanup(); +cleanup_led: + bt_leds_cleanup(); return err; } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ee57fa20bac3..d55973cb5b54 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -125,13 +125,11 @@ static void hci_conn_cleanup(struct hci_conn *conn) if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); - hci_conn_del_sysfs(conn); - debugfs_remove_recursive(conn->debugfs); - hci_dev_put(hdev); + hci_conn_del_sysfs(conn); - hci_conn_put(conn); + hci_dev_put(hdev); } static void le_scan_cleanup(struct work_struct *work) @@ -1207,6 +1205,15 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-EOPNOTSUPP); } + /* Reject outgoing connection to device with same BD ADDR against + * CVE-2020-26555 + */ + if (!bacmp(&hdev->bdaddr, dst)) { + bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n", + dst); + return ERR_PTR(-ECONNREFUSED); + } + acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { acl = hci_conn_add(hdev, ACL_LINK, dst, HCI_ROLE_MASTER); @@ -1334,12 +1341,10 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); - /* If we're already encrypted set the REAUTH_PEND flag, - * otherwise set the ENCRYPT_PEND. + /* Set the ENCRYPT_PEND to trigger encryption after + * authentication. */ - if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) - set_bit(HCI_CONN_REAUTH_PEND, &conn->flags); - else + if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); } @@ -1382,34 +1387,41 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, if (!test_bit(HCI_CONN_AUTH, &conn->flags)) goto auth; - /* An authenticated FIPS approved combination key has sufficient - * security for security level 4. */ - if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256 && - sec_level == BT_SECURITY_FIPS) - goto encrypt; - - /* An authenticated combination key has sufficient security for - security level 3. */ - if ((conn->key_type == HCI_LK_AUTH_COMBINATION_P192 || - conn->key_type == HCI_LK_AUTH_COMBINATION_P256) && - sec_level == BT_SECURITY_HIGH) - goto encrypt; - - /* An unauthenticated combination key has sufficient security for - security level 1 and 2. */ - if ((conn->key_type == HCI_LK_UNAUTH_COMBINATION_P192 || - conn->key_type == HCI_LK_UNAUTH_COMBINATION_P256) && - (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW)) - goto encrypt; - - /* A combination key has always sufficient security for the security - levels 1 or 2. High security level requires the combination key - is generated using maximum PIN code length (16). - For pre 2.1 units. */ - if (conn->key_type == HCI_LK_COMBINATION && - (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW || - conn->pin_length == 16)) - goto encrypt; + switch (conn->key_type) { + case HCI_LK_AUTH_COMBINATION_P256: + /* An authenticated FIPS approved combination key has + * sufficient security for security level 4 or lower. + */ + if (sec_level <= BT_SECURITY_FIPS) + goto encrypt; + break; + case HCI_LK_AUTH_COMBINATION_P192: + /* An authenticated combination key has sufficient security for + * security level 3 or lower. + */ + if (sec_level <= BT_SECURITY_HIGH) + goto encrypt; + break; + case HCI_LK_UNAUTH_COMBINATION_P192: + case HCI_LK_UNAUTH_COMBINATION_P256: + /* An unauthenticated combination key has sufficient security + * for security level 2 or lower. + */ + if (sec_level <= BT_SECURITY_MEDIUM) + goto encrypt; + break; + case HCI_LK_COMBINATION: + /* A combination key has always sufficient security for the + * security levels 2 or lower. High security level requires the + * combination key is generated using maximum PIN code length + * (16). For pre 2.1 units. + */ + if (sec_level <= BT_SECURITY_MEDIUM || conn->pin_length == 16) + goto encrypt; + break; + default: + break; + } auth: if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2ebb6480b6ec..c60204b639ab 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1582,6 +1582,7 @@ setup_failed: hdev->flush(hdev); if (hdev->sent_cmd) { + cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } @@ -2128,7 +2129,7 @@ int hci_get_dev_info(void __user *arg) else flags = hdev->flags; - strcpy(di.name, hdev->name); + strscpy(di.name, hdev->name, sizeof(di.name)); di.bdaddr = hdev->bdaddr; di.type = (hdev->bus & 0x0f) | ((hdev->dev_type & 0x03) << 4); di.flags = flags; @@ -2271,6 +2272,7 @@ static void hci_error_reset(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset); + hci_dev_hold(hdev); BT_DBG("%s", hdev->name); if (hdev->hw_error) @@ -2278,10 +2280,10 @@ static void hci_error_reset(struct work_struct *work) else bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); - if (hci_dev_do_close(hdev)) - return; + if (!hci_dev_do_close(hdev)) + hci_dev_do_open(hdev); - hci_dev_do_open(hdev); + hci_dev_put(hdev); } void hci_uuids_clear(struct hci_dev *hdev) @@ -2571,10 +2573,10 @@ int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { - struct smp_ltk *k; + struct smp_ltk *k, *tmp; int removed = 0; - list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { + list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type) continue; @@ -2590,9 +2592,9 @@ int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { - struct smp_irk *k; + struct smp_irk *k, *tmp; - list_for_each_entry_rcu(k, &hdev->identity_resolving_keys, list) { + list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type) continue; @@ -3316,7 +3318,11 @@ int hci_register_dev(struct hci_dev *hdev) if (id < 0) return id; - snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); + error = dev_set_name(&hdev->dev, "hci%u", id); + if (error) + return error; + + hdev->name = dev_name(&hdev->dev); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); @@ -3338,8 +3344,6 @@ int hci_register_dev(struct hci_dev *hdev) if (!IS_ERR_OR_NULL(bt_debugfs)) hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs); - dev_set_name(&hdev->dev, "%s", hdev->name); - error = device_add(&hdev->dev); if (error < 0) goto err_wqueue; @@ -4455,7 +4459,7 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; else *req_complete = bt_cb(skb)->hci.req_complete; - kfree_skb(skb); + dev_kfree_skb_irq(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); } diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 402e2cc54044..7618b33daca9 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -200,10 +200,12 @@ static int conn_info_min_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val > hdev->conn_info_max_age) + hci_dev_lock(hdev); + if (val == 0 || val > hdev->conn_info_max_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_min_age = val; hci_dev_unlock(hdev); @@ -228,10 +230,12 @@ static int conn_info_max_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val < hdev->conn_info_min_age) + hci_dev_lock(hdev); + if (val == 0 || val < hdev->conn_info_min_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_max_age = val; hci_dev_unlock(hdev); @@ -508,10 +512,12 @@ static int sniff_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val > hdev->sniff_max_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val > hdev->sniff_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_min_interval = val; hci_dev_unlock(hdev); @@ -536,10 +542,12 @@ static int sniff_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val < hdev->sniff_min_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val < hdev->sniff_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_max_interval = val; hci_dev_unlock(hdev); @@ -780,10 +788,12 @@ static int conn_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_min_interval = val; hci_dev_unlock(hdev); @@ -808,10 +818,12 @@ static int conn_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_max_interval = val; hci_dev_unlock(hdev); @@ -920,10 +932,12 @@ static int adv_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_min_interval = val; hci_dev_unlock(hdev); @@ -948,10 +962,12 @@ static int adv_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_max_interval = val; hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ff6625493c9f..c00872f0dd2b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -25,6 +25,8 @@ /* Bluetooth HCI event handling. */ #include <asm/unaligned.h> +#include <linux/crypto.h> +#include <crypto/algapi.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> @@ -1783,7 +1785,8 @@ static void hci_cs_inquiry(struct hci_dev *hdev, __u8 status) return; } - set_bit(HCI_INQUIRY, &hdev->flags); + if (hci_sent_cmd_data(hdev, HCI_OP_INQUIRY)) + set_bit(HCI_INQUIRY, &hdev->flags); } static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) @@ -2534,6 +2537,31 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (test_bit(HCI_ENCRYPT, &hdev->flags)) set_bit(HCI_CONN_ENCRYPT, &conn->flags); + /* "Link key request" completed ahead of "connect request" completes */ + if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) && + ev->link_type == ACL_LINK) { + struct link_key *key; + struct hci_cp_read_enc_key_size cp; + + key = hci_find_link_key(hdev, &ev->bdaddr); + if (key) { + set_bit(HCI_CONN_ENCRYPT, &conn->flags); + + if (!(hdev->commands[20] & 0x10)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } else { + cp.handle = cpu_to_le16(conn->handle); + if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, + sizeof(cp), &cp)) { + bt_dev_err(hdev, "sending read key size failed"); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } + } + + hci_encrypt_cfm(conn, ev->status); + } + } + /* Get remote features */ if (conn->type == ACL_LINK) { struct hci_cp_read_remote_features cp; @@ -2594,6 +2622,16 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s bdaddr %pMR type 0x%x", hdev->name, &ev->bdaddr, ev->link_type); + /* Reject incoming connection from device with same BD ADDR against + * CVE-2020-26555 + */ + if (hdev && !bacmp(&hdev->bdaddr, &ev->bdaddr)) { + bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n", + &ev->bdaddr); + hci_reject_conn(hdev, &ev->bdaddr); + return; + } + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type, &flags); @@ -2793,14 +2831,8 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!ev->status) { clear_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); - - if (!hci_conn_ssp_enabled(conn) && - test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) { - bt_dev_info(hdev, "re-auth of legacy device is not possible."); - } else { - set_bit(HCI_CONN_AUTH, &conn->flags); - conn->sec_level = conn->pending_sec_level; - } + set_bit(HCI_CONN_AUTH, &conn->flags); + conn->sec_level = conn->pending_sec_level; } else { if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING) set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); @@ -2809,7 +2841,6 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) } clear_bit(HCI_CONN_AUTH_PEND, &conn->flags); - clear_bit(HCI_CONN_REAUTH_PEND, &conn->flags); if (conn->state == BT_CONFIG) { if (!ev->status && hci_conn_ssp_enabled(conn)) { @@ -2855,8 +2886,6 @@ static void hci_remote_name_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s", hdev->name); - hci_conn_check_pending(hdev); - hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); @@ -3935,6 +3964,15 @@ static void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!conn) goto unlock; + /* Ignore NULL link key against CVE-2020-26555 */ + if (!crypto_memneq(ev->link_key, ZERO_KEY, HCI_LINK_KEY_SIZE)) { + bt_dev_dbg(hdev, "Ignore NULL link key (ZERO KEY) for %pMR", + &ev->bdaddr); + hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); + hci_conn_drop(conn); + goto unlock; + } + hci_conn_hold(conn); conn->disc_timeout = HCI_DISCONN_TIMEOUT; hci_conn_drop(conn); @@ -4177,6 +4215,19 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, struct hci_ev_sync_conn_complete *ev = (void *) skb->data; struct hci_conn *conn; + switch (ev->link_type) { + case SCO_LINK: + case ESCO_LINK: + break; + default: + /* As per Core 5.3 Vol 4 Part E 7.7.35 (p.2219), Link_Type + * for HCI_Synchronous_Connection_Complete is limited to + * either SCO or eSCO + */ + bt_dev_err(hdev, "Ignoring connect complete event for invalid link type"); + return; + } + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); hci_dev_lock(hdev); @@ -4413,8 +4464,8 @@ static u8 bredr_oob_data_present(struct hci_conn *conn) * available, then do not declare that OOB data is * present. */ - if (!memcmp(data->rand256, ZERO_KEY, 16) || - !memcmp(data->hash256, ZERO_KEY, 16)) + if (!crypto_memneq(data->rand256, ZERO_KEY, 16) || + !crypto_memneq(data->hash256, ZERO_KEY, 16)) return 0x00; return 0x02; @@ -4424,8 +4475,8 @@ static u8 bredr_oob_data_present(struct hci_conn *conn) * not supported by the hardware, then check that if * P-192 data values are present. */ - if (!memcmp(data->rand192, ZERO_KEY, 16) || - !memcmp(data->hash192, ZERO_KEY, 16)) + if (!crypto_memneq(data->rand192, ZERO_KEY, 16) || + !crypto_memneq(data->hash192, ZERO_KEY, 16)) return 0x00; return 0x01; @@ -4441,9 +4492,12 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); - if (!conn) + if (!conn || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) goto unlock; + /* Assume remote supports SSP since it has triggered this event */ + set_bit(HCI_CONN_SSP_ENABLED, &conn->flags); + hci_conn_hold(conn); if (!hci_dev_test_flag(hdev, HCI_MGMT)) @@ -4676,7 +4730,7 @@ static void hci_simple_pair_complete_evt(struct hci_dev *hdev, hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); - if (!conn) + if (!conn || !hci_conn_ssp_enabled(conn)) goto unlock; /* Reset the authentication requirement to unknown */ @@ -5738,6 +5792,10 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_UNKNOWN_CONN_ID); + if (max > hcon->le_conn_max_interval) + return send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_INVALID_LL_PARAMS); + if (hci_check_conn_params(min, max, latency, timeout)) return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_INVALID_LL_PARAMS); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 7f3f4ea56d44..9da32a120231 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -107,8 +107,10 @@ static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = result; hdev->req_status = HCI_REQ_DONE; - if (skb) + if (skb) { + kfree_skb(hdev->req_skb); hdev->req_skb = skb_get(skb); + } wake_up_interruptible(&hdev->req_wait_q); } } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index befab857a39b..1d08f7b78fa0 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -430,7 +430,8 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) ni->type = hdev->dev_type; ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); - memcpy(ni->name, hdev->name, 8); + memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name, + strnlen(hdev->name, sizeof(ni->name)), '\0'); opcode = cpu_to_le16(HCI_MON_NEW_INDEX); break; @@ -881,10 +882,6 @@ static int hci_sock_release(struct socket *sock) } sock_orphan(sk); - - skb_queue_purge(&sk->sk_receive_queue); - skb_queue_purge(&sk->sk_write_queue); - release_sock(sk); sock_put(sk); return 0; @@ -977,6 +974,34 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, BT_DBG("cmd %x arg %lx", cmd, arg); + /* Make sure the cmd is valid before doing anything */ + switch (cmd) { + case HCIGETDEVLIST: + case HCIGETDEVINFO: + case HCIGETCONNLIST: + case HCIDEVUP: + case HCIDEVDOWN: + case HCIDEVRESET: + case HCIDEVRESTAT: + case HCISETSCAN: + case HCISETAUTH: + case HCISETENCRYPT: + case HCISETPTYPE: + case HCISETLINKPOL: + case HCISETLINKMODE: + case HCISETACLMTU: + case HCISETSCOMTU: + case HCIINQUIRY: + case HCISETRAW: + case HCIGETCONNINFO: + case HCIGETAUTHINFO: + case HCIBLOCKADDR: + case HCIUNBLOCKADDR: + break; + default: + return -ENOIOCTLCMD; + } + lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { @@ -993,7 +1018,14 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, if (hci_sock_gen_cookie(sk)) { struct sk_buff *skb; - if (capable(CAP_NET_ADMIN)) + /* Perform careful checks before setting the HCI_SOCK_TRUSTED + * flag. Make sure that not only the current task but also + * the socket opener has the required capability, since + * privileged programs can be tricked into making ioctl calls + * on HCI sockets, and the socket should not be marked as + * trusted simply because the ioctl caller is privileged. + */ + if (sk_capable(sk, CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); /* Send event to monitor */ @@ -1985,6 +2017,12 @@ done: return err; } +static void hci_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); +} + static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, @@ -2035,6 +2073,7 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol, sock->state = SS_UNCONNECTED; sk->sk_state = BT_OPEN; + sk->sk_destruct = hci_sock_destruct; bt_sock_link(&hci_sk_list, sk); return 0; diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index b69d88b88d2e..266112c960ee 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -33,7 +33,7 @@ void hci_conn_init_sysfs(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; - BT_DBG("conn %p", conn); + bt_dev_dbg(hdev, "conn %p", conn); conn->dev.type = &bt_link; conn->dev.class = bt_class; @@ -46,24 +46,30 @@ void hci_conn_add_sysfs(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; - BT_DBG("conn %p", conn); + bt_dev_dbg(hdev, "conn %p", conn); + + if (device_is_registered(&conn->dev)) + return; dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); - if (device_add(&conn->dev) < 0) { + if (device_add(&conn->dev) < 0) bt_dev_err(hdev, "failed to register connection device"); - return; - } - - hci_dev_hold(hdev); } void hci_conn_del_sysfs(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; - if (!device_is_registered(&conn->dev)) + bt_dev_dbg(hdev, "conn %p", conn); + + if (!device_is_registered(&conn->dev)) { + /* If device_add() has *not* succeeded, use *only* put_device() + * to drop the reference count. + */ + put_device(&conn->dev); return; + } while (1) { struct device *dev; @@ -75,9 +81,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn) put_device(dev); } - device_del(&conn->dev); - - hci_dev_put(hdev); + device_unregister(&conn->dev); } static void bt_host_release(struct device *dev) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index ac98e3b37ab4..a94ec229ad2e 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -433,7 +433,7 @@ static void hidp_set_timer(struct hidp_session *session) static void hidp_del_timer(struct hidp_session *session) { if (session->idle_to > 0) - del_timer(&session->timer); + del_timer_sync(&session->timer); } static void hidp_process_report(struct hidp_session *session, int type, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 959a16b13303..9c06f5ffd1b5 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -60,6 +60,9 @@ static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err); static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control, struct sk_buff_head *skbs, u8 event); +static void l2cap_retrans_timeout(struct work_struct *work); +static void l2cap_monitor_timeout(struct work_struct *work); +static void l2cap_ack_timeout(struct work_struct *work); static inline u8 bdaddr_type(u8 link_type, u8 bdaddr_type) { @@ -110,7 +113,8 @@ static struct l2cap_chan *__l2cap_get_chan_by_scid(struct l2cap_conn *conn, } /* Find channel with given SCID. - * Returns locked channel. */ + * Returns a reference locked channel. + */ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, u16 cid) { @@ -118,15 +122,19 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_scid(conn, cid); - if (c) - l2cap_chan_lock(c); + if (c) { + /* Only lock if chan reference is not 0 */ + c = l2cap_chan_hold_unless_zero(c); + if (c) + l2cap_chan_lock(c); + } mutex_unlock(&conn->chan_lock); return c; } /* Find channel with given DCID. - * Returns locked channel. + * Returns a reference locked channel. */ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, u16 cid) @@ -135,8 +143,12 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_dcid(conn, cid); - if (c) - l2cap_chan_lock(c); + if (c) { + /* Only lock if chan reference is not 0 */ + c = l2cap_chan_hold_unless_zero(c); + if (c) + l2cap_chan_lock(c); + } mutex_unlock(&conn->chan_lock); return c; @@ -161,8 +173,12 @@ static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn, mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_ident(conn, ident); - if (c) - l2cap_chan_lock(c); + if (c) { + /* Only lock if chan reference is not 0 */ + c = l2cap_chan_hold_unless_zero(c); + if (c) + l2cap_chan_lock(c); + } mutex_unlock(&conn->chan_lock); return c; @@ -462,6 +478,9 @@ struct l2cap_chan *l2cap_chan_create(void) write_unlock(&chan_list_lock); INIT_DELAYED_WORK(&chan->chan_timer, l2cap_chan_timeout); + INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout); + INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout); + INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout); chan->state = BT_OPEN; @@ -496,6 +515,16 @@ void l2cap_chan_hold(struct l2cap_chan *c) kref_get(&c->kref); } +struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c) +{ + BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref)); + + if (!kref_get_unless_zero(&c->kref)) + return NULL; + + return c; +} + void l2cap_chan_put(struct l2cap_chan *c) { BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref)); @@ -1790,11 +1819,11 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr_t *dst, u8 link_type) { - struct l2cap_chan *c, *c1 = NULL; + struct l2cap_chan *c, *tmp, *c1 = NULL; read_lock(&chan_list_lock); - list_for_each_entry(c, &chan_list, global_l) { + list_for_each_entry_safe(c, tmp, &chan_list, global_l) { if (state && c->state != state) continue; @@ -1804,7 +1833,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, if (link_type == LE_LINK && c->src_type == BDADDR_BREDR) continue; - if (c->psm == psm) { + if (c->chan_type != L2CAP_CHAN_FIXED && c->psm == psm) { int src_match, dst_match; int src_any, dst_any; @@ -1812,7 +1841,9 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, src_match = !bacmp(&c->src, src); dst_match = !bacmp(&c->dst, dst); if (src_match && dst_match) { - l2cap_chan_hold(c); + if (!l2cap_chan_hold_unless_zero(c)) + continue; + read_unlock(&chan_list_lock); return c; } @@ -1827,7 +1858,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, } if (c1) - l2cap_chan_hold(c1); + c1 = l2cap_chan_hold_unless_zero(c1); read_unlock(&chan_list_lock); @@ -2495,14 +2526,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (IS_ERR(skb)) return PTR_ERR(skb); - /* Channel lock is released before requesting new skb and then - * reacquired thus we need to recheck channel state. - */ - if (chan->state != BT_CONNECTED) { - kfree_skb(skb); - return -ENOTCONN; - } - l2cap_do_send(chan, skb); return len; } @@ -2546,14 +2569,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (IS_ERR(skb)) return PTR_ERR(skb); - /* Channel lock is released before requesting new skb and then - * reacquired thus we need to recheck channel state. - */ - if (chan->state != BT_CONNECTED) { - kfree_skb(skb); - return -ENOTCONN; - } - l2cap_do_send(chan, skb); err = len; break; @@ -2574,14 +2589,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) */ err = l2cap_segment_sdu(chan, &seg_queue, msg, len); - /* The channel could have been closed while segmenting, - * check that it is still connected. - */ - if (chan->state != BT_CONNECTED) { - __skb_queue_purge(&seg_queue); - err = -ENOTCONN; - } - if (err) break; @@ -3138,10 +3145,6 @@ int l2cap_ertm_init(struct l2cap_chan *chan) chan->rx_state = L2CAP_RX_STATE_RECV; chan->tx_state = L2CAP_TX_STATE_XMIT; - INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout); - INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout); - INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout); - skb_queue_head_init(&chan->srej_q); err = l2cap_seq_list_init(&chan->srej_list, chan->tx_win); @@ -3533,7 +3536,8 @@ done: l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), (unsigned long) &rfc, endptr - ptr); - if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) { + if (remote_efs && + test_bit(FLAG_EFS_ENABLE, &chan->flags)) { chan->remote_id = efs.id; chan->remote_stype = efs.stype; chan->remote_msdu = le16_to_cpu(efs.msdu); @@ -4020,6 +4024,10 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, result = __le16_to_cpu(rsp->result); status = __le16_to_cpu(rsp->status); + if (result == L2CAP_CR_SUCCESS && (dcid < L2CAP_CID_DYN_START || + dcid > L2CAP_CID_DYN_END)) + return -EPROTO; + BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x", dcid, scid, result, status); @@ -4039,12 +4047,23 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, } } + chan = l2cap_chan_hold_unless_zero(chan); + if (!chan) { + err = -EBADSLT; + goto unlock; + } + err = 0; l2cap_chan_lock(chan); switch (result) { case L2CAP_CR_SUCCESS: + if (__l2cap_get_chan_by_dcid(conn, dcid)) { + err = -EBADSLT; + break; + } + l2cap_state_change(chan, BT_CONFIG); chan->ident = 0; chan->dcid = dcid; @@ -4068,6 +4087,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, } l2cap_chan_unlock(chan); + l2cap_chan_put(chan); unlock: mutex_unlock(&conn->chan_lock); @@ -4175,7 +4195,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, chan->ident = cmd->ident; l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, len, rsp); - chan->num_conf_rsp++; + if (chan->num_conf_rsp < L2CAP_CONF_MAX_CONF_RSP) + chan->num_conf_rsp++; /* Reset config buffer. */ chan->conf_len = 0; @@ -4221,6 +4242,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, unlock: l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return err; } @@ -4334,6 +4356,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, done: l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return err; } @@ -4354,33 +4377,29 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid); - mutex_lock(&conn->chan_lock); - - chan = __l2cap_get_chan_by_scid(conn, dcid); + chan = l2cap_get_chan_by_scid(conn, dcid); if (!chan) { - mutex_unlock(&conn->chan_lock); cmd_reject_invalid_cid(conn, cmd->ident, dcid, scid); return 0; } - l2cap_chan_hold(chan); - l2cap_chan_lock(chan); - rsp.dcid = cpu_to_le16(chan->scid); rsp.scid = cpu_to_le16(chan->dcid); l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp); chan->ops->set_shutdown(chan); + l2cap_chan_unlock(chan); + mutex_lock(&conn->chan_lock); + l2cap_chan_lock(chan); l2cap_chan_del(chan, ECONNRESET); + mutex_unlock(&conn->chan_lock); chan->ops->close(chan); l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); - return 0; } @@ -4400,33 +4419,28 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid); - mutex_lock(&conn->chan_lock); - - chan = __l2cap_get_chan_by_scid(conn, scid); + chan = l2cap_get_chan_by_scid(conn, scid); if (!chan) { - mutex_unlock(&conn->chan_lock); return 0; } - l2cap_chan_hold(chan); - l2cap_chan_lock(chan); - if (chan->state != BT_DISCONN) { l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); return 0; } + l2cap_chan_unlock(chan); + mutex_lock(&conn->chan_lock); + l2cap_chan_lock(chan); l2cap_chan_del(chan, 0); + mutex_unlock(&conn->chan_lock); chan->ops->close(chan); l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); - return 0; } @@ -5062,6 +5076,7 @@ send_move_response: l2cap_send_move_chan_rsp(chan, result); l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return 0; } @@ -5154,6 +5169,7 @@ static void l2cap_move_continue(struct l2cap_conn *conn, u16 icid, u16 result) } l2cap_chan_unlock(chan); + l2cap_chan_put(chan); } static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid, @@ -5183,6 +5199,7 @@ static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid, l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED); l2cap_chan_unlock(chan); + l2cap_chan_put(chan); } static int l2cap_move_channel_rsp(struct l2cap_conn *conn, @@ -5246,6 +5263,7 @@ static int l2cap_move_channel_confirm(struct l2cap_conn *conn, l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid); l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return 0; } @@ -5281,6 +5299,7 @@ static inline int l2cap_move_channel_confirm_rsp(struct l2cap_conn *conn, } l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return 0; } @@ -5312,7 +5331,13 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - err = hci_check_conn_params(min, max, latency, to_multiplier); + if (max > hcon->le_conn_max_interval) { + BT_DBG("requested connection interval exceeds current bounds."); + err = -EINVAL; + } else { + err = hci_check_conn_params(min, max, latency, to_multiplier); + } + if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else @@ -5529,6 +5554,19 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, BT_DBG("psm 0x%2.2x scid 0x%4.4x mtu %u mps %u", __le16_to_cpu(psm), scid, mtu, mps); + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A + * page 1059: + * + * Valid range: 0x0001-0x00ff + * + * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges + */ + if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) { + result = L2CAP_CR_LE_BAD_PSM; + chan = NULL; + goto response; + } + /* Check if we have socket listening on psm */ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, &conn->hcon->dst, LE_LINK); @@ -5653,12 +5691,11 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn, if (credits > max_credits) { BT_ERR("LE credits overflow"); l2cap_send_disconn_req(chan, ECONNRESET); - l2cap_chan_unlock(chan); /* Return 0 so that we don't trigger an unnecessary * command reject packet. */ - return 0; + goto unlock; } chan->tx_credits += credits; @@ -5669,7 +5706,9 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn, if (chan->tx_credits) chan->ops->resume(chan); +unlock: l2cap_chan_unlock(chan); + l2cap_chan_put(chan); return 0; } @@ -5690,9 +5729,14 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, if (!chan) goto done; + chan = l2cap_chan_hold_unless_zero(chan); + if (!chan) + goto done; + l2cap_chan_lock(chan); l2cap_chan_del(chan, ECONNREFUSED); l2cap_chan_unlock(chan); + l2cap_chan_put(chan); done: mutex_unlock(&conn->chan_lock); @@ -6231,6 +6275,7 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan, struct l2cap_ctrl *control, struct sk_buff *skb, u8 event) { + struct l2cap_ctrl local_control; int err = 0; bool skb_in_use = false; @@ -6255,15 +6300,32 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan, chan->buffer_seq = chan->expected_tx_seq; skb_in_use = true; + /* l2cap_reassemble_sdu may free skb, hence invalidate + * control, so make a copy in advance to use it after + * l2cap_reassemble_sdu returns and to avoid the race + * condition, for example: + * + * The current thread calls: + * l2cap_reassemble_sdu + * chan->ops->recv == l2cap_sock_recv_cb + * __sock_queue_rcv_skb + * Another thread calls: + * bt_sock_recvmsg + * skb_recv_datagram + * skb_free_datagram + * Then the current thread tries to access control, but + * it was freed by skb_free_datagram. + */ + local_control = *control; err = l2cap_reassemble_sdu(chan, skb, control); if (err) break; - if (control->final) { + if (local_control.final) { if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state)) { - control->final = 0; - l2cap_retransmit_all(chan, control); + local_control.final = 0; + l2cap_retransmit_all(chan, &local_control); l2cap_ertm_send(chan); } } @@ -6643,11 +6705,27 @@ static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, struct sk_buff *skb) { + /* l2cap_reassemble_sdu may free skb, hence invalidate control, so store + * the txseq field in advance to use it after l2cap_reassemble_sdu + * returns and to avoid the race condition, for example: + * + * The current thread calls: + * l2cap_reassemble_sdu + * chan->ops->recv == l2cap_sock_recv_cb + * __sock_queue_rcv_skb + * Another thread calls: + * bt_sock_recvmsg + * skb_recv_datagram + * skb_free_datagram + * Then the current thread tries to access control, but it was freed by + * skb_free_datagram. + */ + u16 txseq = control->txseq; + BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb, chan->rx_state); - if (l2cap_classify_txseq(chan, control->txseq) == - L2CAP_TXSEQ_EXPECTED) { + if (l2cap_classify_txseq(chan, txseq) == L2CAP_TXSEQ_EXPECTED) { l2cap_pass_to_tx(chan, control); BT_DBG("buffer_seq %d->%d", chan->buffer_seq, @@ -6670,8 +6748,8 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, } } - chan->last_acked_seq = control->txseq; - chan->expected_tx_seq = __next_seq(chan, control->txseq); + chan->last_acked_seq = txseq; + chan->expected_tx_seq = __next_seq(chan, txseq); return 0; } @@ -6925,6 +7003,7 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, return; } + l2cap_chan_hold(chan); l2cap_chan_lock(chan); } else { BT_DBG("unknown cid 0x%4.4x", cid); @@ -6983,6 +7062,7 @@ drop: done: l2cap_chan_unlock(chan); + l2cap_chan_put(chan); } static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, @@ -7386,7 +7466,7 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, if (src_type != c->src_type) continue; - l2cap_chan_hold(c); + c = l2cap_chan_hold_unless_zero(c); read_unlock(&chan_list_lock); return c; } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 08e9f332adad..9eea2af9a8e1 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -45,6 +45,7 @@ static const struct proto_ops l2cap_sock_ops; static void l2cap_sock_init(struct sock *sk, struct sock *parent); static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern); +static void l2cap_sock_cleanup_listen(struct sock *parent); bool l2cap_is_socket(struct socket *sock) { @@ -404,7 +405,8 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct l2cap_options opts; struct l2cap_conninfo cinfo; - int len, err = 0; + int err = 0; + size_t len; u32 opt; BT_DBG("sk %p", sk); @@ -449,7 +451,7 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, opts.max_tx = chan->max_tx; opts.txwin_size = chan->tx_win; - len = min_t(unsigned int, len, sizeof(opts)); + len = min(len, sizeof(opts)); if (copy_to_user(optval, (char *) &opts, len)) err = -EFAULT; @@ -499,7 +501,7 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, cinfo.hci_handle = chan->conn->hcon->handle; memcpy(cinfo.dev_class, chan->conn->hcon->dev_class, 3); - len = min_t(unsigned int, len, sizeof(cinfo)); + len = min(len, sizeof(cinfo)); if (copy_to_user(optval, (char *) &cinfo, len)) err = -EFAULT; @@ -1224,6 +1226,7 @@ static int l2cap_sock_release(struct socket *sock) if (!sk) return 0; + l2cap_sock_cleanup_listen(sk); bt_sock_unlink(&l2cap_sk_list, sk); err = l2cap_sock_shutdown(sock, 2); @@ -1433,6 +1436,14 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan, if (!skb) return ERR_PTR(err); + /* Channel lock is released before requesting new skb and then + * reacquired thus we need to recheck channel state. + */ + if (chan->state != BT_CONNECTED) { + kfree_skb(skb); + return ERR_PTR(-ENOTCONN); + } + skb->priority = sk->sk_priority; bt_cb(skb)->l2cap.chan = chan; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 83a8c48dfaa8..596fa3172642 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -594,7 +594,7 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) ret = rfcomm_dlc_send_frag(d, frag); if (ret < 0) { - kfree_skb(frag); + dev_kfree_skb_irq(frag); goto unlock; } @@ -1940,7 +1940,7 @@ static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s) /* Get data directly from socket receive queue without copying it. */ while ((skb = skb_dequeue(&sk->sk_receive_queue))) { skb_orphan(skb); - if (!skb_linearize(skb)) { + if (!skb_linearize(skb) && sk->sk_state != BT_CLOSED) { s = rfcomm_recv_frame(s, skb); if (!s) break; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 78a549e506b1..a3b15b4e176e 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -880,7 +880,8 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, struct sock *sk = sock->sk; struct sco_options opts; struct sco_conninfo cinfo; - int len, err = 0; + int err = 0; + size_t len; BT_DBG("sk %p", sk); @@ -902,7 +903,7 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, BT_DBG("mtu %d", opts.mtu); - len = min_t(unsigned int, len, sizeof(opts)); + len = min(len, sizeof(opts)); if (copy_to_user(optval, (char *)&opts, len)) err = -EFAULT; @@ -920,7 +921,7 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle; memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3); - len = min_t(unsigned int, len, sizeof(cinfo)); + len = min(len, sizeof(cinfo)); if (copy_to_user(optval, (char *)&cinfo, len)) err = -EFAULT; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1153bbcdff72..591d146a5308 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -114,6 +114,7 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); + size = SKB_DATA_ALIGN(size); data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f085b1648e66..501f77f0f480 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -35,6 +35,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) const unsigned char *dest; u16 vid = 0; + memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); + rcu_read_lock(); nf_ops = rcu_dereference(nf_br_ops); if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) { diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 86637000f275..4f8eb83976f1 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -43,7 +43,7 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb skb->protocol == htons(ETH_P_8021AD))) { int depth; - if (!__vlan_get_protocol(skb, skb->protocol, &depth)) + if (!vlan_get_protocol_and_depth(skb, skb->protocol, &depth)) goto drop; skb_set_network_header(skb, depth); @@ -118,7 +118,7 @@ static int deliver_clone(const struct net_bridge_port *prev, skb = skb_clone(skb, GFP_ATOMIC); if (!skb) { - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return -ENOMEM; } @@ -255,7 +255,7 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, skb = skb_copy(skb, GFP_ATOMIC); if (!skb) { - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index e2a999890d05..6b650dfc084d 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -157,8 +157,9 @@ void br_manage_promisc(struct net_bridge *br) * This lets us disable promiscuous mode and write * this config to hw. */ - if (br->auto_cnt == 0 || - (br->auto_cnt == 1 && br_auto_port(p))) + if ((p->dev->priv_flags & IFF_UNICAST_FLT) && + (br->auto_cnt == 0 || + (br->auto_cnt == 1 && br_auto_port(p)))) br_port_clear_promisc(p); else br_port_set_promisc(p); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 464f6a619444..3d07dedd93bd 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -141,12 +141,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst && mdst->host_joined) || br_multicast_is_router(br)) { local_rcv = true; - br->dev->stats.multicast++; + DEV_STATS_INC(br->dev, multicast); } mcast_hit = true; } else { local_rcv = true; - br->dev->stats.multicast++; + DEV_STATS_INC(br->dev, multicast); } break; case BR_PKT_UNICAST: diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 19726d81025d..277b6fb92ac5 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -384,6 +384,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. */ if (rt->dst.dev == dev) { + skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); goto bridged_dnat; } @@ -413,6 +414,7 @@ bridged_dnat: kfree_skb(skb); return 0; } + skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } @@ -866,11 +868,17 @@ static unsigned int ip_sabotage_in(void *priv, { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - if (nf_bridge && !nf_bridge->in_prerouting && - !netif_is_l3_master(skb->dev) && - !netif_is_l3_slave(skb->dev)) { - state->okfn(state->net, state->sk, skb); - return NF_STOLEN; + if (nf_bridge) { + if (nf_bridge->sabotage_in_done) + return NF_ACCEPT; + + if (!nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev) && + !netif_is_l3_slave(skb->dev)) { + nf_bridge->sabotage_in_done = 1; + state->okfn(state->net, state->sk, skb); + return NF_STOLEN; + } } return NF_ACCEPT; diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index e4e0c836c3f5..6b07f30675bb 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -197,6 +197,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc kfree_skb(skb); return 0; } + skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 32bc2821027f..57f91efce0f7 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -36,18 +36,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)&initial_chain, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~(1 << NF_BR_BROUTING)) - return -EINVAL; - return 0; -} - static const struct ebt_table broute_table = { .name = "broute", .table = &initial_table, .valid_hooks = 1 << NF_BR_BROUTING, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index bcf982e12f16..7f2e620f4978 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)initial_chains, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~FILTER_VALID_HOOKS) - return -EINVAL; - return 0; -} - static const struct ebt_table frame_filter = { .name = "filter", .table = &initial_table, .valid_hooks = FILTER_VALID_HOOKS, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 0d092773f816..1743a105485c 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)initial_chains, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~NAT_VALID_HOOKS) - return -EINVAL; - return 0; -} - static const struct ebt_table frame_nat = { .name = "nat", .table = &initial_table, .valid_hooks = NAT_VALID_HOOKS, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index d9375c52f50e..f6853fc0fcc0 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -999,9 +999,10 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, goto free_iterate; } - /* the table doesn't like it */ - if (t->check && (ret = t->check(newinfo, repl->valid_hooks))) + if (repl->valid_hooks != t->valid_hooks) { + ret = -EINVAL; goto free_unlock; + } if (repl->num_counters && repl->num_counters != t->private->nentries) { ret = -EINVAL; @@ -1193,11 +1194,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, if (ret != 0) goto free_chainstack; - if (table->check && table->check(newinfo, table->valid_hooks)) { - ret = -EINVAL; - goto free_chainstack; - } - table->private = newinfo; rwlock_init(&table->lock); mutex_lock(&ebt_mutex); diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index fdbed3158555..d14b2dbbd1df 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -36,7 +36,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, ktime_t tstamp = skb->tstamp; struct ip_frag_state state; struct iphdr *iph; - int err; + int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 7c9e92b2f806..0c28fa4647b7 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -87,9 +87,8 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, return nft_meta_get_init(ctx, expr, tb); } - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static struct nft_expr_type nft_meta_bridge_type; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 8fa98c62c4fc..53f19ee5642f 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -1022,6 +1022,7 @@ static void caif_sock_destructor(struct sock *sk) return; } sk_stream_kill_queues(&cf_sk->sk); + WARN_ON(sk->sk_forward_alloc); caif_free_client(&cf_sk->layer); } diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 46c62dd1479b..862226be2286 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -134,6 +134,9 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, struct usb_device *usbdev; int res; + if (what == NETDEV_UNREGISTER && dev->reg_state >= NETREG_UNREGISTERED) + return 0; + /* Check whether we have a NCM device, and find its VID/PID. */ if (!(dev->dev.parent && dev->dev.parent->driver && strcmp(dev->dev.parent->driver->name, "cdc_ncm") == 0)) diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 2809cbd6b7f7..d8cb4b2a076b 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -269,11 +269,15 @@ int cfctrl_linkup_request(struct cflayer *layer, default: pr_warn("Request setup of bad link type = %d\n", param->linktype); + cfpkt_destroy(pkt); return -EINVAL; } req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) + if (!req) { + cfpkt_destroy(pkt); return -ENOMEM; + } + req->client_layer = user_layer; req->cmd = CFCTRL_CMD_LINK_SETUP; req->param = *param; diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 910f164dd20c..4dfac31f9466 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -314,9 +314,6 @@ static int chnl_net_open(struct net_device *dev) if (result == 0) { pr_debug("connect timeout\n"); - caif_disconnect_client(dev_net(dev), &priv->chnl); - priv->state = CAIF_DISCONNECTED; - pr_debug("state disconnected\n"); result = -ETIMEDOUT; goto error; } diff --git a/net/can/af_can.c b/net/can/af_can.c index c758a12ffe46..b396c23561d6 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -450,7 +450,7 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, /* insert new receiver (dev,canid,mask) -> (func,data) */ - if (dev && dev->type != ARPHRD_CAN) + if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev))) return -ENODEV; if (dev && !net_eq(net, dev_net(dev))) @@ -678,7 +678,7 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU)) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || skb->len != CAN_MTU)) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); goto free_skb; @@ -704,7 +704,7 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU)) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || skb->len != CANFD_MTU)) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); goto free_skb; diff --git a/net/can/bcm.c b/net/can/bcm.c index 63d81147fb4e..5cb4b6129263 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -276,6 +276,7 @@ static void bcm_can_tx(struct bcm_op *op) struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; + int err; /* no target device? => exit */ if (!op->ifindex) @@ -300,11 +301,11 @@ static void bcm_can_tx(struct bcm_op *op) /* send with loopback */ skb->dev = dev; can_skb_set_owner(skb, op->sk); - can_send(skb, 1); + err = can_send(skb, 1); + if (!err) + op->frames_abs++; - /* update statistics */ op->currframe++; - op->frames_abs++; /* reached last frame? */ if (op->currframe >= op->nframes) @@ -937,6 +938,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); + if (err < 0) + goto free_op; if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) @@ -946,12 +949,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, err = -EINVAL; } - if (err < 0) { - if (op->frames != &op->sframe) - kfree(op->frames); - kfree(op); - return err; - } + if (err < 0) + goto free_op; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ @@ -1022,6 +1021,12 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, bcm_tx_start_timer(op); return msg_head->nframes * op->cfsiz + MHSIZ; + +free_op: + if (op->frames != &op->sframe) + kfree(op->frames); + kfree(op); + return err; } /* @@ -1518,6 +1523,12 @@ static int bcm_release(struct socket *sock) lock_sock(sk); +#if IS_ENABLED(CONFIG_PROC_FS) + /* remove procfs entry */ + if (net->can.bcmproc_dir && bo->bcm_proc_read) + remove_proc_entry(bo->procname, net->can.bcmproc_dir); +#endif /* CONFIG_PROC_FS */ + list_for_each_entry_safe(op, next, &bo->tx_ops, list) bcm_remove_op(op); @@ -1553,12 +1564,6 @@ static int bcm_release(struct socket *sock) list_for_each_entry_safe(op, next, &bo->rx_ops, list) bcm_remove_op(op); -#if IS_ENABLED(CONFIG_PROC_FS) - /* remove procfs entry */ - if (net->can.bcmproc_dir && bo->bcm_proc_read) - remove_proc_entry(bo->procname, net->can.bcmproc_dir); -#endif /* CONFIG_PROC_FS */ - /* remove device reference */ if (bo->bound) { bo->bound = 0; diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c index f33c47327927..ca4ad6cdd5cb 100644 --- a/net/can/j1939/address-claim.c +++ b/net/can/j1939/address-claim.c @@ -165,6 +165,46 @@ static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb) * leaving this function. */ ecu = j1939_ecu_get_by_name_locked(priv, name); + + if (ecu && ecu->addr == skcb->addr.sa) { + /* The ISO 11783-5 standard, in "4.5.2 - Address claim + * requirements", states: + * d) No CF shall begin, or resume, transmission on the + * network until 250 ms after it has successfully claimed + * an address except when responding to a request for + * address-claimed. + * + * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim + * prioritization" show that the CF begins the transmission + * after 250 ms from the first AC (address-claimed) message + * even if it sends another AC message during that time window + * to resolve the address contention with another CF. + * + * As stated in "4.4.2.3 - Address-claimed message": + * In order to successfully claim an address, the CF sending + * an address claimed message shall not receive a contending + * claim from another CF for at least 250 ms. + * + * As stated in "4.4.3.2 - NAME management (NM) message": + * 1) A commanding CF can + * d) request that a CF with a specified NAME transmit + * the address-claimed message with its current NAME. + * 2) A target CF shall + * d) send an address-claimed message in response to a + * request for a matching NAME + * + * Taking the above arguments into account, the 250 ms wait is + * requested only during network initialization. + * + * Do not restart the timer on AC message if both the NAME and + * the address match and so if the address has already been + * claimed (timer has expired) or the AC message has been sent + * to resolve the contention with another CF (timer is still + * running). + */ + goto out_ecu_put; + } + if (!ecu && j1939_address_is_unicast(skcb->addr.sa)) ecu = j1939_ecu_create_locked(priv, name); diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index cea712fb2a9e..9ac2a10b1826 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -297,6 +297,7 @@ struct j1939_sock { int ifindex; struct j1939_addr addr; + spinlock_t filters_lock; struct j1939_filter *filters; int nfilters; pgn_t pgn_rx_filter; diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index ca75d1b8f415..9169ef174ff0 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -122,7 +122,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) #define J1939_CAN_ID CAN_EFF_FLAG #define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG) -static DEFINE_SPINLOCK(j1939_netdev_lock); +static DEFINE_MUTEX(j1939_netdev_lock); static struct j1939_priv *j1939_priv_create(struct net_device *ndev) { @@ -216,7 +216,7 @@ static void __j1939_rx_release(struct kref *kref) j1939_can_rx_unregister(priv); j1939_ecu_unmap_all(priv); j1939_priv_set(priv->ndev, NULL); - spin_unlock(&j1939_netdev_lock); + mutex_unlock(&j1939_netdev_lock); } /* get pointer to priv without increasing ref counter */ @@ -244,9 +244,9 @@ static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev) { struct j1939_priv *priv; - spin_lock(&j1939_netdev_lock); + mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); - spin_unlock(&j1939_netdev_lock); + mutex_unlock(&j1939_netdev_lock); return priv; } @@ -256,14 +256,14 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) struct j1939_priv *priv, *priv_new; int ret; - spin_lock(&j1939_netdev_lock); + mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); if (priv) { kref_get(&priv->rx_kref); - spin_unlock(&j1939_netdev_lock); + mutex_unlock(&j1939_netdev_lock); return priv; } - spin_unlock(&j1939_netdev_lock); + mutex_unlock(&j1939_netdev_lock); priv = j1939_priv_create(ndev); if (!priv) @@ -273,29 +273,31 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) spin_lock_init(&priv->j1939_socks_lock); INIT_LIST_HEAD(&priv->j1939_socks); - spin_lock(&j1939_netdev_lock); + mutex_lock(&j1939_netdev_lock); priv_new = j1939_priv_get_by_ndev_locked(ndev); if (priv_new) { /* Someone was faster than us, use their priv and roll * back our's. */ kref_get(&priv_new->rx_kref); - spin_unlock(&j1939_netdev_lock); + mutex_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); return priv_new; } j1939_priv_set(ndev, priv); - spin_unlock(&j1939_netdev_lock); ret = j1939_can_rx_register(priv); if (ret < 0) goto out_priv_put; + mutex_unlock(&j1939_netdev_lock); return priv; out_priv_put: j1939_priv_set(ndev, NULL); + mutex_unlock(&j1939_netdev_lock); + dev_put(ndev); kfree(priv); @@ -304,7 +306,7 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) void j1939_netdev_stop(struct j1939_priv *priv) { - kref_put_lock(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock); + kref_put_mutex(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock); j1939_priv_put(priv); } @@ -332,6 +334,9 @@ int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) /* re-claim the CAN_HDR from the SKB */ cf = skb_push(skb, J1939_CAN_HDR); + /* initialize header structure */ + memset(cf, 0, J1939_CAN_HDR); + /* make it a full can frame again */ skb_put(skb, J1939_CAN_FTR + (8 - dlc)); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 51bfb220fad8..2e3dc74fea21 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -178,7 +178,10 @@ activate_next: if (!first) return; - if (WARN_ON_ONCE(j1939_session_activate(first))) { + if (j1939_session_activate(first)) { + netdev_warn_once(first->priv->ndev, + "%s: 0x%p: Identical session is already activated.\n", + __func__, first); first->err = -EBUSY; goto activate_next; } else { @@ -259,12 +262,17 @@ static bool j1939_sk_match_dst(struct j1939_sock *jsk, static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { - const struct j1939_filter *f = jsk->filters; - int nfilter = jsk->nfilters; + const struct j1939_filter *f; + int nfilter; + + spin_lock_bh(&jsk->filters_lock); + + f = jsk->filters; + nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ - return true; + goto filter_match_found; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) @@ -273,9 +281,15 @@ static bool j1939_sk_match_filter(struct j1939_sock *jsk, continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; - return true; + goto filter_match_found; } + + spin_unlock_bh(&jsk->filters_lock); return false; + +filter_match_found: + spin_unlock_bh(&jsk->filters_lock); + return true; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, @@ -398,6 +412,7 @@ static int j1939_sk_init(struct sock *sk) atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); + spin_lock_init(&jsk->filters_lock); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); @@ -700,9 +715,11 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, } lock_sock(&jsk->sk); + spin_lock_bh(&jsk->filters_lock); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; + spin_unlock_bh(&jsk->filters_lock); release_sock(&jsk->sk); kfree(ofilters); return 0; @@ -795,7 +812,7 @@ static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, struct j1939_sk_buff_cb *skcb; int ret = 0; - if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE)) + if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) @@ -1010,6 +1027,11 @@ void j1939_sk_errqueue(struct j1939_session *session, void j1939_sk_send_loop_abort(struct sock *sk, int err) { + struct j1939_sock *jsk = j1939_sk(sk); + + if (jsk->state & J1939_SOCK_ERRQUEUE) + return; + sk->sk_err = err; sk->sk_error_report(sk); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 22f4b798d385..42fb83605d4c 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -260,6 +260,8 @@ static void __j1939_session_drop(struct j1939_session *session) static void j1939_session_destroy(struct j1939_session *session) { + struct sk_buff *skb; + if (session->err) j1939_sk_errqueue(session, J1939_ERRQUEUE_ABORT); else @@ -270,7 +272,11 @@ static void j1939_session_destroy(struct j1939_session *session) WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry)); WARN_ON_ONCE(!list_empty(&session->active_session_list_entry)); - skb_queue_purge(&session->skb_queue); + while ((skb = skb_dequeue(&session->skb_queue)) != NULL) { + /* drop ref taken in j1939_session_skb_queue() */ + skb_unref(skb); + kfree_skb(skb); + } __j1939_session_drop(session); j1939_priv_put(session->priv); kfree(session); @@ -332,10 +338,12 @@ static void j1939_session_skb_drop_old(struct j1939_session *session) __skb_unlink(do_skb, &session->skb_queue); /* drop ref taken in j1939_session_skb_queue() */ skb_unref(do_skb); + spin_unlock_irqrestore(&session->skb_queue.lock, flags); kfree_skb(do_skb); + } else { + spin_unlock_irqrestore(&session->skb_queue.lock, flags); } - spin_unlock_irqrestore(&session->skb_queue.lock, flags); } void j1939_session_skb_queue(struct j1939_session *session, @@ -592,7 +600,10 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, /* reserve CAN header */ skb_reserve(skb, offsetof(struct can_frame, data)); - memcpy(skb->cb, re_skcb, sizeof(skb->cb)); + /* skb->cb must be large enough to hold a j1939_sk_buff_cb structure */ + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*re_skcb)); + + memcpy(skb->cb, re_skcb, sizeof(*re_skcb)); skcb = j1939_skb_to_cb(skb); if (swap_src_dst) j1939_skbcb_swap(skcb); @@ -1079,10 +1090,6 @@ static bool j1939_session_deactivate(struct j1939_session *session) bool active; j1939_session_list_lock(priv); - /* This function should be called with a session ref-count of at - * least 2. - */ - WARN_ON_ONCE(kref_read(&session->kref) < 2); active = j1939_session_deactivate_locked(session); j1939_session_list_unlock(priv); diff --git a/net/can/raw.c b/net/can/raw.c index bb837019d172..2f500d8a0af2 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -770,6 +770,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); + struct sockcm_cookie sockc; struct sk_buff *skb; struct net_device *dev; int ifindex; @@ -815,11 +816,20 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err < 0) goto free_skb; - skb_setup_tx_timestamp(skb, sk->sk_tsflags); + sockcm_init(&sockc, sk); + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + goto free_skb; + } skb->dev = dev; skb->sk = sk; skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + skb->tstamp = sockc.transmit_time; + + skb_setup_tx_timestamp(skb, sockc.tsflags); err = can_send(skb, ro->loopback); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 49726c378aab..db6320076e23 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -477,8 +477,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", ceph_pr_addr(&con->peer_addr)); con_sock_state_connecting(con); - ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), - O_NONBLOCK); + ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss), + O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", ceph_pr_addr(&con->peer_addr), diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a8481da37f1a..7412bcc94d1d 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3249,17 +3249,24 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) int ret; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - ret = wait_for_completion_interruptible(&lreq->reg_commit_wait); + ret = wait_for_completion_killable(&lreq->reg_commit_wait); return ret ?: lreq->reg_commit_error; } -static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq) +static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq, + unsigned long timeout) { - int ret; + long left; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - ret = wait_for_completion_interruptible(&lreq->notify_finish_wait); - return ret ?: lreq->notify_finish_error; + left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait, + ceph_timeout_jiffies(timeout)); + if (left <= 0) + left = left ?: -ETIMEDOUT; + else + left = lreq->notify_finish_error; /* completed */ + + return left; } /* @@ -4875,7 +4882,8 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (!ret) - ret = linger_notify_finish_wait(lreq); + ret = linger_notify_finish_wait(lreq, + msecs_to_jiffies(2 * timeout * MSEC_PER_SEC)); else dout("lreq %p failed to initiate notify %d\n", lreq, ret); diff --git a/net/core/datagram.c b/net/core/datagram.c index b0488f30f2c4..a5fc44448d60 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -778,18 +778,21 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) + if (READ_ONCE(sk->sk_err) || + !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (sk->sk_shutdown & RCV_SHUTDOWN) + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ @@ -798,10 +801,12 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, /* Connection-based need to check for termination and startup */ if (connection_based(sk)) { - if (sk->sk_state == TCP_CLOSE) + int state = READ_ONCE(sk->sk_state); + + if (state == TCP_CLOSE) mask |= EPOLLHUP; /* connection hasn't started yet? */ - if (sk->sk_state == TCP_SYN_SENT) + if (state == TCP_SYN_SENT) return mask; } diff --git a/net/core/dev.c b/net/core/dev.c index a03036456221..4f39d9720981 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1940,7 +1940,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { - if (ptype->ignore_outgoing) + if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket @@ -2244,6 +2244,8 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, bool active = false; unsigned int nr_ids; + WARN_ON_ONCE(index >= dev->num_tx_queues); + if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; @@ -2735,8 +2737,10 @@ void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) { if (in_irq() || irqs_disabled()) __dev_kfree_skb_irq(skb, reason); + else if (unlikely(reason == SKB_REASON_DROPPED)) + kfree_skb(skb); else - dev_kfree_skb(skb); + consume_skb(skb); } EXPORT_SYMBOL(__dev_kfree_skb_any); @@ -2934,7 +2938,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - return __vlan_get_protocol(skb, type, depth); + return vlan_get_protocol_and_depth(skb, type, depth); } /** @@ -3136,6 +3140,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > dev->gso_max_segs) return features & ~NETIF_F_GSO_MASK; + if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size))) + return features & ~NETIF_F_GSO_MASK; + + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } + /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now @@ -3712,6 +3724,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) bool again = false; skb_reset_mac_header(skb); + skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); @@ -3997,8 +4010,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, u32 next_cpu; u32 ident; - /* First check into global flow table if there is a match */ - ident = sock_flow_table->ents[hash & sock_flow_table->mask]; + /* First check into global flow table if there is a match. + * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). + */ + ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); if ((ident ^ hash) & ~rps_cpu_mask) goto try_rps; @@ -4411,7 +4426,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_rx(skb); @@ -4753,7 +4768,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!netdev_tstamp_prequeue, skb); + net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_receive_skb(skb); @@ -5135,7 +5150,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5165,7 +5180,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); @@ -5892,7 +5907,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = dev_rx_weight; + napi->weight = READ_ONCE(dev_rx_weight); while (again) { struct sk_buff *skb; @@ -6393,8 +6408,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(netdev_budget_usecs); - int budget = netdev_budget; + usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); + int budget = READ_ONCE(netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); @@ -9433,9 +9448,7 @@ void netdev_run_todo(void) BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); -#if IS_ENABLED(CONFIG_DECNET) - WARN_ON(dev->dn_ptr); -#endif + if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) @@ -9460,24 +9473,16 @@ void netdev_run_todo(void) void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { -#if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); - memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); - /* zero out counters that only exist in rtnl_link_stats64 */ - memset((char *)stats64 + sizeof(*netdev_stats), 0, - sizeof(*stats64) - sizeof(*netdev_stats)); -#else - size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); - const unsigned long *src = (const unsigned long *)netdev_stats; + size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); + const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = src[i]; + dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); -#endif } EXPORT_SYMBOL(netdev_stats_to_stats64); diff --git a/net/core/devlink.c b/net/core/devlink.c index 0ac02cab3087..5bd6330ab427 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6299,7 +6299,10 @@ EXPORT_SYMBOL_GPL(devlink_free); static void devlink_port_type_warn(struct work_struct *work) { - WARN(true, "Type was not set for devlink port."); + struct devlink_port *port = container_of(to_delayed_work(work), + struct devlink_port, + type_warn_dw); + dev_warn(port->devlink->dev, "Type was not set for devlink port."); } static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index e8e8389ddc96..feb946c954b6 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -180,7 +180,7 @@ out: } static const struct genl_multicast_group dropmon_mcgrps[] = { - { .name = "events", }, + { .name = "events", .cap_sys_admin = 1 }, }; static void send_dm_alert(struct work_struct *work) @@ -1539,11 +1539,13 @@ static const struct genl_ops dropmon_ops[] = { .cmd = NET_DM_CMD_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_CONFIG_GET, diff --git a/net/core/dst.c b/net/core/dst.c index 193af526e908..107aea25a564 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -81,14 +81,10 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, { struct dst_entry *dst; - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) { - printk_ratelimited(KERN_NOTICE "Route cache is full: " - "consider increasing sysctl " - "net.ipv[4|6].route.max_size.\n"); - return NULL; - } - } + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index cbd1885f2459..9ae38c3e2bf0 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1947,7 +1947,8 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; - WARN_ON_ONCE(!n_stats); + if (WARN_ON_ONCE(!n_stats)) + return -EOPNOTSUPP; if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; diff --git a/net/core/filter.c b/net/core/filter.c index 72bf78032f45..3c4dcdc7217e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2071,8 +2071,17 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, { unsigned int mlen = skb_network_offset(skb); + if (unlikely(skb->len <= mlen)) { + kfree_skb(skb); + return -ERANGE; + } + if (mlen) { __skb_pull(skb, mlen); + if (unlikely(!skb->len)) { + kfree_skb(skb); + return -ERANGE; + } /* At ingress, the mac header has already been pulled once. * At egress, skb_pospull_rcsum has to be done in case that @@ -2092,7 +2101,7 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { /* Verify that a link layer header is carried */ - if (unlikely(skb->mac_header >= skb->network_header)) { + if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { kfree_skb(skb); return -ERANGE; } @@ -2210,6 +2219,22 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) return 0; } +static void sk_msg_reset_curr(struct sk_msg *msg) +{ + u32 i = msg->sg.start; + u32 len = 0; + + do { + len += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); + if (len >= msg->sg.size) + break; + } while (i != msg->sg.end); + + msg->sg.curr = i; + msg->sg.copybreak = 0; +} + static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, @@ -2329,6 +2354,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: + sk_msg_reset_curr(msg); msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; return 0; @@ -2462,6 +2488,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, msg->sg.data[new] = rsge; } + sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } @@ -2633,6 +2660,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); + sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } @@ -2786,15 +2814,18 @@ static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { + void *old_data; + /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; - skb_postpull_rcsum(skb, skb->data + off, len); - memmove(skb->data + len, skb->data, off); + old_data = skb->data; __skb_pull(skb, len); + skb_postpull_rcsum(skb, old_data + off, len); + memmove(skb->data, old_data, off); return 0; } @@ -4605,7 +4636,6 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; - params->ifindex = dev->ifindex; return 0; } @@ -4702,6 +4732,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here @@ -4720,7 +4751,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); } - if (!neigh) + if (!neigh || !(neigh->nud_state & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; return bpf_fib_set_fwd_params(params, neigh, dev); @@ -4827,12 +4858,13 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. */ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); - if (!neigh) + if (!neigh || !(neigh->nud_state & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; return bpf_fib_set_fwd_params(params, neigh, dev); @@ -6196,6 +6228,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 4dac27c98623..5daa72a930a9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1564,8 +1564,7 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb) memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, - &keys, NULL, 0, 0, 0, - FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index bf270b6a99b4..017ed82611a8 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -59,9 +59,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, ret = BPF_OK; } else { skb_reset_mac_header(skb); - ret = skb_do_redirect(skb); - if (ret == 0) - ret = BPF_REDIRECT; + skb_do_redirect(skb); + ret = BPF_REDIRECT; } break; @@ -254,7 +253,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) - return err; + return net_xmit_errno(err); /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8b6140e67e7f..e571007d083c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -224,10 +224,13 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl) { - int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + int max_clean = atomic_read(&tbl->gc_entries) - + READ_ONCE(tbl->gc_thresh2); + u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; + int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); @@ -242,7 +245,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || - time_after(tref, n->updated)) + !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); @@ -250,11 +253,16 @@ static int neigh_forced_gc(struct neigh_table *tbl) shrunk++; if (shrunk >= max_clean) break; + if (++loop == 16) { + if (ktime_get_ns() > tmax) + goto unlock; + loop = 0; + } } } - tbl->last_flush = jiffies; - + WRITE_ONCE(tbl->last_flush, jiffies); +unlock: write_unlock_bh(&tbl->lock); return shrunk; @@ -262,7 +270,17 @@ static int neigh_forced_gc(struct neigh_table *tbl) static void neigh_add_timer(struct neighbour *n, unsigned long when) { + /* Use safe distance from the jiffies - LONG_MAX point while timer + * is running in DELAY/PROBE state but still show to user space + * large times in the past. + */ + unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); + neigh_hold(n); + if (!time_in_range(n->confirmed, mint, jiffies)) + n->confirmed = mint; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); @@ -280,11 +298,26 @@ static int neigh_del_timer(struct neighbour *n) return 0; } -static void pneigh_queue_purge(struct sk_buff_head *list) +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) { + struct sk_buff_head tmp; + unsigned long flags; struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) { + skb_queue_head_init(&tmp); + spin_lock_irqsave(&list->lock, flags); + skb = skb_peek(list); + while (skb != NULL) { + struct sk_buff *skb_next = skb_peek_next(skb, list); + if (net == NULL || net_eq(dev_net(skb->dev), net)) { + __skb_unlink(skb, list); + __skb_queue_tail(&tmp, skb); + } + skb = skb_next; + } + spin_unlock_irqrestore(&list->lock, flags); + + while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } @@ -358,9 +391,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); - - del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue); + pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL); + if (skb_queue_empty_lockless(&tbl->proxy_queue)) + del_timer_sync(&tbl->proxy_timer); return 0; } @@ -384,17 +417,17 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, { struct neighbour *n = NULL; unsigned long now = jiffies; - int entries; + int entries, gc_thresh3; if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; - if (entries >= tbl->gc_thresh3 || - (entries >= tbl->gc_thresh2 && - time_after(now, tbl->last_flush + 5 * HZ))) { - if (!neigh_forced_gc(tbl) && - entries >= tbl->gc_thresh3) { + gc_thresh3 = READ_ONCE(tbl->gc_thresh3); + if (entries >= gc_thresh3 || + (entries >= READ_ONCE(tbl->gc_thresh2) && + time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { + if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); @@ -546,37 +579,6 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, } EXPORT_SYMBOL(neigh_lookup); -struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, - const void *pkey) -{ - struct neighbour *n; - unsigned int key_len = tbl->key_len; - u32 hash_val; - struct neigh_hash_table *nht; - - NEIGH_CACHE_STAT_INC(tbl, lookups); - - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); - hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift); - - for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference_bh(n->next)) { - if (!memcmp(n->primary_key, pkey, key_len) && - net_eq(dev_net(n->dev), net)) { - if (!refcount_inc_not_zero(&n->refcnt)) - n = NULL; - NEIGH_CACHE_STAT_INC(tbl, hits); - break; - } - } - - rcu_read_unlock_bh(); - return n; -} -EXPORT_SYMBOL(neigh_lookup_nodev); - static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, u8 flags, @@ -908,13 +910,14 @@ static void neigh_periodic_work(struct work_struct *work) if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; - tbl->last_rand = jiffies; + + WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } - if (atomic_read(&tbl->entries) < tbl->gc_thresh1) + if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { @@ -933,13 +936,17 @@ static void neigh_periodic_work(struct work_struct *work) goto next_elt; } - if (time_before(n->used, n->confirmed)) + if (time_before(n->used, n->confirmed) && + time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || - time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { - *np = n->next; + !time_in_range_open(jiffies, n->used, + n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); @@ -1741,7 +1748,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl) /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue); + pneigh_queue_purge(&tbl->proxy_queue, NULL); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); @@ -1773,9 +1780,6 @@ static struct neigh_table *neigh_find_table(int family) case AF_INET6: tbl = neigh_tables[NEIGH_ND_TABLE]; break; - case AF_DECnet: - tbl = neigh_tables[NEIGH_DN_TABLE]; - break; } return tbl; @@ -2055,15 +2059,16 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || - nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) || - nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) || - nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) || - nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3)) + nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), + NDTA_PAD) || + nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || + nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || + nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; - long flush_delta = now - tbl->last_flush; - long rand_delta = now - tbl->last_rand; + long flush_delta = now - READ_ONCE(tbl->last_flush); + long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, @@ -2071,7 +2076,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), - .ndtc_proxy_qlen = tbl->proxy_queue.qlen, + .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; rcu_read_lock_bh(); @@ -2094,17 +2099,17 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); - ndst.ndts_allocs += st->allocs; - ndst.ndts_destroys += st->destroys; - ndst.ndts_hash_grows += st->hash_grows; - ndst.ndts_res_failed += st->res_failed; - ndst.ndts_lookups += st->lookups; - ndst.ndts_hits += st->hits; - ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast; - ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; - ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; - ndst.ndts_forced_gc_runs += st->forced_gc_runs; - ndst.ndts_table_fulls += st->table_fulls; + ndst.ndts_allocs += READ_ONCE(st->allocs); + ndst.ndts_destroys += READ_ONCE(st->destroys); + ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); + ndst.ndts_res_failed += READ_ONCE(st->res_failed); + ndst.ndts_lookups += READ_ONCE(st->lookups); + ndst.ndts_hits += READ_ONCE(st->hits); + ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); + ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); + ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); + ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); + ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, @@ -2328,16 +2333,16 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout_tbl_lock; if (tb[NDTA_THRESH1]) - tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); + WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) - tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]); + WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) - tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]); + WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) - tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]); + WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 62a972f04cef..534a53124d14 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -120,6 +120,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) static int ops_init(const struct pernet_operations *ops, struct net *net) { + struct net_generic *ng; int err = -ENOMEM; void *data = NULL; @@ -138,6 +139,12 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) if (!err) return 0; + if (ops->id && ops->size) { + ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&pernet_ops_rwsem)); + ng->ptr[*ops->id] = NULL; + } + cleanup: kfree(data); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 78bbb912e502..9b263a5c0f36 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -136,6 +136,20 @@ static void queue_process(struct work_struct *work) } } +static int netif_local_xmit_active(struct net_device *dev) +{ + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) + return 1; + } + + return 0; +} + static void poll_one_napi(struct napi_struct *napi) { int work; @@ -182,7 +196,10 @@ void netpoll_poll_dev(struct net_device *dev) if (!ni || down_trylock(&ni->dev_lock)) return; - if (!netif_running(dev)) { + /* Some drivers will take the same locks in poll and xmit, + * we can't poll if local CPU is already in xmit. + */ + if (!netif_running(dev) || netif_local_xmit_active(dev)) { up(&ni->dev_lock); return; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 1d20dd70879b..5e9bd9d80b39 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -645,19 +645,19 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_puts(seq, " Flags: "); for (i = 0; i < NR_PKT_FLAGS; i++) { - if (i == F_FLOW_SEQ) + if (i == FLOW_SEQ_SHIFT) if (!pkt_dev->cflows) continue; - if (pkt_dev->flags & (1 << i)) + if (pkt_dev->flags & (1 << i)) { seq_printf(seq, "%s ", pkt_flag_names[i]); - else if (i == F_FLOW_SEQ) - seq_puts(seq, "FLOW_RND "); - #ifdef CONFIG_XFRM - if (i == F_IPSEC && pkt_dev->spi) - seq_printf(seq, "spi:%u", pkt_dev->spi); + if (i == IPSEC_SHIFT && pkt_dev->spi) + seq_printf(seq, "spi:%u ", pkt_dev->spi); #endif + } else if (i == FLOW_SEQ_SHIFT) { + seq_puts(seq, "FLOW_RND "); + } } seq_puts(seq, "\n"); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dbc9b2f53649..ee599636f817 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -922,24 +922,27 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + - nla_total_size(0) + /* nest IFLA_VF_STATS */ - /* IFLA_VF_STATS_RX_PACKETS */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_TX_PACKETS */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_RX_BYTES */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_TX_BYTES */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_BROADCAST */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_MULTICAST */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_RX_DROPPED */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_TX_DROPPED */ - nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + size += num_vfs * + (nla_total_size(0) + /* nest IFLA_VF_STATS */ + /* IFLA_VF_STATS_RX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_BROADCAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_MULTICAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64))); + } return size; } else return 0; @@ -1189,7 +1192,8 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, - struct nlattr *vfinfo) + struct nlattr *vfinfo, + u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; @@ -1279,33 +1283,35 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); - memset(&vf_stats, 0, sizeof(vf_stats)); - if (dev->netdev_ops->ndo_get_vf_stats) - dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, - &vf_stats); - vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); - if (!vfstats) - goto nla_put_vf_failure; - if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, - vf_stats.rx_packets, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, - vf_stats.tx_packets, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, - vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, - vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, - vf_stats.broadcast, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, - vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, - vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { - nla_nest_cancel(skb, vfstats); - goto nla_put_vf_failure; + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, + &vf_stats); + vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); + if (!vfstats) + goto nla_put_vf_failure; + if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfstats); } - nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; @@ -1338,7 +1344,7 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) return -EMSGSIZE; } @@ -1593,6 +1599,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; + struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1610,6 +1617,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; + qdisc = rtnl_dereference(dev->qdisc); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, @@ -1628,8 +1636,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (dev->qdisc && - nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || + (qdisc && + nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + @@ -2026,13 +2034,27 @@ out_err: return err; } -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, - struct netlink_ext_ack *exterr) +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr) { - return nla_parse_deprecated(tb, IFLA_MAX, head, len, ifla_policy, + const struct ifinfomsg *ifmp; + const struct nlattr *attrs; + size_t len; + + ifmp = nla_data(nla_peer); + attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); + len = nla_len(nla_peer) - sizeof(struct ifinfomsg); + + if (ifmp->ifi_index < 0) { + NL_SET_ERR_MSG_ATTR(exterr, nla_peer, + "ifindex can't be negative"); + return -EINVAL; + } + + return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } -EXPORT_SYMBOL(rtnl_nla_parse_ifla); +EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { @@ -3054,9 +3076,12 @@ replay: ifname[0] = '\0'; ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) + if (ifm->ifi_index > 0) { dev = __dev_get_by_index(net, ifm->ifi_index); - else { + } else if (ifm->ifi_index < 0) { + NL_SET_ERR_MSG(extack, "ifindex can't be negative"); + return -EINVAL; + } else { if (ifname[0]) dev = __dev_get_by_name(net, ifname); else @@ -3578,7 +3603,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) + if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) @@ -3592,10 +3617,10 @@ nla_put_failure: return -EMSGSIZE; } -static inline size_t rtnl_fdb_nlmsg_size(void) +static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + - nla_total_size(ETH_ALEN) + /* NDA_LLADDR */ + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } @@ -3607,7 +3632,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC); + skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; @@ -4561,10 +4586,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; - struct nlattr *br_spec, *attr = NULL; + struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; - bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; @@ -4582,13 +4606,17 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; - have_flags = true; + br_flags_attr = attr; flags = nla_get_u16(attr); - break; + } + + if (nla_type(attr) == IFLA_BRIDGE_MODE) { + if (nla_len(attr) < sizeof(u16)) + return -EINVAL; } } } @@ -4626,8 +4654,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, } } - if (have_flags) - memcpy(nla_data(attr), &flags, sizeof(flags)); + if (br_flags_attr) + memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } diff --git a/net/core/scm.c b/net/core/scm.c index 31a38239c92f..a442bf63cd48 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -26,6 +26,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/errqueue.h> +#include <linux/io_uring.h> #include <linux/uaccess.h> @@ -103,6 +104,11 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (fd < 0 || !(file = fget_raw(fd))) return -EBADF; + /* don't allow io_uring files */ + if (io_is_uring_fops(file)) { + fput(file); + return -EINVAL; + } *fpp++ = file; fpl->count++; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5bdb3cd20d61..82be36c87eb6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2115,6 +2115,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) insp = list; } else { /* Eaten partially. */ + if (skb_is_gso(skb) && !list->head_frag && + skb_headlen(list)) + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; if (skb_shared(list)) { /* Sucks! We need to fork list. :-( */ @@ -3683,40 +3686,41 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; - skb_frag_t *frag = skb_shinfo(head_skb)->frags; unsigned int mss = skb_shinfo(head_skb)->gso_size; unsigned int doffset = head_skb->data - skb_mac_header(head_skb); - struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int partial_segs = 0; unsigned int headroom; unsigned int len = head_skb->len; + struct sk_buff *frag_skb; + skb_frag_t *frag; __be16 proto; bool csum, sg; - int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int i = 0; - int pos; + int nfrags, pos; int dummy; - if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && - (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { - /* gso_size is untrusted, and we have a frag_list with a linear - * non head_frag head. - * - * (we assume checking the first list_skb member suffices; - * i.e if either of the list_skb members have non head_frag - * head, then the first one has too). - * - * If head_skb's headlen does not fit requested gso_size, it - * means that the frag_list members do NOT terminate on exact - * gso_size boundaries. Hence we cannot perform skb_frag_t page - * sharing. Therefore we must fallback to copying the frag_list - * skbs; we do so by disabling SG. - */ - if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) - features &= ~NETIF_F_SG; + if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && + mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { + struct sk_buff *check_skb; + + for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { + if (skb_headlen(check_skb) && !check_skb->head_frag) { + /* gso_size is untrusted, and we have a frag_list with + * a linear non head_frag item. + * + * If head_skb's headlen does not fit requested gso_size, + * it means that the frag_list members do NOT terminate + * on exact gso_size boundaries. Hence we cannot perform + * skb_frag_t page sharing. Therefore we must fallback to + * copying the frag_list skbs; we do so by disabling SG. + */ + features &= ~NETIF_F_SG; + break; + } + } } __skb_push(head_skb, doffset); @@ -3761,8 +3765,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, /* GSO partial only requires that we trim off any excess that * doesn't fit into an MSS sized block, so take care of that * now. + * Cap len to not accidentally hit GSO_BY_FRAGS. */ - partial_segs = len / mss; + partial_segs = min(len, GSO_BY_FRAGS - 1U) / mss; if (partial_segs > 1) mss *= partial_segs; else @@ -3773,6 +3778,13 @@ normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); + if (skb_orphan_frags(head_skb, GFP_ATOMIC)) + return ERR_PTR(-ENOMEM); + + nfrags = skb_shinfo(head_skb)->nr_frags; + frag = skb_shinfo(head_skb)->frags; + frag_skb = head_skb; + do { struct sk_buff *nskb; skb_frag_t *nskb_frag; @@ -3797,6 +3809,10 @@ normal: (skb_headlen(list_skb) == len || sg)) { BUG_ON(skb_headlen(list_skb) > len); + nskb = skb_clone(list_skb, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; @@ -3815,12 +3831,8 @@ normal: frag++; } - nskb = skb_clone(list_skb, GFP_ATOMIC); list_skb = list_skb->next; - if (unlikely(!nskb)) - goto err; - if (unlikely(pskb_trim(nskb, len))) { kfree_skb(nskb); goto err; @@ -3885,12 +3897,16 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; - if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || - skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) goto err; while (pos < offset + len) { if (i >= nfrags) { + if (skb_orphan_frags(list_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, list_skb, + GFP_ATOMIC)) + goto err; + i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; @@ -3904,10 +3920,6 @@ normal: i--; frag--; } - if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || - skb_zerocopy_clone(nskb, frag_skb, - GFP_ATOMIC)) - goto err; list_skb = list_skb->next; } @@ -4564,7 +4576,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(sysctl_tstamp_allow_data || tsonly)) + if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) return true; read_lock_bh(&sk->sk_callback_lock); @@ -4627,6 +4639,11 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, skb = alloc_skb(0, GFP_ATOMIC); } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + + if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { + kfree_skb(skb); + return; + } } if (!skb) return; diff --git a/net/core/sock.c b/net/core/sock.c index c84f68bff7f5..daeb0e69c71b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -545,7 +545,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_tx_queue_clear(sk); - sk->sk_dst_pending_confirm = 0; + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; @@ -701,7 +701,8 @@ bool sk_mc_loop(struct sock *sk) return false; if (!sk) return true; - switch (sk->sk_family) { + /* IPV6_ADDRFORM can change sk->sk_family under us. */ + switch (READ_ONCE(sk->sk_family)) { case AF_INET: return inet_sk(sk)->mc_loop; #if IS_ENABLED(CONFIG_IPV6) @@ -1117,7 +1118,8 @@ set_rcvbuf: cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - sk->sk_max_pacing_rate = ulval; + /* Pairs with READ_ONCE() from sk_getsockopt() */ + WRITE_ONCE(sk->sk_max_pacing_rate, ulval); sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); break; } @@ -1257,11 +1259,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_SNDBUF: - v.val = sk->sk_sndbuf; + v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: - v.val = sk->sk_rcvbuf; + v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: @@ -1349,7 +1351,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_RCVLOWAT: - v.val = sk->sk_rcvlowat; + v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: @@ -1443,7 +1445,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (!sock->ops->set_peek_off) return -EOPNOTSUPP; - v.val = sk->sk_peek_off; + v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); @@ -1473,17 +1475,19 @@ int sock_getsockopt(struct socket *sock, int level, int optname, #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: - v.val = sk->sk_ll_usec; + v.val = READ_ONCE(sk->sk_ll_usec); break; #endif case SO_MAX_PACING_RATE: + /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); - v.ulval = sk->sk_max_pacing_rate; + v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ - v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); + v.val = min_t(unsigned long, ~0U, + READ_ONCE(sk->sk_max_pacing_rate)); } break; @@ -1938,7 +1942,6 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; @@ -1953,6 +1956,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } } sk->sk_gso_max_segs = max_segs; + sk_dst_set(sk, dst); } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -2085,13 +2089,24 @@ kuid_t sock_i_uid(struct sock *sk) } EXPORT_SYMBOL(sock_i_uid); -unsigned long sock_i_ino(struct sock *sk) +unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; - read_lock_bh(&sk->sk_callback_lock); + read_lock(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock_bh(&sk->sk_callback_lock); + read_unlock(&sk->sk_callback_lock); + return ino; +} +EXPORT_SYMBOL(__sock_i_ino); + +unsigned long sock_i_ino(struct sock *sk) +{ + unsigned long ino; + + local_bh_disable(); + ino = __sock_i_ino(sk); + local_bh_enable(); return ino; } EXPORT_SYMBOL(sock_i_ino); @@ -2208,9 +2223,9 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; - if (sk->sk_err) + if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } @@ -2238,7 +2253,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, goto failure; err = -EPIPE; - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) @@ -2288,6 +2303,7 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; @@ -2617,7 +2633,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); - if (sk_under_memory_pressure(sk) && + if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } @@ -2638,7 +2654,7 @@ EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { - sk->sk_peek_off = val; + WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); @@ -2882,7 +2898,14 @@ void sk_stop_timer(struct sock *sk, struct timer_list* timer) } EXPORT_SYMBOL(sk_stop_timer); -void sock_init_data(struct socket *sock, struct sock *sk) +void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) +{ + if (del_timer_sync(timer)) + __sock_put(sk); +} +EXPORT_SYMBOL(sk_stop_timer_sync); + +void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; @@ -2901,11 +2924,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; - sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); - sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); } + sk->sk_uid = uid; rwlock_init(&sk->sk_callback_lock); if (sk->sk_kern_sock) @@ -2946,7 +2968,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_busy_read; + sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; @@ -2963,6 +2985,16 @@ void sock_init_data(struct socket *sock, struct sock *sk) refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } +EXPORT_SYMBOL(sock_init_data_uid); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + kuid_t uid = sock ? + SOCK_INODE(sock)->i_uid : + make_kuid(sock_net(sk)->user_ns, 0); + + sock_init_data_uid(sock, sk, uid); +} EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index c13ffbd33d8d..315a5200d170 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -188,7 +188,7 @@ int sock_diag_register(const struct sock_diag_handler *hndl) if (sock_diag_handlers[hndl->family]) err = -EBUSY; else - sock_diag_handlers[hndl->family] = hndl; + WRITE_ONCE(sock_diag_handlers[hndl->family], hndl); mutex_unlock(&sock_diag_table_mutex); return err; @@ -204,7 +204,7 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld) mutex_lock(&sock_diag_table_mutex); BUG_ON(sock_diag_handlers[family] != hnld); - sock_diag_handlers[family] = NULL; + WRITE_ONCE(sock_diag_handlers[family], NULL); mutex_unlock(&sock_diag_table_mutex); } EXPORT_SYMBOL_GPL(sock_diag_unregister); @@ -222,7 +222,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX); - if (sock_diag_handlers[req->sdiag_family] == NULL) + if (READ_ONCE(sock_diag_handlers[req->sdiag_family]) == NULL) sock_load_diag_module(req->sdiag_family, 0); mutex_lock(&sock_diag_table_mutex); @@ -281,12 +281,12 @@ static int sock_diag_bind(struct net *net, int group) switch (group) { case SKNLGRP_INET_TCP_DESTROY: case SKNLGRP_INET_UDP_DESTROY: - if (!sock_diag_handlers[AF_INET]) + if (!READ_ONCE(sock_diag_handlers[AF_INET])) sock_load_diag_module(AF_INET, 0); break; case SKNLGRP_INET6_TCP_DESTROY: case SKNLGRP_INET6_UDP_DESTROY: - if (!sock_diag_handlers[AF_INET6]) + if (!READ_ONCE(sock_diag_handlers[AF_INET6])) sock_load_diag_module(AF_INET6, 0); break; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 2646e8f98f67..f9e9212ff7e5 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -115,7 +115,6 @@ static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); - preempt_disable(); rcu_read_lock(); } @@ -123,7 +122,6 @@ static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); - preempt_enable(); release_sock(sk); } @@ -279,11 +277,13 @@ static void sock_map_free(struct bpf_map *map) sk = xchg(psk, NULL); if (sk) { + sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); + sock_put(sk); } } @@ -321,6 +321,9 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock *sk; int err = 0; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + raw_spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) @@ -654,6 +657,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_htab_elem *elem; int ret = -ENOENT; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); diff --git a/net/core/stream.c b/net/core/stream.c index a166a32b411f..cd60746877b1 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -159,7 +159,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - remove_wait_queue(sk_sleep(sk), &wait); + if (!sock_flag(sk, SOCK_DEAD)) + remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: @@ -195,6 +196,12 @@ void sk_stream_kill_queues(struct sock *sk) /* First the read buffer. */ __skb_queue_purge(&sk->sk_receive_queue); + /* Next, the error queue. + * We need to use queue lock, because other threads might + * add packets to the queue without socket lock being held. + */ + skb_queue_purge(&sk->sk_error_queue); + /* Next, the write queue. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); @@ -202,7 +209,6 @@ void sk_stream_kill_queues(struct sock *sk) sk_mem_reclaim(sk); WARN_ON(sk->sk_wmem_queued); - WARN_ON(sk->sk_forward_alloc); /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 48041f50ecfb..586598887095 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -238,14 +238,17 @@ static int set_default_qdisc(struct ctl_table *table, int write, static int proc_do_dev_weight(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; + static DEFINE_MUTEX(dev_weight_mutex); + int ret, weight; + mutex_lock(&dev_weight_mutex); ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret != 0) - return ret; - - dev_rx_weight = weight_p * dev_weight_rx_bias; - dev_tx_weight = weight_p * dev_weight_tx_bias; + if (!ret && write) { + weight = READ_ONCE(weight_p); + WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + } + mutex_unlock(&dev_weight_mutex); return ret; } diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index b53d5e1d026f..71e97e2a3684 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -946,7 +946,7 @@ static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX, - tb[DCB_ATTR_BCN], dcbnl_pfc_up_nest, + tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index cb818617699c..8c9a63ef7585 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -288,6 +288,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len); +void dccp_destruct_common(struct sock *sk); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); void dccp_destroy_sock(struct sock *sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 7cf903f9e29a..944cc34f707d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -130,6 +130,8 @@ failure: * This unhashes the socket and releases the local port, if necessary. */ dccp_set_state(sk, DCCP_CLOSED); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; @@ -241,12 +243,12 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) int err; struct net *net = dev_net(skb->dev); - /* Only need dccph_dport & dccph_sport which are the first - * 4 bytes in dccp header. - * Our caller (icmp_socket_deliver()) already pulled 8 bytes for us. - */ - BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8); - BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8); + if (!pskb_may_pull(skb, offset + sizeof(*dh))) + return -EINVAL; + dh = (struct dccp_hdr *)(skb->data + offset); + if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) + return -EINVAL; + iph = (struct iphdr *)skb->data; dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet_lookup_established(net, &dccp_hashinfo, @@ -609,9 +611,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_parse_options(sk, dreq, skb)) goto drop_and_free; - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - ireq = inet_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); @@ -619,6 +618,9 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ireq_family = AF_INET; ireq->ir_iif = sk->sk_bound_dev_if; + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + /* * Step 3: Process LISTEN state * diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 7c24927e9c2c..c3a378ef95aa 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -67,7 +67,7 @@ static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; + const struct ipv6hdr *hdr; const struct dccp_hdr *dh; struct dccp_sock *dp; struct ipv6_pinfo *np; @@ -76,12 +76,12 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, __u64 seq; struct net *net = dev_net(skb->dev); - /* Only need dccph_dport & dccph_sport which are the first - * 4 bytes in dccp header. - * Our caller (icmpv6_notify()) already pulled 8 bytes for us. - */ - BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8); - BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8); + if (!pskb_may_pull(skb, offset + sizeof(*dh))) + return -EINVAL; + dh = (struct dccp_hdr *)(skb->data + offset); + if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) + return -EINVAL; + hdr = (const struct ipv6hdr *)skb->data; dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet6_lookup_established(net, &dccp_hashinfo, @@ -349,15 +349,15 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_parse_options(sk, dreq, skb)) goto drop_and_free; - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ireq->ireq_family = AF_INET6; ireq->ir_mark = inet_request_mark(sk, skb); + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { @@ -541,11 +541,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) - skb_set_owner_r(newnp->pktoptions, newsk); } return newsk; @@ -605,7 +603,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, GFP_ATOMIC); + opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == DCCP_OPEN) { /* Fast path */ if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) @@ -669,7 +667,6 @@ ipv6_pktoptions: np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &DCCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); memmove(IP6CB(opt_skb), &DCCP_SKB_CB(opt_skb)->header.h6, sizeof(struct inet6_skb_parm)); @@ -957,6 +954,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: dccp_set_state(sk, DCCP_CLOSED); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); __sk_dst_reset(sk); failure: inet->inet_dport = 0; @@ -1001,6 +1000,12 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { #endif }; +static void dccp_v6_sk_destruct(struct sock *sk) +{ + dccp_destruct_common(sk); + inet6_sock_destruct(sk); +} + /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ @@ -1013,17 +1018,12 @@ static int dccp_v6_init_sock(struct sock *sk) if (unlikely(!dccp_v6_ctl_sock_initialized)) dccp_v6_ctl_sock_initialized = 1; inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + sk->sk_destruct = dccp_v6_sk_destruct; } return err; } -static void dccp_v6_destroy_sock(struct sock *sk) -{ - dccp_destroy_sock(sk); - inet6_destroy_sock(sk); -} - static struct timewait_sock_ops dccp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct dccp6_timewait_sock), }; @@ -1046,7 +1046,7 @@ static struct proto dccp_v6_prot = { .accept = inet_csk_accept, .get_port = inet_csk_get_port, .shutdown = dccp_shutdown, - .destroy = dccp_v6_destroy_sock, + .destroy = dccp_destroy_sock, .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), diff --git a/net/dccp/options.c b/net/dccp/options.c index 3b42f5c6a63d..9fed0ae21e63 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -56,7 +56,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, (dh->dccph_doff * 4); struct dccp_options_received *opt_recv = &dp->dccps_options_received; unsigned char opt, len; - unsigned char *uninitialized_var(value); + unsigned char *value; u32 elapsed_time; __be32 opt_val; int rc; diff --git a/net/dccp/output.c b/net/dccp/output.c index 6433187a5cc4..7490b4f09bbb 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -185,7 +185,7 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; - dp->dccps_mss_cache = cur_mps; + WRITE_ONCE(dp->dccps_mss_cache, cur_mps); return cur_mps; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index cb61a9d281f6..491b148afa8f 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -171,12 +171,18 @@ const char *dccp_packet_name(const int type) EXPORT_SYMBOL_GPL(dccp_packet_name); -static void dccp_sk_destruct(struct sock *sk) +void dccp_destruct_common(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = NULL; +} +EXPORT_SYMBOL_GPL(dccp_destruct_common); + +static void dccp_sk_destruct(struct sock *sk) +{ + dccp_destruct_common(sk); inet_sock_destruct(sk); } @@ -318,11 +324,15 @@ EXPORT_SYMBOL_GPL(dccp_disconnect); __poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait) { - __poll_t mask; struct sock *sk = sock->sk; + __poll_t mask; + u8 shutdown; + int state; sock_poll_wait(file, sock, wait); - if (sk->sk_state == DCCP_LISTEN) + + state = inet_sk_state_load(sk); + if (state == DCCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events @@ -331,20 +341,21 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, */ mask = 0; - if (sk->sk_err) + if (READ_ONCE(sk->sk_err)) mask = EPOLLERR; + shutdown = READ_ONCE(sk->sk_shutdown); - if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED) + if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected? */ - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { + if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { if (atomic_read(&sk->sk_rmem_alloc) > 0) mask |= EPOLLIN | EPOLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (!(shutdown & SEND_SHUTDOWN)) { if (sk_stream_is_writeable(sk)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ @@ -362,7 +373,6 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, } return mask; } - EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) @@ -638,7 +648,7 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, return dccp_getsockopt_service(sk, len, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: - val = dp->dccps_mss_cache; + val = READ_ONCE(dp->dccps_mss_cache); break; case DCCP_SOCKOPT_AVAILABLE_CCIDS: return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); @@ -760,16 +770,11 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) trace_dccp_probe(sk, len); - if (len > dp->dccps_mss_cache) + if (len > READ_ONCE(dp->dccps_mss_cache)) return -EMSGSIZE; lock_sock(sk); - if (dccp_qpolicy_full(sk)) { - rc = -EAGAIN; - goto out_release; - } - timeo = sock_sndtimeo(sk, noblock); /* @@ -788,11 +793,22 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (skb == NULL) goto out_release; + if (dccp_qpolicy_full(sk)) { + rc = -EAGAIN; + goto out_discard; + } + if (sk->sk_state == DCCP_CLOSED) { rc = -ENOTCONN; goto out_discard; } + /* We need to check dccps_mss_cache after socket is locked. */ + if (len > dp->dccps_mss_cache) { + rc = -EMSGSIZE; + goto out_discard; + } + skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig deleted file mode 100644 index 0935453ccfd5..000000000000 --- a/net/decnet/Kconfig +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# DECnet configuration -# -config DECNET - tristate "DECnet Support" - ---help--- - The DECnet networking protocol was used in many products made by - Digital (now Compaq). It provides reliable stream and sequenced - packet communications over which run a variety of services similar - to those which run over TCP/IP. - - To find some tools to use with the kernel layer support, please - look at Patrick Caulfield's web site: - <http://linux-decnet.sourceforge.net/>. - - More detailed documentation is available in - <file:Documentation/networking/decnet.txt>. - - Be sure to say Y to "/proc file system support" and "Sysctl support" - below when using DECnet, since you will need sysctl support to aid - in configuration at run time. - - The DECnet code is also available as a module ( = code which can be - inserted in and removed from the running kernel whenever you want). - The module is called decnet. - -config DECNET_ROUTER - bool "DECnet: router support" - depends on DECNET - select FIB_RULES - ---help--- - Add support for turning your DECnet Endnode into a level 1 or 2 - router. This is an experimental, but functional option. If you - do say Y here, then make sure that you also say Y to "Kernel/User - network link driver", "Routing messages" and "Network packet - filtering". The first two are required to allow configuration via - rtnetlink (you will need Alexey Kuznetsov's iproute2 package - from <ftp://ftp.tux.org/pub/net/ip-routing/>). The "Network packet - filtering" option will be required for the forthcoming routing daemon - to work. - - See <file:Documentation/networking/decnet.txt> for more information. diff --git a/net/decnet/Makefile b/net/decnet/Makefile deleted file mode 100644 index 07b38e441b2d..000000000000 --- a/net/decnet/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -obj-$(CONFIG_DECNET) += decnet.o - -decnet-y := af_decnet.o dn_nsp_in.o dn_nsp_out.o \ - dn_route.o dn_dev.o dn_neigh.o dn_timer.o -decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o -decnet-y += sysctl_net_decnet.o - -obj-$(CONFIG_NETFILTER) += netfilter/ diff --git a/net/decnet/README b/net/decnet/README deleted file mode 100644 index 60e7ec88c81f..000000000000 --- a/net/decnet/README +++ /dev/null @@ -1,8 +0,0 @@ - Linux DECnet Project - ====================== - -The documentation for this kernel subsystem is available in the -Documentation/networking subdirectory of this distribution and also -on line at http://www.chygwyn.com/DECnet/ - -Steve Whitehouse <SteveW@ACM.org> diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c deleted file mode 100644 index b9b847dc097c..000000000000 --- a/net/decnet/af_decnet.c +++ /dev/null @@ -1,2400 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Socket Layer Interface - * - * Authors: Eduardo Marcelo Serrat <emserrat@geocities.com> - * Patrick Caulfield <patrick@pandh.demon.co.uk> - * - * Changes: - * Steve Whitehouse: Copied from Eduardo Serrat and Patrick Caulfield's - * version of the code. Original copyright preserved - * below. - * Steve Whitehouse: Some bug fixes, cleaning up some code to make it - * compatible with my routing layer. - * Steve Whitehouse: Merging changes from Eduardo Serrat and Patrick - * Caulfield. - * Steve Whitehouse: Further bug fixes, checking module code still works - * with new routing layer. - * Steve Whitehouse: Additional set/get_sockopt() calls. - * Steve Whitehouse: Fixed TIOCINQ ioctl to be same as Eduardo's new - * code. - * Steve Whitehouse: recvmsg() changed to try and behave in a POSIX like - * way. Didn't manage it entirely, but its better. - * Steve Whitehouse: ditto for sendmsg(). - * Steve Whitehouse: A selection of bug fixes to various things. - * Steve Whitehouse: Added TIOCOUTQ ioctl. - * Steve Whitehouse: Fixes to username2sockaddr & sockaddr2username. - * Steve Whitehouse: Fixes to connect() error returns. - * Patrick Caulfield: Fixes to delayed acceptance logic. - * David S. Miller: New socket locking - * Steve Whitehouse: Socket list hashing/locking - * Arnaldo C. Melo: use capable, not suser - * Steve Whitehouse: Removed unused code. Fix to use sk->allocation - * when required. - * Patrick Caulfield: /proc/net/decnet now has object name/number - * Steve Whitehouse: Fixed local port allocation, hashed sk list - * Matthew Wilcox: Fixes for dn_ioctl() - * Steve Whitehouse: New connect/accept logic to allow timeouts and - * prepare for sendpage etc. - */ - - -/****************************************************************************** - (c) 1995-1998 E.M. Serrat emserrat@geocities.com - - -HISTORY: - -Version Kernel Date Author/Comments -------- ------ ---- --------------- -Version 0.0.1 2.0.30 01-dic-97 Eduardo Marcelo Serrat - (emserrat@geocities.com) - - First Development of DECnet Socket La- - yer for Linux. Only supports outgoing - connections. - -Version 0.0.2 2.1.105 20-jun-98 Patrick J. Caulfield - (patrick@pandh.demon.co.uk) - - Port to new kernel development version. - -Version 0.0.3 2.1.106 25-jun-98 Eduardo Marcelo Serrat - (emserrat@geocities.com) - _ - Added support for incoming connections - so we can start developing server apps - on Linux. - - - Module Support -Version 0.0.4 2.1.109 21-jul-98 Eduardo Marcelo Serrat - (emserrat@geocities.com) - _ - Added support for X11R6.4. Now we can - use DECnet transport for X on Linux!!! - - -Version 0.0.5 2.1.110 01-aug-98 Eduardo Marcelo Serrat - (emserrat@geocities.com) - Removed bugs on flow control - Removed bugs on incoming accessdata - order - - -Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat - dn_recvmsg fixes - - Patrick J. Caulfield - dn_bind fixes -*******************************************************************************/ - -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/kernel.h> -#include <linux/sched/signal.h> -#include <linux/timer.h> -#include <linux/string.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/inet.h> -#include <linux/route.h> -#include <linux/netfilter.h> -#include <linux/seq_file.h> -#include <net/sock.h> -#include <net/tcp_states.h> -#include <net/flow.h> -#include <asm/ioctls.h> -#include <linux/capability.h> -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> -#include <linux/init.h> -#include <linux/poll.h> -#include <linux/jiffies.h> -#include <net/net_namespace.h> -#include <net/neighbour.h> -#include <net/dst.h> -#include <net/fib_rules.h> -#include <net/tcp.h> -#include <net/dn.h> -#include <net/dn_nsp.h> -#include <net/dn_dev.h> -#include <net/dn_route.h> -#include <net/dn_fib.h> -#include <net/dn_neigh.h> - -struct dn_sock { - struct sock sk; - struct dn_scp scp; -}; - -static void dn_keepalive(struct sock *sk); - -#define DN_SK_HASH_SHIFT 8 -#define DN_SK_HASH_SIZE (1 << DN_SK_HASH_SHIFT) -#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) - - -static const struct proto_ops dn_proto_ops; -static DEFINE_RWLOCK(dn_hash_lock); -static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; -static struct hlist_head dn_wild_sk; -static atomic_long_t decnet_memory_allocated; - -static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags); -static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); - -static struct hlist_head *dn_find_list(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->addr.sdn_flags & SDF_WILD) - return hlist_empty(&dn_wild_sk) ? &dn_wild_sk : NULL; - - return &dn_sk_hash[le16_to_cpu(scp->addrloc) & DN_SK_HASH_MASK]; -} - -/* - * Valid ports are those greater than zero and not already in use. - */ -static int check_port(__le16 port) -{ - struct sock *sk; - - if (port == 0) - return -1; - - sk_for_each(sk, &dn_sk_hash[le16_to_cpu(port) & DN_SK_HASH_MASK]) { - struct dn_scp *scp = DN_SK(sk); - if (scp->addrloc == port) - return -1; - } - return 0; -} - -static unsigned short port_alloc(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - static unsigned short port = 0x2000; - unsigned short i_port = port; - - while(check_port(cpu_to_le16(++port)) != 0) { - if (port == i_port) - return 0; - } - - scp->addrloc = cpu_to_le16(port); - - return 1; -} - -/* - * Since this is only ever called from user - * level, we don't need a write_lock() version - * of this. - */ -static int dn_hash_sock(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - struct hlist_head *list; - int rv = -EUSERS; - - BUG_ON(sk_hashed(sk)); - - write_lock_bh(&dn_hash_lock); - - if (!scp->addrloc && !port_alloc(sk)) - goto out; - - rv = -EADDRINUSE; - if ((list = dn_find_list(sk)) == NULL) - goto out; - - sk_add_node(sk, list); - rv = 0; -out: - write_unlock_bh(&dn_hash_lock); - return rv; -} - -static void dn_unhash_sock(struct sock *sk) -{ - write_lock(&dn_hash_lock); - sk_del_node_init(sk); - write_unlock(&dn_hash_lock); -} - -static void dn_unhash_sock_bh(struct sock *sk) -{ - write_lock_bh(&dn_hash_lock); - sk_del_node_init(sk); - write_unlock_bh(&dn_hash_lock); -} - -static struct hlist_head *listen_hash(struct sockaddr_dn *addr) -{ - int i; - unsigned int hash = addr->sdn_objnum; - - if (hash == 0) { - hash = addr->sdn_objnamel; - for(i = 0; i < le16_to_cpu(addr->sdn_objnamel); i++) { - hash ^= addr->sdn_objname[i]; - hash ^= (hash << 3); - } - } - - return &dn_sk_hash[hash & DN_SK_HASH_MASK]; -} - -/* - * Called to transform a socket from bound (i.e. with a local address) - * into a listening socket (doesn't need a local port number) and rehashes - * based upon the object name/number. - */ -static void dn_rehash_sock(struct sock *sk) -{ - struct hlist_head *list; - struct dn_scp *scp = DN_SK(sk); - - if (scp->addr.sdn_flags & SDF_WILD) - return; - - write_lock_bh(&dn_hash_lock); - sk_del_node_init(sk); - DN_SK(sk)->addrloc = 0; - list = listen_hash(&DN_SK(sk)->addr); - sk_add_node(sk, list); - write_unlock_bh(&dn_hash_lock); -} - -int dn_sockaddr2username(struct sockaddr_dn *sdn, unsigned char *buf, unsigned char type) -{ - int len = 2; - - *buf++ = type; - - switch (type) { - case 0: - *buf++ = sdn->sdn_objnum; - break; - case 1: - *buf++ = 0; - *buf++ = le16_to_cpu(sdn->sdn_objnamel); - memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel)); - len = 3 + le16_to_cpu(sdn->sdn_objnamel); - break; - case 2: - memset(buf, 0, 5); - buf += 5; - *buf++ = le16_to_cpu(sdn->sdn_objnamel); - memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel)); - len = 7 + le16_to_cpu(sdn->sdn_objnamel); - break; - } - - return len; -} - -/* - * On reception of usernames, we handle types 1 and 0 for destination - * addresses only. Types 2 and 4 are used for source addresses, but the - * UIC, GIC are ignored and they are both treated the same way. Type 3 - * is never used as I've no idea what its purpose might be or what its - * format is. - */ -int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *sdn, unsigned char *fmt) -{ - unsigned char type; - int size = len; - int namel = 12; - - sdn->sdn_objnum = 0; - sdn->sdn_objnamel = cpu_to_le16(0); - memset(sdn->sdn_objname, 0, DN_MAXOBJL); - - if (len < 2) - return -1; - - len -= 2; - *fmt = *data++; - type = *data++; - - switch (*fmt) { - case 0: - sdn->sdn_objnum = type; - return 2; - case 1: - namel = 16; - break; - case 2: - len -= 4; - data += 4; - break; - case 4: - len -= 8; - data += 8; - break; - default: - return -1; - } - - len -= 1; - - if (len < 0) - return -1; - - sdn->sdn_objnamel = cpu_to_le16(*data++); - len -= le16_to_cpu(sdn->sdn_objnamel); - - if ((len < 0) || (le16_to_cpu(sdn->sdn_objnamel) > namel)) - return -1; - - memcpy(sdn->sdn_objname, data, le16_to_cpu(sdn->sdn_objnamel)); - - return size - len; -} - -struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr) -{ - struct hlist_head *list = listen_hash(addr); - struct sock *sk; - - read_lock(&dn_hash_lock); - sk_for_each(sk, list) { - struct dn_scp *scp = DN_SK(sk); - if (sk->sk_state != TCP_LISTEN) - continue; - if (scp->addr.sdn_objnum) { - if (scp->addr.sdn_objnum != addr->sdn_objnum) - continue; - } else { - if (addr->sdn_objnum) - continue; - if (scp->addr.sdn_objnamel != addr->sdn_objnamel) - continue; - if (memcmp(scp->addr.sdn_objname, addr->sdn_objname, le16_to_cpu(addr->sdn_objnamel)) != 0) - continue; - } - sock_hold(sk); - read_unlock(&dn_hash_lock); - return sk; - } - - sk = sk_head(&dn_wild_sk); - if (sk) { - if (sk->sk_state == TCP_LISTEN) - sock_hold(sk); - else - sk = NULL; - } - - read_unlock(&dn_hash_lock); - return sk; -} - -struct sock *dn_find_by_skb(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct sock *sk; - struct dn_scp *scp; - - read_lock(&dn_hash_lock); - sk_for_each(sk, &dn_sk_hash[le16_to_cpu(cb->dst_port) & DN_SK_HASH_MASK]) { - scp = DN_SK(sk); - if (cb->src != dn_saddr2dn(&scp->peer)) - continue; - if (cb->dst_port != scp->addrloc) - continue; - if (scp->addrrem && (cb->src_port != scp->addrrem)) - continue; - sock_hold(sk); - goto found; - } - sk = NULL; -found: - read_unlock(&dn_hash_lock); - return sk; -} - - - -static void dn_destruct(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - skb_queue_purge(&scp->data_xmit_queue); - skb_queue_purge(&scp->other_xmit_queue); - skb_queue_purge(&scp->other_receive_queue); - - dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); -} - -static unsigned long dn_memory_pressure; - -static void dn_enter_memory_pressure(struct sock *sk) -{ - if (!dn_memory_pressure) { - dn_memory_pressure = 1; - } -} - -static struct proto dn_proto = { - .name = "NSP", - .owner = THIS_MODULE, - .enter_memory_pressure = dn_enter_memory_pressure, - .memory_pressure = &dn_memory_pressure, - .memory_allocated = &decnet_memory_allocated, - .sysctl_mem = sysctl_decnet_mem, - .sysctl_wmem = sysctl_decnet_wmem, - .sysctl_rmem = sysctl_decnet_rmem, - .max_header = DN_MAX_NSP_DATA_HEADER + 64, - .obj_size = sizeof(struct dn_sock), -}; - -static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern) -{ - struct dn_scp *scp; - struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern); - - if (!sk) - goto out; - - if (sock) - sock->ops = &dn_proto_ops; - sock_init_data(sock, sk); - - sk->sk_backlog_rcv = dn_nsp_backlog_rcv; - sk->sk_destruct = dn_destruct; - sk->sk_no_check_tx = 1; - sk->sk_family = PF_DECnet; - sk->sk_protocol = 0; - sk->sk_allocation = gfp; - sk->sk_sndbuf = sysctl_decnet_wmem[1]; - sk->sk_rcvbuf = sysctl_decnet_rmem[1]; - - /* Initialization of DECnet Session Control Port */ - scp = DN_SK(sk); - scp->state = DN_O; /* Open */ - scp->numdat = 1; /* Next data seg to tx */ - scp->numoth = 1; /* Next oth data to tx */ - scp->ackxmt_dat = 0; /* Last data seg ack'ed */ - scp->ackxmt_oth = 0; /* Last oth data ack'ed */ - scp->ackrcv_dat = 0; /* Highest data ack recv*/ - scp->ackrcv_oth = 0; /* Last oth data ack rec*/ - scp->flowrem_sw = DN_SEND; - scp->flowloc_sw = DN_SEND; - scp->flowrem_dat = 0; - scp->flowrem_oth = 1; - scp->flowloc_dat = 0; - scp->flowloc_oth = 1; - scp->services_rem = 0; - scp->services_loc = 1 | NSP_FC_NONE; - scp->info_rem = 0; - scp->info_loc = 0x03; /* NSP version 4.1 */ - scp->segsize_rem = 230 - DN_MAX_NSP_DATA_HEADER; /* Default: Updated by remote segsize */ - scp->nonagle = 0; - scp->multi_ireq = 1; - scp->accept_mode = ACC_IMMED; - scp->addr.sdn_family = AF_DECnet; - scp->peer.sdn_family = AF_DECnet; - scp->accessdata.acc_accl = 5; - memcpy(scp->accessdata.acc_acc, "LINUX", 5); - - scp->max_window = NSP_MAX_WINDOW; - scp->snd_window = NSP_MIN_WINDOW; - scp->nsp_srtt = NSP_INITIAL_SRTT; - scp->nsp_rttvar = NSP_INITIAL_RTTVAR; - scp->nsp_rxtshift = 0; - - skb_queue_head_init(&scp->data_xmit_queue); - skb_queue_head_init(&scp->other_xmit_queue); - skb_queue_head_init(&scp->other_receive_queue); - - scp->persist = 0; - scp->persist_fxn = NULL; - scp->keepalive = 10 * HZ; - scp->keepalive_fxn = dn_keepalive; - - dn_start_slow_timer(sk); -out: - return sk; -} - -/* - * Keepalive timer. - * FIXME: Should respond to SO_KEEPALIVE etc. - */ -static void dn_keepalive(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - /* - * By checking the other_data transmit queue is empty - * we are double checking that we are not sending too - * many of these keepalive frames. - */ - if (skb_queue_empty(&scp->other_xmit_queue)) - dn_nsp_send_link(sk, DN_NOCHANGE, 0); -} - - -/* - * Timer for shutdown/destroyed sockets. - * When socket is dead & no packets have been sent for a - * certain amount of time, they are removed by this - * routine. Also takes care of sending out DI & DC - * frames at correct times. - */ -int dn_destroy_timer(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - scp->persist = dn_nsp_persist(sk); - - switch (scp->state) { - case DN_DI: - dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC); - if (scp->nsp_rxtshift >= decnet_di_count) - scp->state = DN_CN; - return 0; - - case DN_DR: - dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC); - if (scp->nsp_rxtshift >= decnet_dr_count) - scp->state = DN_DRC; - return 0; - - case DN_DN: - if (scp->nsp_rxtshift < decnet_dn_count) { - /* printk(KERN_DEBUG "dn_destroy_timer: DN\n"); */ - dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, - GFP_ATOMIC); - return 0; - } - } - - scp->persist = (HZ * decnet_time_wait); - - if (sk->sk_socket) - return 0; - - if (time_after_eq(jiffies, scp->stamp + HZ * decnet_time_wait)) { - dn_unhash_sock(sk); - sock_put(sk); - return 1; - } - - return 0; -} - -static void dn_destroy_sock(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - scp->nsp_rxtshift = 0; /* reset back off */ - - if (sk->sk_socket) { - if (sk->sk_socket->state != SS_UNCONNECTED) - sk->sk_socket->state = SS_DISCONNECTING; - } - - sk->sk_state = TCP_CLOSE; - - switch (scp->state) { - case DN_DN: - dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, - sk->sk_allocation); - scp->persist_fxn = dn_destroy_timer; - scp->persist = dn_nsp_persist(sk); - break; - case DN_CR: - scp->state = DN_DR; - goto disc_reject; - case DN_RUN: - scp->state = DN_DI; - /* fall through */ - case DN_DI: - case DN_DR: -disc_reject: - dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation); - /* fall through */ - case DN_NC: - case DN_NR: - case DN_RJ: - case DN_DIC: - case DN_CN: - case DN_DRC: - case DN_CI: - case DN_CD: - scp->persist_fxn = dn_destroy_timer; - scp->persist = dn_nsp_persist(sk); - break; - default: - printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n"); - /* fall through */ - case DN_O: - dn_stop_slow_timer(sk); - - dn_unhash_sock_bh(sk); - sock_put(sk); - - break; - } -} - -char *dn_addr2asc(__u16 addr, char *buf) -{ - unsigned short node, area; - - node = addr & 0x03ff; - area = addr >> 10; - sprintf(buf, "%hd.%hd", area, node); - - return buf; -} - - - -static int dn_create(struct net *net, struct socket *sock, int protocol, - int kern) -{ - struct sock *sk; - - if (protocol < 0 || protocol > SK_PROTOCOL_MAX) - return -EINVAL; - - if (!net_eq(net, &init_net)) - return -EAFNOSUPPORT; - - switch (sock->type) { - case SOCK_SEQPACKET: - if (protocol != DNPROTO_NSP) - return -EPROTONOSUPPORT; - break; - case SOCK_STREAM: - break; - default: - return -ESOCKTNOSUPPORT; - } - - - if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL) - return -ENOBUFS; - - sk->sk_protocol = protocol; - - return 0; -} - - -static int -dn_release(struct socket *sock) -{ - struct sock *sk = sock->sk; - - if (sk) { - sock_orphan(sk); - sock_hold(sk); - lock_sock(sk); - dn_destroy_sock(sk); - release_sock(sk); - sock_put(sk); - } - - return 0; -} - -static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - struct sockaddr_dn *saddr = (struct sockaddr_dn *)uaddr; - struct net_device *dev, *ldev; - int rv; - - if (addr_len != sizeof(struct sockaddr_dn)) - return -EINVAL; - - if (saddr->sdn_family != AF_DECnet) - return -EINVAL; - - if (le16_to_cpu(saddr->sdn_nodeaddrl) && (le16_to_cpu(saddr->sdn_nodeaddrl) != 2)) - return -EINVAL; - - if (le16_to_cpu(saddr->sdn_objnamel) > DN_MAXOBJL) - return -EINVAL; - - if (saddr->sdn_flags & ~SDF_WILD) - return -EINVAL; - - if (!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum || - (saddr->sdn_flags & SDF_WILD))) - return -EACCES; - - if (!(saddr->sdn_flags & SDF_WILD)) { - if (le16_to_cpu(saddr->sdn_nodeaddrl)) { - rcu_read_lock(); - ldev = NULL; - for_each_netdev_rcu(&init_net, dev) { - if (!dev->dn_ptr) - continue; - if (dn_dev_islocal(dev, dn_saddr2dn(saddr))) { - ldev = dev; - break; - } - } - rcu_read_unlock(); - if (ldev == NULL) - return -EADDRNOTAVAIL; - } - } - - rv = -EINVAL; - lock_sock(sk); - if (sock_flag(sk, SOCK_ZAPPED)) { - memcpy(&scp->addr, saddr, addr_len); - sock_reset_flag(sk, SOCK_ZAPPED); - - rv = dn_hash_sock(sk); - if (rv) - sock_set_flag(sk, SOCK_ZAPPED); - } - release_sock(sk); - - return rv; -} - - -static int dn_auto_bind(struct socket *sock) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - int rv; - - sock_reset_flag(sk, SOCK_ZAPPED); - - scp->addr.sdn_flags = 0; - scp->addr.sdn_objnum = 0; - - /* - * This stuff is to keep compatibility with Eduardo's - * patch. I hope I can dispense with it shortly... - */ - if ((scp->accessdata.acc_accl != 0) && - (scp->accessdata.acc_accl <= 12)) { - - scp->addr.sdn_objnamel = cpu_to_le16(scp->accessdata.acc_accl); - memcpy(scp->addr.sdn_objname, scp->accessdata.acc_acc, le16_to_cpu(scp->addr.sdn_objnamel)); - - scp->accessdata.acc_accl = 0; - memset(scp->accessdata.acc_acc, 0, 40); - } - /* End of compatibility stuff */ - - scp->addr.sdn_add.a_len = cpu_to_le16(2); - rv = dn_dev_bind_default((__le16 *)scp->addr.sdn_add.a_addr); - if (rv == 0) { - rv = dn_hash_sock(sk); - if (rv) - sock_set_flag(sk, SOCK_ZAPPED); - } - - return rv; -} - -static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) -{ - struct dn_scp *scp = DN_SK(sk); - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int err; - - if (scp->state != DN_CR) - return -EINVAL; - - scp->state = DN_CC; - scp->segsize_loc = dst_metric_advmss(__sk_dst_get(sk)); - dn_send_conn_conf(sk, allocation); - - add_wait_queue(sk_sleep(sk), &wait); - for(;;) { - release_sock(sk); - if (scp->state == DN_CC) - *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); - lock_sock(sk); - err = 0; - if (scp->state == DN_RUN) - break; - err = sock_error(sk); - if (err) - break; - err = sock_intr_errno(*timeo); - if (signal_pending(current)) - break; - err = -EAGAIN; - if (!*timeo) - break; - } - remove_wait_queue(sk_sleep(sk), &wait); - if (err == 0) { - sk->sk_socket->state = SS_CONNECTED; - } else if (scp->state != DN_CC) { - sk->sk_socket->state = SS_UNCONNECTED; - } - return err; -} - -static int dn_wait_run(struct sock *sk, long *timeo) -{ - struct dn_scp *scp = DN_SK(sk); - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int err = 0; - - if (scp->state == DN_RUN) - goto out; - - if (!*timeo) - return -EALREADY; - - add_wait_queue(sk_sleep(sk), &wait); - for(;;) { - release_sock(sk); - if (scp->state == DN_CI || scp->state == DN_CC) - *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); - lock_sock(sk); - err = 0; - if (scp->state == DN_RUN) - break; - err = sock_error(sk); - if (err) - break; - err = sock_intr_errno(*timeo); - if (signal_pending(current)) - break; - err = -ETIMEDOUT; - if (!*timeo) - break; - } - remove_wait_queue(sk_sleep(sk), &wait); -out: - if (err == 0) { - sk->sk_socket->state = SS_CONNECTED; - } else if (scp->state != DN_CI && scp->state != DN_CC) { - sk->sk_socket->state = SS_UNCONNECTED; - } - return err; -} - -static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags) -{ - struct socket *sock = sk->sk_socket; - struct dn_scp *scp = DN_SK(sk); - int err = -EISCONN; - struct flowidn fld; - struct dst_entry *dst; - - if (sock->state == SS_CONNECTED) - goto out; - - if (sock->state == SS_CONNECTING) { - err = 0; - if (scp->state == DN_RUN) { - sock->state = SS_CONNECTED; - goto out; - } - err = -ECONNREFUSED; - if (scp->state != DN_CI && scp->state != DN_CC) { - sock->state = SS_UNCONNECTED; - goto out; - } - return dn_wait_run(sk, timeo); - } - - err = -EINVAL; - if (scp->state != DN_O) - goto out; - - if (addr == NULL || addrlen != sizeof(struct sockaddr_dn)) - goto out; - if (addr->sdn_family != AF_DECnet) - goto out; - if (addr->sdn_flags & SDF_WILD) - goto out; - - if (sock_flag(sk, SOCK_ZAPPED)) { - err = dn_auto_bind(sk->sk_socket); - if (err) - goto out; - } - - memcpy(&scp->peer, addr, sizeof(struct sockaddr_dn)); - - err = -EHOSTUNREACH; - memset(&fld, 0, sizeof(fld)); - fld.flowidn_oif = sk->sk_bound_dev_if; - fld.daddr = dn_saddr2dn(&scp->peer); - fld.saddr = dn_saddr2dn(&scp->addr); - dn_sk_ports_copy(&fld, scp); - fld.flowidn_proto = DNPROTO_NSP; - if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, flags) < 0) - goto out; - dst = __sk_dst_get(sk); - sk->sk_route_caps = dst->dev->features; - sock->state = SS_CONNECTING; - scp->state = DN_CI; - scp->segsize_loc = dst_metric_advmss(dst); - - dn_nsp_send_conninit(sk, NSP_CI); - err = -EINPROGRESS; - if (*timeo) { - err = dn_wait_run(sk, timeo); - } -out: - return err; -} - -static int dn_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags) -{ - struct sockaddr_dn *addr = (struct sockaddr_dn *)uaddr; - struct sock *sk = sock->sk; - int err; - long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); - - lock_sock(sk); - err = __dn_connect(sk, addr, addrlen, &timeo, 0); - release_sock(sk); - - return err; -} - -static inline int dn_check_state(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags) -{ - struct dn_scp *scp = DN_SK(sk); - - switch (scp->state) { - case DN_RUN: - return 0; - case DN_CR: - return dn_confirm_accept(sk, timeo, sk->sk_allocation); - case DN_CI: - case DN_CC: - return dn_wait_run(sk, timeo); - case DN_O: - return __dn_connect(sk, addr, addrlen, timeo, flags); - } - - return -EINVAL; -} - - -static void dn_access_copy(struct sk_buff *skb, struct accessdata_dn *acc) -{ - unsigned char *ptr = skb->data; - - acc->acc_userl = *ptr++; - memcpy(&acc->acc_user, ptr, acc->acc_userl); - ptr += acc->acc_userl; - - acc->acc_passl = *ptr++; - memcpy(&acc->acc_pass, ptr, acc->acc_passl); - ptr += acc->acc_passl; - - acc->acc_accl = *ptr++; - memcpy(&acc->acc_acc, ptr, acc->acc_accl); - - skb_pull(skb, acc->acc_accl + acc->acc_passl + acc->acc_userl + 3); - -} - -static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt) -{ - unsigned char *ptr = skb->data; - u16 len = *ptr++; /* yes, it's 8bit on the wire */ - - BUG_ON(len > 16); /* we've checked the contents earlier */ - opt->opt_optl = cpu_to_le16(len); - opt->opt_status = 0; - memcpy(opt->opt_data, ptr, len); - skb_pull(skb, len + 1); -} - -static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct sk_buff *skb = NULL; - int err = 0; - - add_wait_queue(sk_sleep(sk), &wait); - for(;;) { - release_sock(sk); - skb = skb_dequeue(&sk->sk_receive_queue); - if (skb == NULL) { - *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); - skb = skb_dequeue(&sk->sk_receive_queue); - } - lock_sock(sk); - if (skb != NULL) - break; - err = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - break; - err = sock_intr_errno(*timeo); - if (signal_pending(current)) - break; - err = -EAGAIN; - if (!*timeo) - break; - } - remove_wait_queue(sk_sleep(sk), &wait); - - return skb == NULL ? ERR_PTR(err) : skb; -} - -static int dn_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) -{ - struct sock *sk = sock->sk, *newsk; - struct sk_buff *skb = NULL; - struct dn_skb_cb *cb; - unsigned char menuver; - int err = 0; - unsigned char type; - long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - struct dst_entry *dst; - - lock_sock(sk); - - if (sk->sk_state != TCP_LISTEN || DN_SK(sk)->state != DN_O) { - release_sock(sk); - return -EINVAL; - } - - skb = skb_dequeue(&sk->sk_receive_queue); - if (skb == NULL) { - skb = dn_wait_for_connect(sk, &timeo); - if (IS_ERR(skb)) { - release_sock(sk); - return PTR_ERR(skb); - } - } - - cb = DN_SKB_CB(skb); - sk->sk_ack_backlog--; - newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern); - if (newsk == NULL) { - release_sock(sk); - kfree_skb(skb); - return -ENOBUFS; - } - release_sock(sk); - - dst = skb_dst(skb); - sk_dst_set(newsk, dst); - skb_dst_set(skb, NULL); - - DN_SK(newsk)->state = DN_CR; - DN_SK(newsk)->addrrem = cb->src_port; - DN_SK(newsk)->services_rem = cb->services; - DN_SK(newsk)->info_rem = cb->info; - DN_SK(newsk)->segsize_rem = cb->segsize; - DN_SK(newsk)->accept_mode = DN_SK(sk)->accept_mode; - - if (DN_SK(newsk)->segsize_rem < 230) - DN_SK(newsk)->segsize_rem = 230; - - if ((DN_SK(newsk)->services_rem & NSP_FC_MASK) == NSP_FC_NONE) - DN_SK(newsk)->max_window = decnet_no_fc_max_cwnd; - - newsk->sk_state = TCP_LISTEN; - memcpy(&(DN_SK(newsk)->addr), &(DN_SK(sk)->addr), sizeof(struct sockaddr_dn)); - - /* - * If we are listening on a wild socket, we don't want - * the newly created socket on the wrong hash queue. - */ - DN_SK(newsk)->addr.sdn_flags &= ~SDF_WILD; - - skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->addr), &type)); - skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->peer), &type)); - *(__le16 *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src; - *(__le16 *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst; - - menuver = *skb->data; - skb_pull(skb, 1); - - if (menuver & DN_MENUVER_ACC) - dn_access_copy(skb, &(DN_SK(newsk)->accessdata)); - - if (menuver & DN_MENUVER_USR) - dn_user_copy(skb, &(DN_SK(newsk)->conndata_in)); - - if (menuver & DN_MENUVER_PRX) - DN_SK(newsk)->peer.sdn_flags |= SDF_PROXY; - - if (menuver & DN_MENUVER_UIC) - DN_SK(newsk)->peer.sdn_flags |= SDF_UICPROXY; - - kfree_skb(skb); - - memcpy(&(DN_SK(newsk)->conndata_out), &(DN_SK(sk)->conndata_out), - sizeof(struct optdata_dn)); - memcpy(&(DN_SK(newsk)->discdata_out), &(DN_SK(sk)->discdata_out), - sizeof(struct optdata_dn)); - - lock_sock(newsk); - err = dn_hash_sock(newsk); - if (err == 0) { - sock_reset_flag(newsk, SOCK_ZAPPED); - dn_send_conn_ack(newsk); - - /* - * Here we use sk->sk_allocation since although the conn conf is - * for the newsk, the context is the old socket. - */ - if (DN_SK(newsk)->accept_mode == ACC_IMMED) - err = dn_confirm_accept(newsk, &timeo, - sk->sk_allocation); - } - release_sock(newsk); - return err; -} - - -static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer) -{ - struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr; - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - - lock_sock(sk); - - if (peer) { - if ((sock->state != SS_CONNECTED && - sock->state != SS_CONNECTING) && - scp->accept_mode == ACC_IMMED) { - release_sock(sk); - return -ENOTCONN; - } - - memcpy(sa, &scp->peer, sizeof(struct sockaddr_dn)); - } else { - memcpy(sa, &scp->addr, sizeof(struct sockaddr_dn)); - } - - release_sock(sk); - - return sizeof(struct sockaddr_dn); -} - - -static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - __poll_t mask = datagram_poll(file, sock, wait); - - if (!skb_queue_empty_lockless(&scp->other_receive_queue)) - mask |= EPOLLRDBAND; - - return mask; -} - -static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - int err = -EOPNOTSUPP; - long amount = 0; - struct sk_buff *skb; - int val; - - switch(cmd) - { - case SIOCGIFADDR: - case SIOCSIFADDR: - return dn_dev_ioctl(cmd, (void __user *)arg); - - case SIOCATMARK: - lock_sock(sk); - val = !skb_queue_empty(&scp->other_receive_queue); - if (scp->state != DN_RUN) - val = -ENOTCONN; - release_sock(sk); - return val; - - case TIOCOUTQ: - amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); - if (amount < 0) - amount = 0; - err = put_user(amount, (int __user *)arg); - break; - - case TIOCINQ: - lock_sock(sk); - skb = skb_peek(&scp->other_receive_queue); - if (skb) { - amount = skb->len; - } else { - skb_queue_walk(&sk->sk_receive_queue, skb) - amount += skb->len; - } - release_sock(sk); - err = put_user(amount, (int __user *)arg); - break; - - default: - err = -ENOIOCTLCMD; - break; - } - - return err; -} - -static int dn_listen(struct socket *sock, int backlog) -{ - struct sock *sk = sock->sk; - int err = -EINVAL; - - lock_sock(sk); - - if (sock_flag(sk, SOCK_ZAPPED)) - goto out; - - if ((DN_SK(sk)->state != DN_O) || (sk->sk_state == TCP_LISTEN)) - goto out; - - sk->sk_max_ack_backlog = backlog; - sk->sk_ack_backlog = 0; - sk->sk_state = TCP_LISTEN; - err = 0; - dn_rehash_sock(sk); - -out: - release_sock(sk); - - return err; -} - - -static int dn_shutdown(struct socket *sock, int how) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - int err = -ENOTCONN; - - lock_sock(sk); - - if (sock->state == SS_UNCONNECTED) - goto out; - - err = 0; - if (sock->state == SS_DISCONNECTING) - goto out; - - err = -EINVAL; - if (scp->state == DN_O) - goto out; - - if (how != SHUT_RDWR) - goto out; - - sk->sk_shutdown = SHUTDOWN_MASK; - dn_destroy_sock(sk); - err = 0; - -out: - release_sock(sk); - - return err; -} - -static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) -{ - struct sock *sk = sock->sk; - int err; - - lock_sock(sk); - err = __dn_setsockopt(sock, level, optname, optval, optlen, 0); - release_sock(sk); -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != DSO_LINKINFO && - optname != DSO_STREAM && optname != DSO_SEQPACKET) - err = nf_setsockopt(sk, PF_DECnet, optname, optval, optlen); -#endif - - return err; -} - -static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen, int flags) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - long timeo; - union { - struct optdata_dn opt; - struct accessdata_dn acc; - int mode; - unsigned long win; - int val; - unsigned char services; - unsigned char info; - } u; - int err; - - if (optlen && !optval) - return -EINVAL; - - if (optlen > sizeof(u)) - return -EINVAL; - - if (copy_from_user(&u, optval, optlen)) - return -EFAULT; - - switch (optname) { - case DSO_CONDATA: - if (sock->state == SS_CONNECTED) - return -EISCONN; - if ((scp->state != DN_O) && (scp->state != DN_CR)) - return -EINVAL; - - if (optlen != sizeof(struct optdata_dn)) - return -EINVAL; - - if (le16_to_cpu(u.opt.opt_optl) > 16) - return -EINVAL; - - memcpy(&scp->conndata_out, &u.opt, optlen); - break; - - case DSO_DISDATA: - if (sock->state != SS_CONNECTED && - scp->accept_mode == ACC_IMMED) - return -ENOTCONN; - - if (optlen != sizeof(struct optdata_dn)) - return -EINVAL; - - if (le16_to_cpu(u.opt.opt_optl) > 16) - return -EINVAL; - - memcpy(&scp->discdata_out, &u.opt, optlen); - break; - - case DSO_CONACCESS: - if (sock->state == SS_CONNECTED) - return -EISCONN; - if (scp->state != DN_O) - return -EINVAL; - - if (optlen != sizeof(struct accessdata_dn)) - return -EINVAL; - - if ((u.acc.acc_accl > DN_MAXACCL) || - (u.acc.acc_passl > DN_MAXACCL) || - (u.acc.acc_userl > DN_MAXACCL)) - return -EINVAL; - - memcpy(&scp->accessdata, &u.acc, optlen); - break; - - case DSO_ACCEPTMODE: - if (sock->state == SS_CONNECTED) - return -EISCONN; - if (scp->state != DN_O) - return -EINVAL; - - if (optlen != sizeof(int)) - return -EINVAL; - - if ((u.mode != ACC_IMMED) && (u.mode != ACC_DEFER)) - return -EINVAL; - - scp->accept_mode = (unsigned char)u.mode; - break; - - case DSO_CONACCEPT: - if (scp->state != DN_CR) - return -EINVAL; - timeo = sock_rcvtimeo(sk, 0); - err = dn_confirm_accept(sk, &timeo, sk->sk_allocation); - return err; - - case DSO_CONREJECT: - if (scp->state != DN_CR) - return -EINVAL; - - scp->state = DN_DR; - sk->sk_shutdown = SHUTDOWN_MASK; - dn_nsp_send_disc(sk, 0x38, 0, sk->sk_allocation); - break; - - case DSO_MAXWINDOW: - if (optlen != sizeof(unsigned long)) - return -EINVAL; - if (u.win > NSP_MAX_WINDOW) - u.win = NSP_MAX_WINDOW; - if (u.win == 0) - return -EINVAL; - scp->max_window = u.win; - if (scp->snd_window > u.win) - scp->snd_window = u.win; - break; - - case DSO_NODELAY: - if (optlen != sizeof(int)) - return -EINVAL; - if (scp->nonagle == TCP_NAGLE_CORK) - return -EINVAL; - scp->nonagle = (u.val == 0) ? 0 : TCP_NAGLE_OFF; - /* if (scp->nonagle == 1) { Push pending frames } */ - break; - - case DSO_CORK: - if (optlen != sizeof(int)) - return -EINVAL; - if (scp->nonagle == TCP_NAGLE_OFF) - return -EINVAL; - scp->nonagle = (u.val == 0) ? 0 : TCP_NAGLE_CORK; - /* if (scp->nonagle == 0) { Push pending frames } */ - break; - - case DSO_SERVICES: - if (optlen != sizeof(unsigned char)) - return -EINVAL; - if ((u.services & ~NSP_FC_MASK) != 0x01) - return -EINVAL; - if ((u.services & NSP_FC_MASK) == NSP_FC_MASK) - return -EINVAL; - scp->services_loc = u.services; - break; - - case DSO_INFO: - if (optlen != sizeof(unsigned char)) - return -EINVAL; - if (u.info & 0xfc) - return -EINVAL; - scp->info_loc = u.info; - break; - - case DSO_LINKINFO: - case DSO_STREAM: - case DSO_SEQPACKET: - default: - return -ENOPROTOOPT; - } - - return 0; -} - -static int dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) -{ - struct sock *sk = sock->sk; - int err; - - lock_sock(sk); - err = __dn_getsockopt(sock, level, optname, optval, optlen, 0); - release_sock(sk); -#ifdef CONFIG_NETFILTER - if (err == -ENOPROTOOPT && optname != DSO_STREAM && - optname != DSO_SEQPACKET && optname != DSO_CONACCEPT && - optname != DSO_CONREJECT) { - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - err = nf_getsockopt(sk, PF_DECnet, optname, optval, &len); - if (err >= 0) - err = put_user(len, optlen); - } -#endif - - return err; -} - -static int __dn_getsockopt(struct socket *sock, int level,int optname, char __user *optval,int __user *optlen, int flags) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - struct linkinfo_dn link; - unsigned int r_len; - void *r_data = NULL; - unsigned int val; - - if(get_user(r_len , optlen)) - return -EFAULT; - - switch (optname) { - case DSO_CONDATA: - if (r_len > sizeof(struct optdata_dn)) - r_len = sizeof(struct optdata_dn); - r_data = &scp->conndata_in; - break; - - case DSO_DISDATA: - if (r_len > sizeof(struct optdata_dn)) - r_len = sizeof(struct optdata_dn); - r_data = &scp->discdata_in; - break; - - case DSO_CONACCESS: - if (r_len > sizeof(struct accessdata_dn)) - r_len = sizeof(struct accessdata_dn); - r_data = &scp->accessdata; - break; - - case DSO_ACCEPTMODE: - if (r_len > sizeof(unsigned char)) - r_len = sizeof(unsigned char); - r_data = &scp->accept_mode; - break; - - case DSO_LINKINFO: - if (r_len > sizeof(struct linkinfo_dn)) - r_len = sizeof(struct linkinfo_dn); - - memset(&link, 0, sizeof(link)); - - switch (sock->state) { - case SS_CONNECTING: - link.idn_linkstate = LL_CONNECTING; - break; - case SS_DISCONNECTING: - link.idn_linkstate = LL_DISCONNECTING; - break; - case SS_CONNECTED: - link.idn_linkstate = LL_RUNNING; - break; - default: - link.idn_linkstate = LL_INACTIVE; - } - - link.idn_segsize = scp->segsize_rem; - r_data = &link; - break; - - case DSO_MAXWINDOW: - if (r_len > sizeof(unsigned long)) - r_len = sizeof(unsigned long); - r_data = &scp->max_window; - break; - - case DSO_NODELAY: - if (r_len > sizeof(int)) - r_len = sizeof(int); - val = (scp->nonagle == TCP_NAGLE_OFF); - r_data = &val; - break; - - case DSO_CORK: - if (r_len > sizeof(int)) - r_len = sizeof(int); - val = (scp->nonagle == TCP_NAGLE_CORK); - r_data = &val; - break; - - case DSO_SERVICES: - if (r_len > sizeof(unsigned char)) - r_len = sizeof(unsigned char); - r_data = &scp->services_rem; - break; - - case DSO_INFO: - if (r_len > sizeof(unsigned char)) - r_len = sizeof(unsigned char); - r_data = &scp->info_rem; - break; - - case DSO_STREAM: - case DSO_SEQPACKET: - case DSO_CONACCEPT: - case DSO_CONREJECT: - default: - return -ENOPROTOOPT; - } - - if (r_data) { - if (copy_to_user(optval, r_data, r_len)) - return -EFAULT; - if (put_user(r_len, optlen)) - return -EFAULT; - } - - return 0; -} - - -static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int target) -{ - struct sk_buff *skb; - int len = 0; - - if (flags & MSG_OOB) - return !skb_queue_empty(q) ? 1 : 0; - - skb_queue_walk(q, skb) { - struct dn_skb_cb *cb = DN_SKB_CB(skb); - len += skb->len; - - if (cb->nsp_flags & 0x40) { - /* SOCK_SEQPACKET reads to EOM */ - if (sk->sk_type == SOCK_SEQPACKET) - return 1; - /* so does SOCK_STREAM unless WAITALL is specified */ - if (!(flags & MSG_WAITALL)) - return 1; - } - - /* minimum data length for read exceeded */ - if (len >= target) - return 1; - } - - return 0; -} - - -static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, - int flags) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - struct sk_buff_head *queue = &sk->sk_receive_queue; - size_t target = size > 1 ? 1 : 0; - size_t copied = 0; - int rv = 0; - struct sk_buff *skb, *n; - struct dn_skb_cb *cb = NULL; - unsigned char eor = 0; - long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - lock_sock(sk); - - if (sock_flag(sk, SOCK_ZAPPED)) { - rv = -EADDRNOTAVAIL; - goto out; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN) { - rv = 0; - goto out; - } - - rv = dn_check_state(sk, NULL, 0, &timeo, flags); - if (rv) - goto out; - - if (flags & ~(MSG_CMSG_COMPAT|MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { - rv = -EOPNOTSUPP; - goto out; - } - - if (flags & MSG_OOB) - queue = &scp->other_receive_queue; - - if (flags & MSG_WAITALL) - target = size; - - - /* - * See if there is data ready to read, sleep if there isn't - */ - for(;;) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - if (sk->sk_err) - goto out; - - if (!skb_queue_empty(&scp->other_receive_queue)) { - if (!(flags & MSG_OOB)) { - msg->msg_flags |= MSG_OOB; - if (!scp->other_report) { - scp->other_report = 1; - goto out; - } - } - } - - if (scp->state != DN_RUN) - goto out; - - if (signal_pending(current)) { - rv = sock_intr_errno(timeo); - goto out; - } - - if (dn_data_ready(sk, queue, flags, target)) - break; - - if (flags & MSG_DONTWAIT) { - rv = -EWOULDBLOCK; - goto out; - } - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - } - - skb_queue_walk_safe(queue, skb, n) { - unsigned int chunk = skb->len; - cb = DN_SKB_CB(skb); - - if ((chunk + copied) > size) - chunk = size - copied; - - if (memcpy_to_msg(msg, skb->data, chunk)) { - rv = -EFAULT; - break; - } - copied += chunk; - - if (!(flags & MSG_PEEK)) - skb_pull(skb, chunk); - - eor = cb->nsp_flags & 0x40; - - if (skb->len == 0) { - skb_unlink(skb, queue); - kfree_skb(skb); - /* - * N.B. Don't refer to skb or cb after this point - * in loop. - */ - if ((scp->flowloc_sw == DN_DONTSEND) && !dn_congested(sk)) { - scp->flowloc_sw = DN_SEND; - dn_nsp_send_link(sk, DN_SEND, 0); - } - } - - if (eor) { - if (sk->sk_type == SOCK_SEQPACKET) - break; - if (!(flags & MSG_WAITALL)) - break; - } - - if (flags & MSG_OOB) - break; - - if (copied >= target) - break; - } - - rv = copied; - - - if (eor && (sk->sk_type == SOCK_SEQPACKET)) - msg->msg_flags |= MSG_EOR; - -out: - if (rv == 0) - rv = (flags & MSG_PEEK) ? -sk->sk_err : sock_error(sk); - - if ((rv >= 0) && msg->msg_name) { - __sockaddr_check_size(sizeof(struct sockaddr_dn)); - memcpy(msg->msg_name, &scp->peer, sizeof(struct sockaddr_dn)); - msg->msg_namelen = sizeof(struct sockaddr_dn); - } - - release_sock(sk); - - return rv; -} - - -static inline int dn_queue_too_long(struct dn_scp *scp, struct sk_buff_head *queue, int flags) -{ - unsigned char fctype = scp->services_rem & NSP_FC_MASK; - if (skb_queue_len(queue) >= scp->snd_window) - return 1; - if (fctype != NSP_FC_NONE) { - if (flags & MSG_OOB) { - if (scp->flowrem_oth == 0) - return 1; - } else { - if (scp->flowrem_dat == 0) - return 1; - } - } - return 0; -} - -/* - * The DECnet spec requires that the "routing layer" accepts packets which - * are at least 230 bytes in size. This excludes any headers which the NSP - * layer might add, so we always assume that we'll be using the maximal - * length header on data packets. The variation in length is due to the - * inclusion (or not) of the two 16 bit acknowledgement fields so it doesn't - * make much practical difference. - */ -unsigned int dn_mss_from_pmtu(struct net_device *dev, int mtu) -{ - unsigned int mss = 230 - DN_MAX_NSP_DATA_HEADER; - if (dev) { - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - mtu -= LL_RESERVED_SPACE(dev); - if (dn_db->use_long) - mtu -= 21; - else - mtu -= 6; - mtu -= DN_MAX_NSP_DATA_HEADER; - } else { - /* - * 21 = long header, 16 = guess at MAC header length - */ - mtu -= (21 + DN_MAX_NSP_DATA_HEADER + 16); - } - if (mtu > mss) - mss = mtu; - return mss; -} - -static inline unsigned int dn_current_mss(struct sock *sk, int flags) -{ - struct dst_entry *dst = __sk_dst_get(sk); - struct dn_scp *scp = DN_SK(sk); - int mss_now = min_t(int, scp->segsize_loc, scp->segsize_rem); - - /* Other data messages are limited to 16 bytes per packet */ - if (flags & MSG_OOB) - return 16; - - /* This works out the maximum size of segment we can send out */ - if (dst) { - u32 mtu = dst_mtu(dst); - mss_now = min_t(int, dn_mss_from_pmtu(dst->dev, mtu), mss_now); - } - - return mss_now; -} - -/* - * N.B. We get the timeout wrong here, but then we always did get it - * wrong before and this is another step along the road to correcting - * it. It ought to get updated each time we pass through the routine, - * but in practise it probably doesn't matter too much for now. - */ -static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk, - unsigned long datalen, int noblock, - int *errcode) -{ - struct sk_buff *skb = sock_alloc_send_skb(sk, datalen, - noblock, errcode); - if (skb) { - skb->protocol = htons(ETH_P_DNA_RT); - skb->pkt_type = PACKET_OUTGOING; - } - return skb; -} - -static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - size_t mss; - struct sk_buff_head *queue = &scp->data_xmit_queue; - int flags = msg->msg_flags; - int err = 0; - size_t sent = 0; - int addr_len = msg->msg_namelen; - DECLARE_SOCKADDR(struct sockaddr_dn *, addr, msg->msg_name); - struct sk_buff *skb = NULL; - struct dn_skb_cb *cb; - size_t len; - unsigned char fctype; - long timeo; - - if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT)) - return -EOPNOTSUPP; - - if (addr_len && (addr_len != sizeof(struct sockaddr_dn))) - return -EINVAL; - - lock_sock(sk); - timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - /* - * The only difference between stream sockets and sequenced packet - * sockets is that the stream sockets always behave as if MSG_EOR - * has been set. - */ - if (sock->type == SOCK_STREAM) { - if (flags & MSG_EOR) { - err = -EINVAL; - goto out; - } - flags |= MSG_EOR; - } - - - err = dn_check_state(sk, addr, addr_len, &timeo, flags); - if (err) - goto out_err; - - if (sk->sk_shutdown & SEND_SHUTDOWN) { - err = -EPIPE; - if (!(flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - goto out_err; - } - - if ((flags & MSG_TRYHARD) && sk->sk_dst_cache) - dst_negative_advice(sk); - - mss = scp->segsize_rem; - fctype = scp->services_rem & NSP_FC_MASK; - - mss = dn_current_mss(sk, flags); - - if (flags & MSG_OOB) { - queue = &scp->other_xmit_queue; - if (size > mss) { - err = -EMSGSIZE; - goto out; - } - } - - scp->persist_fxn = dn_nsp_xmit_timeout; - - while(sent < size) { - err = sock_error(sk); - if (err) - goto out; - - if (signal_pending(current)) { - err = sock_intr_errno(timeo); - goto out; - } - - /* - * Calculate size that we wish to send. - */ - len = size - sent; - - if (len > mss) - len = mss; - - /* - * Wait for queue size to go down below the window - * size. - */ - if (dn_queue_too_long(scp, queue, flags)) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - if (flags & MSG_DONTWAIT) { - err = -EWOULDBLOCK; - goto out; - } - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, - !dn_queue_too_long(scp, queue, flags), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - continue; - } - - /* - * Get a suitably sized skb. - * 64 is a bit of a hack really, but its larger than any - * link-layer headers and has served us well as a good - * guess as to their real length. - */ - skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER, - flags & MSG_DONTWAIT, &err); - - if (err) - break; - - if (!skb) - continue; - - cb = DN_SKB_CB(skb); - - skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER); - - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - err = -EFAULT; - goto out; - } - - if (flags & MSG_OOB) { - cb->nsp_flags = 0x30; - if (fctype != NSP_FC_NONE) - scp->flowrem_oth--; - } else { - cb->nsp_flags = 0x00; - if (scp->seg_total == 0) - cb->nsp_flags |= 0x20; - - scp->seg_total += len; - - if (((sent + len) == size) && (flags & MSG_EOR)) { - cb->nsp_flags |= 0x40; - scp->seg_total = 0; - if (fctype == NSP_FC_SCMC) - scp->flowrem_dat--; - } - if (fctype == NSP_FC_SRC) - scp->flowrem_dat--; - } - - sent += len; - dn_nsp_queue_xmit(sk, skb, sk->sk_allocation, flags & MSG_OOB); - skb = NULL; - - scp->persist = dn_nsp_persist(sk); - - } -out: - - kfree_skb(skb); - - release_sock(sk); - - return sent ? sent : err; - -out_err: - err = sk_stream_error(sk, flags, err); - release_sock(sk); - return err; -} - -static int dn_device_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - switch (event) { - case NETDEV_UP: - dn_dev_up(dev); - break; - case NETDEV_DOWN: - dn_dev_down(dev); - break; - default: - break; - } - - return NOTIFY_DONE; -} - -static struct notifier_block dn_dev_notifier = { - .notifier_call = dn_device_event, -}; - -static struct packet_type dn_dix_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_DNA_RT), - .func = dn_route_rcv, -}; - -#ifdef CONFIG_PROC_FS -struct dn_iter_state { - int bucket; -}; - -static struct sock *dn_socket_get_first(struct seq_file *seq) -{ - struct dn_iter_state *state = seq->private; - struct sock *n = NULL; - - for(state->bucket = 0; - state->bucket < DN_SK_HASH_SIZE; - ++state->bucket) { - n = sk_head(&dn_sk_hash[state->bucket]); - if (n) - break; - } - - return n; -} - -static struct sock *dn_socket_get_next(struct seq_file *seq, - struct sock *n) -{ - struct dn_iter_state *state = seq->private; - - n = sk_next(n); -try_again: - if (n) - goto out; - if (++state->bucket >= DN_SK_HASH_SIZE) - goto out; - n = sk_head(&dn_sk_hash[state->bucket]); - goto try_again; -out: - return n; -} - -static struct sock *socket_get_idx(struct seq_file *seq, loff_t *pos) -{ - struct sock *sk = dn_socket_get_first(seq); - - if (sk) { - while(*pos && (sk = dn_socket_get_next(seq, sk))) - --*pos; - } - return *pos ? NULL : sk; -} - -static void *dn_socket_get_idx(struct seq_file *seq, loff_t pos) -{ - void *rc; - read_lock_bh(&dn_hash_lock); - rc = socket_get_idx(seq, &pos); - if (!rc) { - read_unlock_bh(&dn_hash_lock); - } - return rc; -} - -static void *dn_socket_seq_start(struct seq_file *seq, loff_t *pos) -{ - return *pos ? dn_socket_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; -} - -static void *dn_socket_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - void *rc; - - if (v == SEQ_START_TOKEN) { - rc = dn_socket_get_idx(seq, 0); - goto out; - } - - rc = dn_socket_get_next(seq, v); - if (rc) - goto out; - read_unlock_bh(&dn_hash_lock); -out: - ++*pos; - return rc; -} - -static void dn_socket_seq_stop(struct seq_file *seq, void *v) -{ - if (v && v != SEQ_START_TOKEN) - read_unlock_bh(&dn_hash_lock); -} - -#define IS_NOT_PRINTABLE(x) ((x) < 32 || (x) > 126) - -static void dn_printable_object(struct sockaddr_dn *dn, unsigned char *buf) -{ - int i; - - switch (le16_to_cpu(dn->sdn_objnamel)) { - case 0: - sprintf(buf, "%d", dn->sdn_objnum); - break; - default: - for (i = 0; i < le16_to_cpu(dn->sdn_objnamel); i++) { - buf[i] = dn->sdn_objname[i]; - if (IS_NOT_PRINTABLE(buf[i])) - buf[i] = '.'; - } - buf[i] = 0; - } -} - -static char *dn_state2asc(unsigned char state) -{ - switch (state) { - case DN_O: - return "OPEN"; - case DN_CR: - return " CR"; - case DN_DR: - return " DR"; - case DN_DRC: - return " DRC"; - case DN_CC: - return " CC"; - case DN_CI: - return " CI"; - case DN_NR: - return " NR"; - case DN_NC: - return " NC"; - case DN_CD: - return " CD"; - case DN_RJ: - return " RJ"; - case DN_RUN: - return " RUN"; - case DN_DI: - return " DI"; - case DN_DIC: - return " DIC"; - case DN_DN: - return " DN"; - case DN_CL: - return " CL"; - case DN_CN: - return " CN"; - } - - return "????"; -} - -static inline void dn_socket_format_entry(struct seq_file *seq, struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - char buf1[DN_ASCBUF_LEN]; - char buf2[DN_ASCBUF_LEN]; - char local_object[DN_MAXOBJL+3]; - char remote_object[DN_MAXOBJL+3]; - - dn_printable_object(&scp->addr, local_object); - dn_printable_object(&scp->peer, remote_object); - - seq_printf(seq, - "%6s/%04X %04d:%04d %04d:%04d %01d %-16s " - "%6s/%04X %04d:%04d %04d:%04d %01d %-16s %4s %s\n", - dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->addr)), buf1), - scp->addrloc, - scp->numdat, - scp->numoth, - scp->ackxmt_dat, - scp->ackxmt_oth, - scp->flowloc_sw, - local_object, - dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->peer)), buf2), - scp->addrrem, - scp->numdat_rcv, - scp->numoth_rcv, - scp->ackrcv_dat, - scp->ackrcv_oth, - scp->flowrem_sw, - remote_object, - dn_state2asc(scp->state), - ((scp->accept_mode == ACC_IMMED) ? "IMMED" : "DEFER")); -} - -static int dn_socket_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) { - seq_puts(seq, "Local Remote\n"); - } else { - dn_socket_format_entry(seq, v); - } - return 0; -} - -static const struct seq_operations dn_socket_seq_ops = { - .start = dn_socket_seq_start, - .next = dn_socket_seq_next, - .stop = dn_socket_seq_stop, - .show = dn_socket_seq_show, -}; -#endif - -static const struct net_proto_family dn_family_ops = { - .family = AF_DECnet, - .create = dn_create, - .owner = THIS_MODULE, -}; - -static const struct proto_ops dn_proto_ops = { - .family = AF_DECnet, - .owner = THIS_MODULE, - .release = dn_release, - .bind = dn_bind, - .connect = dn_connect, - .socketpair = sock_no_socketpair, - .accept = dn_accept, - .getname = dn_getname, - .poll = dn_poll, - .ioctl = dn_ioctl, - .listen = dn_listen, - .shutdown = dn_shutdown, - .setsockopt = dn_setsockopt, - .getsockopt = dn_getsockopt, - .sendmsg = dn_sendmsg, - .recvmsg = dn_recvmsg, - .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, -}; - -MODULE_DESCRIPTION("The Linux DECnet Network Protocol"); -MODULE_AUTHOR("Linux DECnet Project Team"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NETPROTO(PF_DECnet); - -static const char banner[] __initconst = KERN_INFO -"NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; - -static int __init decnet_init(void) -{ - int rc; - - printk(banner); - - rc = proto_register(&dn_proto, 1); - if (rc != 0) - goto out; - - dn_neigh_init(); - dn_dev_init(); - dn_route_init(); - dn_fib_init(); - - sock_register(&dn_family_ops); - dev_add_pack(&dn_dix_packet_type); - register_netdevice_notifier(&dn_dev_notifier); - - proc_create_seq_private("decnet", 0444, init_net.proc_net, - &dn_socket_seq_ops, sizeof(struct dn_iter_state), - NULL); - dn_register_sysctl(); -out: - return rc; - -} -module_init(decnet_init); - -/* - * Prevent DECnet module unloading until its fixed properly. - * Requires an audit of the code to check for memory leaks and - * initialisation problems etc. - */ -#if 0 -static void __exit decnet_exit(void) -{ - sock_unregister(AF_DECnet); - rtnl_unregister_all(PF_DECnet); - dev_remove_pack(&dn_dix_packet_type); - - dn_unregister_sysctl(); - - unregister_netdevice_notifier(&dn_dev_notifier); - - dn_route_cleanup(); - dn_dev_cleanup(); - dn_neigh_cleanup(); - dn_fib_cleanup(); - - remove_proc_entry("decnet", init_net.proc_net); - - proto_unregister(&dn_proto); - - rcu_barrier(); /* Wait for completion of call_rcu()'s */ -} -module_exit(decnet_exit); -#endif diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c deleted file mode 100644 index cca7ae712995..000000000000 --- a/net/decnet/dn_dev.c +++ /dev/null @@ -1,1438 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Device Layer - * - * Authors: Steve Whitehouse <SteveW@ACM.org> - * Eduardo Marcelo Serrat <emserrat@geocities.com> - * - * Changes: - * Steve Whitehouse : Devices now see incoming frames so they - * can mark on who it came from. - * Steve Whitehouse : Fixed bug in creating neighbours. Each neighbour - * can now have a device specific setup func. - * Steve Whitehouse : Added /proc/sys/net/decnet/conf/<dev>/ - * Steve Whitehouse : Fixed bug which sometimes killed timer - * Steve Whitehouse : Multiple ifaddr support - * Steve Whitehouse : SIOCGIFCONF is now a compile time option - * Steve Whitehouse : /proc/sys/net/decnet/conf/<sys>/forwarding - * Steve Whitehouse : Removed timer1 - it's a user space issue now - * Patrick Caulfield : Fixed router hello message format - * Steve Whitehouse : Got rid of constant sizes for blksize for - * devices. All mtu based now. - */ - -#include <linux/capability.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/timer.h> -#include <linux/string.h> -#include <linux/if_addr.h> -#include <linux/if_arp.h> -#include <linux/if_ether.h> -#include <linux/skbuff.h> -#include <linux/sysctl.h> -#include <linux/notifier.h> -#include <linux/slab.h> -#include <linux/jiffies.h> -#include <linux/uaccess.h> -#include <net/net_namespace.h> -#include <net/neighbour.h> -#include <net/dst.h> -#include <net/flow.h> -#include <net/fib_rules.h> -#include <net/netlink.h> -#include <net/dn.h> -#include <net/dn_dev.h> -#include <net/dn_route.h> -#include <net/dn_neigh.h> -#include <net/dn_fib.h> - -#define DN_IFREQ_SIZE (offsetof(struct ifreq, ifr_ifru) + sizeof(struct sockaddr_dn)) - -static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00}; -static char dn_rt_all_rt_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x03,0x00,0x00}; -static char dn_hiord[ETH_ALEN] = {0xAA,0x00,0x04,0x00,0x00,0x00}; -static unsigned char dn_eco_version[3] = {0x02,0x00,0x00}; - -extern struct neigh_table dn_neigh_table; - -/* - * decnet_address is kept in network order. - */ -__le16 decnet_address = 0; - -static DEFINE_SPINLOCK(dndev_lock); -static struct net_device *decnet_default_device; -static BLOCKING_NOTIFIER_HEAD(dnaddr_chain); - -static struct dn_dev *dn_dev_create(struct net_device *dev, int *err); -static void dn_dev_delete(struct net_device *dev); -static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa); - -static int dn_eth_up(struct net_device *); -static void dn_eth_down(struct net_device *); -static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa); -static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa); - -static struct dn_dev_parms dn_dev_list[] = { -{ - .type = ARPHRD_ETHER, /* Ethernet */ - .mode = DN_DEV_BCAST, - .state = DN_DEV_S_RU, - .t2 = 1, - .t3 = 10, - .name = "ethernet", - .up = dn_eth_up, - .down = dn_eth_down, - .timer3 = dn_send_brd_hello, -}, -{ - .type = ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */ - .mode = DN_DEV_BCAST, - .state = DN_DEV_S_RU, - .t2 = 1, - .t3 = 10, - .name = "ipgre", - .timer3 = dn_send_brd_hello, -}, -#if 0 -{ - .type = ARPHRD_X25, /* Bog standard X.25 */ - .mode = DN_DEV_UCAST, - .state = DN_DEV_S_DS, - .t2 = 1, - .t3 = 120, - .name = "x25", - .timer3 = dn_send_ptp_hello, -}, -#endif -#if 0 -{ - .type = ARPHRD_PPP, /* DECnet over PPP */ - .mode = DN_DEV_BCAST, - .state = DN_DEV_S_RU, - .t2 = 1, - .t3 = 10, - .name = "ppp", - .timer3 = dn_send_brd_hello, -}, -#endif -{ - .type = ARPHRD_DDCMP, /* DECnet over DDCMP */ - .mode = DN_DEV_UCAST, - .state = DN_DEV_S_DS, - .t2 = 1, - .t3 = 120, - .name = "ddcmp", - .timer3 = dn_send_ptp_hello, -}, -{ - .type = ARPHRD_LOOPBACK, /* Loopback interface - always last */ - .mode = DN_DEV_BCAST, - .state = DN_DEV_S_RU, - .t2 = 1, - .t3 = 10, - .name = "loopback", - .timer3 = dn_send_brd_hello, -} -}; - -#define DN_DEV_LIST_SIZE ARRAY_SIZE(dn_dev_list) - -#define DN_DEV_PARMS_OFFSET(x) offsetof(struct dn_dev_parms, x) - -#ifdef CONFIG_SYSCTL - -static int min_t2[] = { 1 }; -static int max_t2[] = { 60 }; /* No max specified, but this seems sensible */ -static int min_t3[] = { 1 }; -static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MULT or T3MULT */ - -static int min_priority[1]; -static int max_priority[] = { 127 }; /* From DECnet spec */ - -static int dn_forwarding_proc(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -static struct dn_dev_sysctl_table { - struct ctl_table_header *sysctl_header; - struct ctl_table dn_dev_vars[5]; -} dn_dev_sysctl = { - NULL, - { - { - .procname = "forwarding", - .data = (void *)DN_DEV_PARMS_OFFSET(forwarding), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = dn_forwarding_proc, - }, - { - .procname = "priority", - .data = (void *)DN_DEV_PARMS_OFFSET(priority), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_priority, - .extra2 = &max_priority - }, - { - .procname = "t2", - .data = (void *)DN_DEV_PARMS_OFFSET(t2), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_t2, - .extra2 = &max_t2 - }, - { - .procname = "t3", - .data = (void *)DN_DEV_PARMS_OFFSET(t3), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_t3, - .extra2 = &max_t3 - }, - { } - }, -}; - -static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) -{ - struct dn_dev_sysctl_table *t; - int i; - - char path[sizeof("net/decnet/conf/") + IFNAMSIZ]; - - t = kmemdup(&dn_dev_sysctl, sizeof(*t), GFP_KERNEL); - if (t == NULL) - return; - - for(i = 0; i < ARRAY_SIZE(t->dn_dev_vars) - 1; i++) { - long offset = (long)t->dn_dev_vars[i].data; - t->dn_dev_vars[i].data = ((char *)parms) + offset; - } - - snprintf(path, sizeof(path), "net/decnet/conf/%s", - dev? dev->name : parms->name); - - t->dn_dev_vars[0].extra1 = (void *)dev; - - t->sysctl_header = register_net_sysctl(&init_net, path, t->dn_dev_vars); - if (t->sysctl_header == NULL) - kfree(t); - else - parms->sysctl = t; -} - -static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) -{ - if (parms->sysctl) { - struct dn_dev_sysctl_table *t = parms->sysctl; - parms->sysctl = NULL; - unregister_net_sysctl_table(t->sysctl_header); - kfree(t); - } -} - -static int dn_forwarding_proc(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ -#ifdef CONFIG_DECNET_ROUTER - struct net_device *dev = table->extra1; - struct dn_dev *dn_db; - int err; - int tmp, old; - - if (table->extra1 == NULL) - return -EINVAL; - - dn_db = rcu_dereference_raw(dev->dn_ptr); - old = dn_db->parms.forwarding; - - err = proc_dointvec(table, write, buffer, lenp, ppos); - - if ((err >= 0) && write) { - if (dn_db->parms.forwarding < 0) - dn_db->parms.forwarding = 0; - if (dn_db->parms.forwarding > 2) - dn_db->parms.forwarding = 2; - /* - * What an ugly hack this is... its works, just. It - * would be nice if sysctl/proc were just that little - * bit more flexible so I don't have to write a special - * routine, or suffer hacks like this - SJW - */ - tmp = dn_db->parms.forwarding; - dn_db->parms.forwarding = old; - if (dn_db->parms.down) - dn_db->parms.down(dev); - dn_db->parms.forwarding = tmp; - if (dn_db->parms.up) - dn_db->parms.up(dev); - } - - return err; -#else - return -EINVAL; -#endif -} - -#else /* CONFIG_SYSCTL */ -static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) -{ -} -static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) -{ -} - -#endif /* CONFIG_SYSCTL */ - -static inline __u16 mtu2blksize(struct net_device *dev) -{ - u32 blksize = dev->mtu; - if (blksize > 0xffff) - blksize = 0xffff; - - if (dev->type == ARPHRD_ETHER || - dev->type == ARPHRD_PPP || - dev->type == ARPHRD_IPGRE || - dev->type == ARPHRD_LOOPBACK) - blksize -= 2; - - return (__u16)blksize; -} - -static struct dn_ifaddr *dn_dev_alloc_ifa(void) -{ - struct dn_ifaddr *ifa; - - ifa = kzalloc(sizeof(*ifa), GFP_KERNEL); - - return ifa; -} - -static void dn_dev_free_ifa(struct dn_ifaddr *ifa) -{ - kfree_rcu(ifa, rcu); -} - -static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr __rcu **ifap, int destroy) -{ - struct dn_ifaddr *ifa1 = rtnl_dereference(*ifap); - unsigned char mac_addr[6]; - struct net_device *dev = dn_db->dev; - - ASSERT_RTNL(); - - *ifap = ifa1->ifa_next; - - if (dn_db->dev->type == ARPHRD_ETHER) { - if (ifa1->ifa_local != dn_eth2dn(dev->dev_addr)) { - dn_dn2eth(mac_addr, ifa1->ifa_local); - dev_mc_del(dev, mac_addr); - } - } - - dn_ifaddr_notify(RTM_DELADDR, ifa1); - blocking_notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1); - if (destroy) { - dn_dev_free_ifa(ifa1); - - if (dn_db->ifa_list == NULL) - dn_dev_delete(dn_db->dev); - } -} - -static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa) -{ - struct net_device *dev = dn_db->dev; - struct dn_ifaddr *ifa1; - unsigned char mac_addr[6]; - - ASSERT_RTNL(); - - /* Check for duplicates */ - for (ifa1 = rtnl_dereference(dn_db->ifa_list); - ifa1 != NULL; - ifa1 = rtnl_dereference(ifa1->ifa_next)) { - if (ifa1->ifa_local == ifa->ifa_local) - return -EEXIST; - } - - if (dev->type == ARPHRD_ETHER) { - if (ifa->ifa_local != dn_eth2dn(dev->dev_addr)) { - dn_dn2eth(mac_addr, ifa->ifa_local); - dev_mc_add(dev, mac_addr); - } - } - - ifa->ifa_next = dn_db->ifa_list; - rcu_assign_pointer(dn_db->ifa_list, ifa); - - dn_ifaddr_notify(RTM_NEWADDR, ifa); - blocking_notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa); - - return 0; -} - -static int dn_dev_set_ifa(struct net_device *dev, struct dn_ifaddr *ifa) -{ - struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); - int rv; - - if (dn_db == NULL) { - int err; - dn_db = dn_dev_create(dev, &err); - if (dn_db == NULL) - return err; - } - - ifa->ifa_dev = dn_db; - - if (dev->flags & IFF_LOOPBACK) - ifa->ifa_scope = RT_SCOPE_HOST; - - rv = dn_dev_insert_ifa(dn_db, ifa); - if (rv) - dn_dev_free_ifa(ifa); - return rv; -} - - -int dn_dev_ioctl(unsigned int cmd, void __user *arg) -{ - char buffer[DN_IFREQ_SIZE]; - struct ifreq *ifr = (struct ifreq *)buffer; - struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr; - struct dn_dev *dn_db; - struct net_device *dev; - struct dn_ifaddr *ifa = NULL; - struct dn_ifaddr __rcu **ifap = NULL; - int ret = 0; - - if (copy_from_user(ifr, arg, DN_IFREQ_SIZE)) - return -EFAULT; - ifr->ifr_name[IFNAMSIZ-1] = 0; - - dev_load(&init_net, ifr->ifr_name); - - switch (cmd) { - case SIOCGIFADDR: - break; - case SIOCSIFADDR: - if (!capable(CAP_NET_ADMIN)) - return -EACCES; - if (sdn->sdn_family != AF_DECnet) - return -EINVAL; - break; - default: - return -EINVAL; - } - - rtnl_lock(); - - if ((dev = __dev_get_by_name(&init_net, ifr->ifr_name)) == NULL) { - ret = -ENODEV; - goto done; - } - - if ((dn_db = rtnl_dereference(dev->dn_ptr)) != NULL) { - for (ifap = &dn_db->ifa_list; - (ifa = rtnl_dereference(*ifap)) != NULL; - ifap = &ifa->ifa_next) - if (strcmp(ifr->ifr_name, ifa->ifa_label) == 0) - break; - } - - if (ifa == NULL && cmd != SIOCSIFADDR) { - ret = -EADDRNOTAVAIL; - goto done; - } - - switch (cmd) { - case SIOCGIFADDR: - *((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local; - goto rarok; - - case SIOCSIFADDR: - if (!ifa) { - if ((ifa = dn_dev_alloc_ifa()) == NULL) { - ret = -ENOBUFS; - break; - } - memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); - } else { - if (ifa->ifa_local == dn_saddr2dn(sdn)) - break; - dn_dev_del_ifa(dn_db, ifap, 0); - } - - ifa->ifa_local = ifa->ifa_address = dn_saddr2dn(sdn); - - ret = dn_dev_set_ifa(dev, ifa); - } -done: - rtnl_unlock(); - - return ret; -rarok: - if (copy_to_user(arg, ifr, DN_IFREQ_SIZE)) - ret = -EFAULT; - goto done; -} - -struct net_device *dn_dev_get_default(void) -{ - struct net_device *dev; - - spin_lock(&dndev_lock); - dev = decnet_default_device; - if (dev) { - if (dev->dn_ptr) - dev_hold(dev); - else - dev = NULL; - } - spin_unlock(&dndev_lock); - - return dev; -} - -int dn_dev_set_default(struct net_device *dev, int force) -{ - struct net_device *old = NULL; - int rv = -EBUSY; - if (!dev->dn_ptr) - return -ENODEV; - - spin_lock(&dndev_lock); - if (force || decnet_default_device == NULL) { - old = decnet_default_device; - decnet_default_device = dev; - rv = 0; - } - spin_unlock(&dndev_lock); - - if (old) - dev_put(old); - return rv; -} - -static void dn_dev_check_default(struct net_device *dev) -{ - spin_lock(&dndev_lock); - if (dev == decnet_default_device) { - decnet_default_device = NULL; - } else { - dev = NULL; - } - spin_unlock(&dndev_lock); - - if (dev) - dev_put(dev); -} - -/* - * Called with RTNL - */ -static struct dn_dev *dn_dev_by_index(int ifindex) -{ - struct net_device *dev; - struct dn_dev *dn_dev = NULL; - - dev = __dev_get_by_index(&init_net, ifindex); - if (dev) - dn_dev = rtnl_dereference(dev->dn_ptr); - - return dn_dev; -} - -static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = { - [IFA_ADDRESS] = { .type = NLA_U16 }, - [IFA_LOCAL] = { .type = NLA_U16 }, - [IFA_LABEL] = { .type = NLA_STRING, - .len = IFNAMSIZ - 1 }, - [IFA_FLAGS] = { .type = NLA_U32 }, -}; - -static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(skb->sk); - struct nlattr *tb[IFA_MAX+1]; - struct dn_dev *dn_db; - struct ifaddrmsg *ifm; - struct dn_ifaddr *ifa; - struct dn_ifaddr __rcu **ifap; - int err = -EINVAL; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if (!net_eq(net, &init_net)) - goto errout; - - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, - dn_ifa_policy, extack); - if (err < 0) - goto errout; - - err = -ENODEV; - ifm = nlmsg_data(nlh); - if ((dn_db = dn_dev_by_index(ifm->ifa_index)) == NULL) - goto errout; - - err = -EADDRNOTAVAIL; - for (ifap = &dn_db->ifa_list; - (ifa = rtnl_dereference(*ifap)) != NULL; - ifap = &ifa->ifa_next) { - if (tb[IFA_LOCAL] && - nla_memcmp(tb[IFA_LOCAL], &ifa->ifa_local, 2)) - continue; - - if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) - continue; - - dn_dev_del_ifa(dn_db, ifap, 1); - return 0; - } - -errout: - return err; -} - -static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(skb->sk); - struct nlattr *tb[IFA_MAX+1]; - struct net_device *dev; - struct dn_dev *dn_db; - struct ifaddrmsg *ifm; - struct dn_ifaddr *ifa; - int err; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if (!net_eq(net, &init_net)) - return -EINVAL; - - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, - dn_ifa_policy, extack); - if (err < 0) - return err; - - if (tb[IFA_LOCAL] == NULL) - return -EINVAL; - - ifm = nlmsg_data(nlh); - if ((dev = __dev_get_by_index(&init_net, ifm->ifa_index)) == NULL) - return -ENODEV; - - if ((dn_db = rtnl_dereference(dev->dn_ptr)) == NULL) { - dn_db = dn_dev_create(dev, &err); - if (!dn_db) - return err; - } - - if ((ifa = dn_dev_alloc_ifa()) == NULL) - return -ENOBUFS; - - if (tb[IFA_ADDRESS] == NULL) - tb[IFA_ADDRESS] = tb[IFA_LOCAL]; - - ifa->ifa_local = nla_get_le16(tb[IFA_LOCAL]); - ifa->ifa_address = nla_get_le16(tb[IFA_ADDRESS]); - ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : - ifm->ifa_flags; - ifa->ifa_scope = ifm->ifa_scope; - ifa->ifa_dev = dn_db; - - if (tb[IFA_LABEL]) - nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); - else - memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); - - err = dn_dev_insert_ifa(dn_db, ifa); - if (err) - dn_dev_free_ifa(ifa); - - return err; -} - -static inline size_t dn_ifaddr_nlmsg_size(void) -{ - return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) - + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ - + nla_total_size(2) /* IFA_ADDRESS */ - + nla_total_size(2) /* IFA_LOCAL */ - + nla_total_size(4); /* IFA_FLAGS */ -} - -static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, - u32 portid, u32 seq, int event, unsigned int flags) -{ - struct ifaddrmsg *ifm; - struct nlmsghdr *nlh; - u32 ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT; - - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); - if (nlh == NULL) - return -EMSGSIZE; - - ifm = nlmsg_data(nlh); - ifm->ifa_family = AF_DECnet; - ifm->ifa_prefixlen = 16; - ifm->ifa_flags = ifa_flags; - ifm->ifa_scope = ifa->ifa_scope; - ifm->ifa_index = ifa->ifa_dev->dev->ifindex; - - if ((ifa->ifa_address && - nla_put_le16(skb, IFA_ADDRESS, ifa->ifa_address)) || - (ifa->ifa_local && - nla_put_le16(skb, IFA_LOCAL, ifa->ifa_local)) || - (ifa->ifa_label[0] && - nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || - nla_put_u32(skb, IFA_FLAGS, ifa_flags)) - goto nla_put_failure; - nlmsg_end(skb, nlh); - return 0; - -nla_put_failure: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - -static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa) -{ - struct sk_buff *skb; - int err = -ENOBUFS; - - skb = alloc_skb(dn_ifaddr_nlmsg_size(), GFP_KERNEL); - if (skb == NULL) - goto errout; - - err = dn_nl_fill_ifaddr(skb, ifa, 0, 0, event, 0); - if (err < 0) { - /* -EMSGSIZE implies BUG in dn_ifaddr_nlmsg_size() */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(skb); - goto errout; - } - rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); - return; -errout: - if (err < 0) - rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err); -} - -static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - int idx, dn_idx = 0, skip_ndevs, skip_naddr; - struct net_device *dev; - struct dn_dev *dn_db; - struct dn_ifaddr *ifa; - - if (!net_eq(net, &init_net)) - return 0; - - skip_ndevs = cb->args[0]; - skip_naddr = cb->args[1]; - - idx = 0; - rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - if (idx < skip_ndevs) - goto cont; - else if (idx > skip_ndevs) { - /* Only skip over addresses for first dev dumped - * in this iteration (idx == skip_ndevs) */ - skip_naddr = 0; - } - - if ((dn_db = rcu_dereference(dev->dn_ptr)) == NULL) - goto cont; - - for (ifa = rcu_dereference(dn_db->ifa_list), dn_idx = 0; ifa; - ifa = rcu_dereference(ifa->ifa_next), dn_idx++) { - if (dn_idx < skip_naddr) - continue; - - if (dn_nl_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWADDR, - NLM_F_MULTI) < 0) - goto done; - } -cont: - idx++; - } -done: - rcu_read_unlock(); - cb->args[0] = idx; - cb->args[1] = dn_idx; - - return skb->len; -} - -static int dn_dev_get_first(struct net_device *dev, __le16 *addr) -{ - struct dn_dev *dn_db; - struct dn_ifaddr *ifa; - int rv = -ENODEV; - - rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); - if (dn_db == NULL) - goto out; - - ifa = rcu_dereference(dn_db->ifa_list); - if (ifa != NULL) { - *addr = ifa->ifa_local; - rv = 0; - } -out: - rcu_read_unlock(); - return rv; -} - -/* - * Find a default address to bind to. - * - * This is one of those areas where the initial VMS concepts don't really - * map onto the Linux concepts, and since we introduced multiple addresses - * per interface we have to cope with slightly odd ways of finding out what - * "our address" really is. Mostly it's not a problem; for this we just guess - * a sensible default. Eventually the routing code will take care of all the - * nasties for us I hope. - */ -int dn_dev_bind_default(__le16 *addr) -{ - struct net_device *dev; - int rv; - dev = dn_dev_get_default(); -last_chance: - if (dev) { - rv = dn_dev_get_first(dev, addr); - dev_put(dev); - if (rv == 0 || dev == init_net.loopback_dev) - return rv; - } - dev = init_net.loopback_dev; - dev_hold(dev); - goto last_chance; -} - -static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa) -{ - struct endnode_hello_message *msg; - struct sk_buff *skb = NULL; - __le16 *pktlen; - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if ((skb = dn_alloc_skb(NULL, sizeof(*msg), GFP_ATOMIC)) == NULL) - return; - - skb->dev = dev; - - msg = skb_put(skb, sizeof(*msg)); - - msg->msgflg = 0x0D; - memcpy(msg->tiver, dn_eco_version, 3); - dn_dn2eth(msg->id, ifa->ifa_local); - msg->iinfo = DN_RT_INFO_ENDN; - msg->blksize = cpu_to_le16(mtu2blksize(dev)); - msg->area = 0x00; - memset(msg->seed, 0, 8); - memcpy(msg->neighbor, dn_hiord, ETH_ALEN); - - if (dn_db->router) { - struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; - dn_dn2eth(msg->neighbor, dn->addr); - } - - msg->timer = cpu_to_le16((unsigned short)dn_db->parms.t3); - msg->mpd = 0x00; - msg->datalen = 0x02; - memset(msg->data, 0xAA, 2); - - pktlen = skb_push(skb, 2); - *pktlen = cpu_to_le16(skb->len - 2); - - skb_reset_network_header(skb); - - dn_rt_finish_output(skb, dn_rt_all_rt_mcast, msg->id); -} - - -#define DRDELAY (5 * HZ) - -static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa) -{ - /* First check time since device went up */ - if (time_before(jiffies, dn_db->uptime + DRDELAY)) - return 0; - - /* If there is no router, then yes... */ - if (!dn_db->router) - return 1; - - /* otherwise only if we have a higher priority or.. */ - if (dn->priority < dn_db->parms.priority) - return 1; - - /* if we have equal priority and a higher node number */ - if (dn->priority != dn_db->parms.priority) - return 0; - - if (le16_to_cpu(dn->addr) < le16_to_cpu(ifa->ifa_local)) - return 1; - - return 0; -} - -static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa) -{ - int n; - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; - struct sk_buff *skb; - size_t size; - unsigned char *ptr; - unsigned char *i1, *i2; - __le16 *pktlen; - char *src; - - if (mtu2blksize(dev) < (26 + 7)) - return; - - n = mtu2blksize(dev) - 26; - n /= 7; - - if (n > 32) - n = 32; - - size = 2 + 26 + 7 * n; - - if ((skb = dn_alloc_skb(NULL, size, GFP_ATOMIC)) == NULL) - return; - - skb->dev = dev; - ptr = skb_put(skb, size); - - *ptr++ = DN_RT_PKT_CNTL | DN_RT_PKT_ERTH; - *ptr++ = 2; /* ECO */ - *ptr++ = 0; - *ptr++ = 0; - dn_dn2eth(ptr, ifa->ifa_local); - src = ptr; - ptr += ETH_ALEN; - *ptr++ = dn_db->parms.forwarding == 1 ? - DN_RT_INFO_L1RT : DN_RT_INFO_L2RT; - *((__le16 *)ptr) = cpu_to_le16(mtu2blksize(dev)); - ptr += 2; - *ptr++ = dn_db->parms.priority; /* Priority */ - *ptr++ = 0; /* Area: Reserved */ - *((__le16 *)ptr) = cpu_to_le16((unsigned short)dn_db->parms.t3); - ptr += 2; - *ptr++ = 0; /* MPD: Reserved */ - i1 = ptr++; - memset(ptr, 0, 7); /* Name: Reserved */ - ptr += 7; - i2 = ptr++; - - n = dn_neigh_elist(dev, ptr, n); - - *i2 = 7 * n; - *i1 = 8 + *i2; - - skb_trim(skb, (27 + *i2)); - - pktlen = skb_push(skb, 2); - *pktlen = cpu_to_le16(skb->len - 2); - - skb_reset_network_header(skb); - - if (dn_am_i_a_router(dn, dn_db, ifa)) { - struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC); - if (skb2) { - dn_rt_finish_output(skb2, dn_rt_all_end_mcast, src); - } - } - - dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src); -} - -static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa) -{ - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if (dn_db->parms.forwarding == 0) - dn_send_endnode_hello(dev, ifa); - else - dn_send_router_hello(dev, ifa); -} - -static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa) -{ - int tdlen = 16; - int size = dev->hard_header_len + 2 + 4 + tdlen; - struct sk_buff *skb = dn_alloc_skb(NULL, size, GFP_ATOMIC); - int i; - unsigned char *ptr; - char src[ETH_ALEN]; - - if (skb == NULL) - return ; - - skb->dev = dev; - skb_push(skb, dev->hard_header_len); - ptr = skb_put(skb, 2 + 4 + tdlen); - - *ptr++ = DN_RT_PKT_HELO; - *((__le16 *)ptr) = ifa->ifa_local; - ptr += 2; - *ptr++ = tdlen; - - for(i = 0; i < tdlen; i++) - *ptr++ = 0252; - - dn_dn2eth(src, ifa->ifa_local); - dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src); -} - -static int dn_eth_up(struct net_device *dev) -{ - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if (dn_db->parms.forwarding == 0) - dev_mc_add(dev, dn_rt_all_end_mcast); - else - dev_mc_add(dev, dn_rt_all_rt_mcast); - - dn_db->use_long = 1; - - return 0; -} - -static void dn_eth_down(struct net_device *dev) -{ - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if (dn_db->parms.forwarding == 0) - dev_mc_del(dev, dn_rt_all_end_mcast); - else - dev_mc_del(dev, dn_rt_all_rt_mcast); -} - -static void dn_dev_set_timer(struct net_device *dev); - -static void dn_dev_timer_func(struct timer_list *t) -{ - struct dn_dev *dn_db = from_timer(dn_db, t, timer); - struct net_device *dev; - struct dn_ifaddr *ifa; - - rcu_read_lock(); - dev = dn_db->dev; - if (dn_db->t3 <= dn_db->parms.t2) { - if (dn_db->parms.timer3) { - for (ifa = rcu_dereference(dn_db->ifa_list); - ifa; - ifa = rcu_dereference(ifa->ifa_next)) { - if (!(ifa->ifa_flags & IFA_F_SECONDARY)) - dn_db->parms.timer3(dev, ifa); - } - } - dn_db->t3 = dn_db->parms.t3; - } else { - dn_db->t3 -= dn_db->parms.t2; - } - rcu_read_unlock(); - dn_dev_set_timer(dev); -} - -static void dn_dev_set_timer(struct net_device *dev) -{ - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if (dn_db->parms.t2 > dn_db->parms.t3) - dn_db->parms.t2 = dn_db->parms.t3; - - dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ); - - add_timer(&dn_db->timer); -} - -static struct dn_dev *dn_dev_create(struct net_device *dev, int *err) -{ - int i; - struct dn_dev_parms *p = dn_dev_list; - struct dn_dev *dn_db; - - for(i = 0; i < DN_DEV_LIST_SIZE; i++, p++) { - if (p->type == dev->type) - break; - } - - *err = -ENODEV; - if (i == DN_DEV_LIST_SIZE) - return NULL; - - *err = -ENOBUFS; - if ((dn_db = kzalloc(sizeof(struct dn_dev), GFP_ATOMIC)) == NULL) - return NULL; - - memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms)); - - rcu_assign_pointer(dev->dn_ptr, dn_db); - dn_db->dev = dev; - timer_setup(&dn_db->timer, dn_dev_timer_func, 0); - - dn_db->uptime = jiffies; - - dn_db->neigh_parms = neigh_parms_alloc(dev, &dn_neigh_table); - if (!dn_db->neigh_parms) { - RCU_INIT_POINTER(dev->dn_ptr, NULL); - kfree(dn_db); - return NULL; - } - - if (dn_db->parms.up) { - if (dn_db->parms.up(dev) < 0) { - neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); - dev->dn_ptr = NULL; - kfree(dn_db); - return NULL; - } - } - - dn_dev_sysctl_register(dev, &dn_db->parms); - - dn_dev_set_timer(dev); - - *err = 0; - return dn_db; -} - - -/* - * This processes a device up event. We only start up - * the loopback device & ethernet devices with correct - * MAC addresses automatically. Others must be started - * specifically. - * - * FIXME: How should we configure the loopback address ? If we could dispense - * with using decnet_address here and for autobind, it will be one less thing - * for users to worry about setting up. - */ - -void dn_dev_up(struct net_device *dev) -{ - struct dn_ifaddr *ifa; - __le16 addr = decnet_address; - int maybe_default = 0; - struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); - - if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) - return; - - /* - * Need to ensure that loopback device has a dn_db attached to it - * to allow creation of neighbours against it, even though it might - * not have a local address of its own. Might as well do the same for - * all autoconfigured interfaces. - */ - if (dn_db == NULL) { - int err; - dn_db = dn_dev_create(dev, &err); - if (dn_db == NULL) - return; - } - - if (dev->type == ARPHRD_ETHER) { - if (memcmp(dev->dev_addr, dn_hiord, 4) != 0) - return; - addr = dn_eth2dn(dev->dev_addr); - maybe_default = 1; - } - - if (addr == 0) - return; - - if ((ifa = dn_dev_alloc_ifa()) == NULL) - return; - - ifa->ifa_local = ifa->ifa_address = addr; - ifa->ifa_flags = 0; - ifa->ifa_scope = RT_SCOPE_UNIVERSE; - strcpy(ifa->ifa_label, dev->name); - - dn_dev_set_ifa(dev, ifa); - - /* - * Automagically set the default device to the first automatically - * configured ethernet card in the system. - */ - if (maybe_default) { - dev_hold(dev); - if (dn_dev_set_default(dev, 0)) - dev_put(dev); - } -} - -static void dn_dev_delete(struct net_device *dev) -{ - struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); - - if (dn_db == NULL) - return; - - del_timer_sync(&dn_db->timer); - dn_dev_sysctl_unregister(&dn_db->parms); - dn_dev_check_default(dev); - neigh_ifdown(&dn_neigh_table, dev); - - if (dn_db->parms.down) - dn_db->parms.down(dev); - - dev->dn_ptr = NULL; - - neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); - neigh_ifdown(&dn_neigh_table, dev); - - if (dn_db->router) - neigh_release(dn_db->router); - if (dn_db->peer) - neigh_release(dn_db->peer); - - kfree(dn_db); -} - -void dn_dev_down(struct net_device *dev) -{ - struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); - struct dn_ifaddr *ifa; - - if (dn_db == NULL) - return; - - while ((ifa = rtnl_dereference(dn_db->ifa_list)) != NULL) { - dn_dev_del_ifa(dn_db, &dn_db->ifa_list, 0); - dn_dev_free_ifa(ifa); - } - - dn_dev_delete(dev); -} - -void dn_dev_init_pkt(struct sk_buff *skb) -{ -} - -void dn_dev_veri_pkt(struct sk_buff *skb) -{ -} - -void dn_dev_hello(struct sk_buff *skb) -{ -} - -void dn_dev_devices_off(void) -{ - struct net_device *dev; - - rtnl_lock(); - for_each_netdev(&init_net, dev) - dn_dev_down(dev); - rtnl_unlock(); - -} - -void dn_dev_devices_on(void) -{ - struct net_device *dev; - - rtnl_lock(); - for_each_netdev(&init_net, dev) { - if (dev->flags & IFF_UP) - dn_dev_up(dev); - } - rtnl_unlock(); -} - -int register_dnaddr_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&dnaddr_chain, nb); -} - -int unregister_dnaddr_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&dnaddr_chain, nb); -} - -#ifdef CONFIG_PROC_FS -static inline int is_dn_dev(struct net_device *dev) -{ - return dev->dn_ptr != NULL; -} - -static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - int i; - struct net_device *dev; - - rcu_read_lock(); - - if (*pos == 0) - return SEQ_START_TOKEN; - - i = 1; - for_each_netdev_rcu(&init_net, dev) { - if (!is_dn_dev(dev)) - continue; - - if (i++ == *pos) - return dev; - } - - return NULL; -} - -static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct net_device *dev; - - ++*pos; - - dev = v; - if (v == SEQ_START_TOKEN) - dev = net_device_entry(&init_net.dev_base_head); - - for_each_netdev_continue_rcu(&init_net, dev) { - if (!is_dn_dev(dev)) - continue; - - return dev; - } - - return NULL; -} - -static void dn_dev_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static char *dn_type2asc(char type) -{ - switch (type) { - case DN_DEV_BCAST: - return "B"; - case DN_DEV_UCAST: - return "U"; - case DN_DEV_MPOINT: - return "M"; - } - - return "?"; -} - -static int dn_dev_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Name Flags T1 Timer1 T3 Timer3 BlkSize Pri State DevType Router Peer\n"); - else { - struct net_device *dev = v; - char peer_buf[DN_ASCBUF_LEN]; - char router_buf[DN_ASCBUF_LEN]; - struct dn_dev *dn_db = rcu_dereference(dev->dn_ptr); - - seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu" - " %04hu %03d %02x %-10s %-7s %-7s\n", - dev->name, - dn_type2asc(dn_db->parms.mode), - 0, 0, - dn_db->t3, dn_db->parms.t3, - mtu2blksize(dev), - dn_db->parms.priority, - dn_db->parms.state, dn_db->parms.name, - dn_db->router ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->router->primary_key), router_buf) : "", - dn_db->peer ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->peer->primary_key), peer_buf) : ""); - } - return 0; -} - -static const struct seq_operations dn_dev_seq_ops = { - .start = dn_dev_seq_start, - .next = dn_dev_seq_next, - .stop = dn_dev_seq_stop, - .show = dn_dev_seq_show, -}; -#endif /* CONFIG_PROC_FS */ - -static int addr[2]; -module_param_array(addr, int, NULL, 0444); -MODULE_PARM_DESC(addr, "The DECnet address of this machine: area,node"); - -void __init dn_dev_init(void) -{ - if (addr[0] > 63 || addr[0] < 0) { - printk(KERN_ERR "DECnet: Area must be between 0 and 63"); - return; - } - - if (addr[1] > 1023 || addr[1] < 0) { - printk(KERN_ERR "DECnet: Node must be between 0 and 1023"); - return; - } - - decnet_address = cpu_to_le16((addr[0] << 10) | addr[1]); - - dn_dev_devices_on(); - - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWADDR, - dn_nl_newaddr, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELADDR, - dn_nl_deladdr, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR, - NULL, dn_nl_dump_ifaddr, 0); - - proc_create_seq("decnet_dev", 0444, init_net.proc_net, &dn_dev_seq_ops); - -#ifdef CONFIG_SYSCTL - { - int i; - for(i = 0; i < DN_DEV_LIST_SIZE; i++) - dn_dev_sysctl_register(NULL, &dn_dev_list[i]); - } -#endif /* CONFIG_SYSCTL */ -} - -void __exit dn_dev_cleanup(void) -{ -#ifdef CONFIG_SYSCTL - { - int i; - for(i = 0; i < DN_DEV_LIST_SIZE; i++) - dn_dev_sysctl_unregister(&dn_dev_list[i]); - } -#endif /* CONFIG_SYSCTL */ - - remove_proc_entry("decnet_dev", init_net.proc_net); - - dn_dev_devices_off(); -} diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c deleted file mode 100644 index 77fbf8e9df4b..000000000000 --- a/net/decnet/dn_fib.c +++ /dev/null @@ -1,799 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Routing Forwarding Information Base (Glue/Info List) - * - * Author: Steve Whitehouse <SteveW@ACM.org> - * - * - * Changes: - * Alexey Kuznetsov : SMP locking changes - * Steve Whitehouse : Rewrote it... Well to be more correct, I - * copied most of it from the ipv4 fib code. - * Steve Whitehouse : Updated it in style and fixed a few bugs - * which were fixed in the ipv4 code since - * this code was copied from it. - * - */ -#include <linux/string.h> -#include <linux/net.h> -#include <linux/socket.h> -#include <linux/slab.h> -#include <linux/sockios.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/netdevice.h> -#include <linux/timer.h> -#include <linux/spinlock.h> -#include <linux/atomic.h> -#include <linux/uaccess.h> -#include <net/neighbour.h> -#include <net/dst.h> -#include <net/flow.h> -#include <net/fib_rules.h> -#include <net/dn.h> -#include <net/dn_route.h> -#include <net/dn_fib.h> -#include <net/dn_neigh.h> -#include <net/dn_dev.h> -#include <net/rtnh.h> - -#define RT_MIN_TABLE 1 - -#define for_fib_info() { struct dn_fib_info *fi;\ - for(fi = dn_fib_info_list; fi; fi = fi->fib_next) -#define endfor_fib_info() } - -#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ - for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) - -#define change_nexthops(fi) { int nhsel; struct dn_fib_nh *nh;\ - for(nhsel = 0, nh = (struct dn_fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) - -#define endfor_nexthops(fi) } - -static DEFINE_SPINLOCK(dn_fib_multipath_lock); -static struct dn_fib_info *dn_fib_info_list; -static DEFINE_SPINLOCK(dn_fib_info_lock); - -static struct -{ - int error; - u8 scope; -} dn_fib_props[RTN_MAX+1] = { - [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE }, - [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE }, - [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST }, - [RTN_BROADCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, - [RTN_ANYCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, - [RTN_MULTICAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, - [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE }, - [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE }, - [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE }, - [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE }, - [RTN_NAT] = { .error = 0, .scope = RT_SCOPE_NOWHERE }, - [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, -}; - -static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force); -static int dn_fib_sync_up(struct net_device *dev); - -void dn_fib_free_info(struct dn_fib_info *fi) -{ - if (fi->fib_dead == 0) { - printk(KERN_DEBUG "DECnet: BUG! Attempt to free alive dn_fib_info\n"); - return; - } - - change_nexthops(fi) { - if (nh->nh_dev) - dev_put(nh->nh_dev); - nh->nh_dev = NULL; - } endfor_nexthops(fi); - kfree(fi); -} - -void dn_fib_release_info(struct dn_fib_info *fi) -{ - spin_lock(&dn_fib_info_lock); - if (fi && --fi->fib_treeref == 0) { - if (fi->fib_next) - fi->fib_next->fib_prev = fi->fib_prev; - if (fi->fib_prev) - fi->fib_prev->fib_next = fi->fib_next; - if (fi == dn_fib_info_list) - dn_fib_info_list = fi->fib_next; - fi->fib_dead = 1; - dn_fib_info_put(fi); - } - spin_unlock(&dn_fib_info_lock); -} - -static inline int dn_fib_nh_comp(const struct dn_fib_info *fi, const struct dn_fib_info *ofi) -{ - const struct dn_fib_nh *onh = ofi->fib_nh; - - for_nexthops(fi) { - if (nh->nh_oif != onh->nh_oif || - nh->nh_gw != onh->nh_gw || - nh->nh_scope != onh->nh_scope || - nh->nh_weight != onh->nh_weight || - ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) - return -1; - onh++; - } endfor_nexthops(fi); - return 0; -} - -static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi) -{ - for_fib_info() { - if (fi->fib_nhs != nfi->fib_nhs) - continue; - if (nfi->fib_protocol == fi->fib_protocol && - nfi->fib_prefsrc == fi->fib_prefsrc && - nfi->fib_priority == fi->fib_priority && - memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 && - ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && - (nfi->fib_nhs == 0 || dn_fib_nh_comp(fi, nfi) == 0)) - return fi; - } endfor_fib_info(); - return NULL; -} - -static int dn_fib_count_nhs(const struct nlattr *attr) -{ - struct rtnexthop *nhp = nla_data(attr); - int nhs = 0, nhlen = nla_len(attr); - - while (rtnh_ok(nhp, nhlen)) { - nhs++; - nhp = rtnh_next(nhp, &nhlen); - } - - /* leftover implies invalid nexthop configuration, discard it */ - return nhlen > 0 ? 0 : nhs; -} - -static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr, - const struct rtmsg *r) -{ - struct rtnexthop *nhp = nla_data(attr); - int nhlen = nla_len(attr); - - change_nexthops(fi) { - int attrlen; - - if (!rtnh_ok(nhp, nhlen)) - return -EINVAL; - - nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; - nh->nh_oif = nhp->rtnh_ifindex; - nh->nh_weight = nhp->rtnh_hops + 1; - - attrlen = rtnh_attrlen(nhp); - if (attrlen > 0) { - struct nlattr *gw_attr; - - gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY); - nh->nh_gw = gw_attr ? nla_get_le16(gw_attr) : 0; - } - - nhp = rtnh_next(nhp, &nhlen); - } endfor_nexthops(fi); - - return 0; -} - - -static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct dn_fib_nh *nh) -{ - int err; - - if (nh->nh_gw) { - struct flowidn fld; - struct dn_fib_res res; - - if (nh->nh_flags&RTNH_F_ONLINK) { - struct net_device *dev; - - if (r->rtm_scope >= RT_SCOPE_LINK) - return -EINVAL; - if (dnet_addr_type(nh->nh_gw) != RTN_UNICAST) - return -EINVAL; - if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL) - return -ENODEV; - if (!(dev->flags&IFF_UP)) - return -ENETDOWN; - nh->nh_dev = dev; - dev_hold(dev); - nh->nh_scope = RT_SCOPE_LINK; - return 0; - } - - memset(&fld, 0, sizeof(fld)); - fld.daddr = nh->nh_gw; - fld.flowidn_oif = nh->nh_oif; - fld.flowidn_scope = r->rtm_scope + 1; - - if (fld.flowidn_scope < RT_SCOPE_LINK) - fld.flowidn_scope = RT_SCOPE_LINK; - - if ((err = dn_fib_lookup(&fld, &res)) != 0) - return err; - - err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) - goto out; - nh->nh_scope = res.scope; - nh->nh_oif = DN_FIB_RES_OIF(res); - nh->nh_dev = DN_FIB_RES_DEV(res); - if (nh->nh_dev == NULL) - goto out; - dev_hold(nh->nh_dev); - err = -ENETDOWN; - if (!(nh->nh_dev->flags & IFF_UP)) - goto out; - err = 0; -out: - dn_fib_res_put(&res); - return err; - } else { - struct net_device *dev; - - if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) - return -EINVAL; - - dev = __dev_get_by_index(&init_net, nh->nh_oif); - if (dev == NULL || dev->dn_ptr == NULL) - return -ENODEV; - if (!(dev->flags&IFF_UP)) - return -ENETDOWN; - nh->nh_dev = dev; - dev_hold(nh->nh_dev); - nh->nh_scope = RT_SCOPE_HOST; - } - - return 0; -} - - -struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *attrs[], - const struct nlmsghdr *nlh, int *errp) -{ - int err; - struct dn_fib_info *fi = NULL; - struct dn_fib_info *ofi; - int nhs = 1; - - if (r->rtm_type > RTN_MAX) - goto err_inval; - - if (dn_fib_props[r->rtm_type].scope > r->rtm_scope) - goto err_inval; - - if (attrs[RTA_MULTIPATH] && - (nhs = dn_fib_count_nhs(attrs[RTA_MULTIPATH])) == 0) - goto err_inval; - - fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); - err = -ENOBUFS; - if (fi == NULL) - goto failure; - - fi->fib_protocol = r->rtm_protocol; - fi->fib_nhs = nhs; - fi->fib_flags = r->rtm_flags; - - if (attrs[RTA_PRIORITY]) - fi->fib_priority = nla_get_u32(attrs[RTA_PRIORITY]); - - if (attrs[RTA_METRICS]) { - struct nlattr *attr; - int rem; - - nla_for_each_nested(attr, attrs[RTA_METRICS], rem) { - int type = nla_type(attr); - - if (type) { - if (type > RTAX_MAX || type == RTAX_CC_ALGO || - nla_len(attr) < 4) - goto err_inval; - - fi->fib_metrics[type-1] = nla_get_u32(attr); - } - } - } - - if (attrs[RTA_PREFSRC]) - fi->fib_prefsrc = nla_get_le16(attrs[RTA_PREFSRC]); - - if (attrs[RTA_MULTIPATH]) { - if ((err = dn_fib_get_nhs(fi, attrs[RTA_MULTIPATH], r)) != 0) - goto failure; - - if (attrs[RTA_OIF] && - fi->fib_nh->nh_oif != nla_get_u32(attrs[RTA_OIF])) - goto err_inval; - - if (attrs[RTA_GATEWAY] && - fi->fib_nh->nh_gw != nla_get_le16(attrs[RTA_GATEWAY])) - goto err_inval; - } else { - struct dn_fib_nh *nh = fi->fib_nh; - - if (attrs[RTA_OIF]) - nh->nh_oif = nla_get_u32(attrs[RTA_OIF]); - - if (attrs[RTA_GATEWAY]) - nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]); - - nh->nh_flags = r->rtm_flags; - nh->nh_weight = 1; - } - - if (r->rtm_type == RTN_NAT) { - if (!attrs[RTA_GATEWAY] || nhs != 1 || attrs[RTA_OIF]) - goto err_inval; - - fi->fib_nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]); - goto link_it; - } - - if (dn_fib_props[r->rtm_type].error) { - if (attrs[RTA_GATEWAY] || attrs[RTA_OIF] || attrs[RTA_MULTIPATH]) - goto err_inval; - - goto link_it; - } - - if (r->rtm_scope > RT_SCOPE_HOST) - goto err_inval; - - if (r->rtm_scope == RT_SCOPE_HOST) { - struct dn_fib_nh *nh = fi->fib_nh; - - /* Local address is added */ - if (nhs != 1 || nh->nh_gw) - goto err_inval; - nh->nh_scope = RT_SCOPE_NOWHERE; - nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif); - err = -ENODEV; - if (nh->nh_dev == NULL) - goto failure; - } else { - change_nexthops(fi) { - if ((err = dn_fib_check_nh(r, fi, nh)) != 0) - goto failure; - } endfor_nexthops(fi) - } - - if (fi->fib_prefsrc) { - if (r->rtm_type != RTN_LOCAL || !attrs[RTA_DST] || - fi->fib_prefsrc != nla_get_le16(attrs[RTA_DST])) - if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) - goto err_inval; - } - -link_it: - if ((ofi = dn_fib_find_info(fi)) != NULL) { - fi->fib_dead = 1; - dn_fib_free_info(fi); - ofi->fib_treeref++; - return ofi; - } - - fi->fib_treeref++; - refcount_set(&fi->fib_clntref, 1); - spin_lock(&dn_fib_info_lock); - fi->fib_next = dn_fib_info_list; - fi->fib_prev = NULL; - if (dn_fib_info_list) - dn_fib_info_list->fib_prev = fi; - dn_fib_info_list = fi; - spin_unlock(&dn_fib_info_lock); - return fi; - -err_inval: - err = -EINVAL; - -failure: - *errp = err; - if (fi) { - fi->fib_dead = 1; - dn_fib_free_info(fi); - } - - return NULL; -} - -int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowidn *fld, struct dn_fib_res *res) -{ - int err = dn_fib_props[type].error; - - if (err == 0) { - if (fi->fib_flags & RTNH_F_DEAD) - return 1; - - res->fi = fi; - - switch (type) { - case RTN_NAT: - DN_FIB_RES_RESET(*res); - refcount_inc(&fi->fib_clntref); - return 0; - case RTN_UNICAST: - case RTN_LOCAL: - for_nexthops(fi) { - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (!fld->flowidn_oif || - fld->flowidn_oif == nh->nh_oif) - break; - } - if (nhsel < fi->fib_nhs) { - res->nh_sel = nhsel; - refcount_inc(&fi->fib_clntref); - return 0; - } - endfor_nexthops(fi); - res->fi = NULL; - return 1; - default: - net_err_ratelimited("DECnet: impossible routing event : dn_fib_semantic_match type=%d\n", - type); - res->fi = NULL; - return -EINVAL; - } - } - return err; -} - -void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res) -{ - struct dn_fib_info *fi = res->fi; - int w; - - spin_lock_bh(&dn_fib_multipath_lock); - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { - power += nh->nh_weight; - nh->nh_power = nh->nh_weight; - } - } endfor_nexthops(fi); - fi->fib_power = power; - if (power < 0) { - spin_unlock_bh(&dn_fib_multipath_lock); - res->nh_sel = 0; - return; - } - } - - w = jiffies % fi->fib_power; - - change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { - if ((w -= nh->nh_power) <= 0) { - nh->nh_power--; - fi->fib_power--; - res->nh_sel = nhsel; - spin_unlock_bh(&dn_fib_multipath_lock); - return; - } - } - } endfor_nexthops(fi); - res->nh_sel = 0; - spin_unlock_bh(&dn_fib_multipath_lock); -} - -static inline u32 rtm_get_table(struct nlattr *attrs[], u8 table) -{ - if (attrs[RTA_TABLE]) - table = nla_get_u32(attrs[RTA_TABLE]); - - return table; -} - -static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(skb->sk); - struct dn_fib_table *tb; - struct rtmsg *r = nlmsg_data(nlh); - struct nlattr *attrs[RTA_MAX+1]; - int err; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if (!net_eq(net, &init_net)) - return -EINVAL; - - err = nlmsg_parse_deprecated(nlh, sizeof(*r), attrs, RTA_MAX, - rtm_dn_policy, extack); - if (err < 0) - return err; - - tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 0); - if (!tb) - return -ESRCH; - - return tb->delete(tb, r, attrs, nlh, &NETLINK_CB(skb)); -} - -static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(skb->sk); - struct dn_fib_table *tb; - struct rtmsg *r = nlmsg_data(nlh); - struct nlattr *attrs[RTA_MAX+1]; - int err; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if (!net_eq(net, &init_net)) - return -EINVAL; - - err = nlmsg_parse_deprecated(nlh, sizeof(*r), attrs, RTA_MAX, - rtm_dn_policy, extack); - if (err < 0) - return err; - - tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 1); - if (!tb) - return -ENOBUFS; - - return tb->insert(tb, r, attrs, nlh, &NETLINK_CB(skb)); -} - -static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa) -{ - struct dn_fib_table *tb; - struct { - struct nlmsghdr nlh; - struct rtmsg rtm; - } req; - struct { - struct nlattr hdr; - __le16 dst; - } dst_attr = { - .dst = dst, - }; - struct { - struct nlattr hdr; - __le16 prefsrc; - } prefsrc_attr = { - .prefsrc = ifa->ifa_local, - }; - struct { - struct nlattr hdr; - u32 oif; - } oif_attr = { - .oif = ifa->ifa_dev->dev->ifindex, - }; - struct nlattr *attrs[RTA_MAX+1] = { - [RTA_DST] = (struct nlattr *) &dst_attr, - [RTA_PREFSRC] = (struct nlattr * ) &prefsrc_attr, - [RTA_OIF] = (struct nlattr *) &oif_attr, - }; - - memset(&req.rtm, 0, sizeof(req.rtm)); - - if (type == RTN_UNICAST) - tb = dn_fib_get_table(RT_MIN_TABLE, 1); - else - tb = dn_fib_get_table(RT_TABLE_LOCAL, 1); - - if (tb == NULL) - return; - - req.nlh.nlmsg_len = sizeof(req); - req.nlh.nlmsg_type = cmd; - req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND; - req.nlh.nlmsg_pid = 0; - req.nlh.nlmsg_seq = 0; - - req.rtm.rtm_dst_len = dst_len; - req.rtm.rtm_table = tb->n; - req.rtm.rtm_protocol = RTPROT_KERNEL; - req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST); - req.rtm.rtm_type = type; - - if (cmd == RTM_NEWROUTE) - tb->insert(tb, &req.rtm, attrs, &req.nlh, NULL); - else - tb->delete(tb, &req.rtm, attrs, &req.nlh, NULL); -} - -static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa) -{ - - fib_magic(RTM_NEWROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa); - -#if 0 - if (!(dev->flags&IFF_UP)) - return; - /* In the future, we will want to add default routes here */ - -#endif -} - -static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa) -{ - int found_it = 0; - struct net_device *dev; - struct dn_dev *dn_db; - struct dn_ifaddr *ifa2; - - ASSERT_RTNL(); - - /* Scan device list */ - rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - dn_db = rcu_dereference(dev->dn_ptr); - if (dn_db == NULL) - continue; - for (ifa2 = rcu_dereference(dn_db->ifa_list); - ifa2 != NULL; - ifa2 = rcu_dereference(ifa2->ifa_next)) { - if (ifa2->ifa_local == ifa->ifa_local) { - found_it = 1; - break; - } - } - } - rcu_read_unlock(); - - if (found_it == 0) { - fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa); - - if (dnet_addr_type(ifa->ifa_local) != RTN_LOCAL) { - if (dn_fib_sync_down(ifa->ifa_local, NULL, 0)) - dn_fib_flush(); - } - } -} - -static void dn_fib_disable_addr(struct net_device *dev, int force) -{ - if (dn_fib_sync_down(0, dev, force)) - dn_fib_flush(); - dn_rt_cache_flush(0); - neigh_ifdown(&dn_neigh_table, dev); -} - -static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct dn_ifaddr *ifa = (struct dn_ifaddr *)ptr; - - switch (event) { - case NETDEV_UP: - dn_fib_add_ifaddr(ifa); - dn_fib_sync_up(ifa->ifa_dev->dev); - dn_rt_cache_flush(-1); - break; - case NETDEV_DOWN: - dn_fib_del_ifaddr(ifa); - if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) { - dn_fib_disable_addr(ifa->ifa_dev->dev, 1); - } else { - dn_rt_cache_flush(-1); - } - break; - } - return NOTIFY_DONE; -} - -static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force) -{ - int ret = 0; - int scope = RT_SCOPE_NOWHERE; - - if (force) - scope = -1; - - for_fib_info() { - /* - * This makes no sense for DECnet.... we will almost - * certainly have more than one local address the same - * over all our interfaces. It needs thinking about - * some more. - */ - if (local && fi->fib_prefsrc == local) { - fi->fib_flags |= RTNH_F_DEAD; - ret++; - } else if (dev && fi->fib_nhs) { - int dead = 0; - - change_nexthops(fi) { - if (nh->nh_flags&RTNH_F_DEAD) - dead++; - else if (nh->nh_dev == dev && - nh->nh_scope != scope) { - spin_lock_bh(&dn_fib_multipath_lock); - nh->nh_flags |= RTNH_F_DEAD; - fi->fib_power -= nh->nh_power; - nh->nh_power = 0; - spin_unlock_bh(&dn_fib_multipath_lock); - dead++; - } - } endfor_nexthops(fi) - if (dead == fi->fib_nhs) { - fi->fib_flags |= RTNH_F_DEAD; - ret++; - } - } - } endfor_fib_info(); - return ret; -} - - -static int dn_fib_sync_up(struct net_device *dev) -{ - int ret = 0; - - if (!(dev->flags&IFF_UP)) - return 0; - - for_fib_info() { - int alive = 0; - - change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { - alive++; - continue; - } - if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) - continue; - if (nh->nh_dev != dev || dev->dn_ptr == NULL) - continue; - alive++; - spin_lock_bh(&dn_fib_multipath_lock); - nh->nh_power = 0; - nh->nh_flags &= ~RTNH_F_DEAD; - spin_unlock_bh(&dn_fib_multipath_lock); - } endfor_nexthops(fi); - - if (alive > 0) { - fi->fib_flags &= ~RTNH_F_DEAD; - ret++; - } - } endfor_fib_info(); - return ret; -} - -static struct notifier_block dn_fib_dnaddr_notifier = { - .notifier_call = dn_fib_dnaddr_event, -}; - -void __exit dn_fib_cleanup(void) -{ - dn_fib_table_cleanup(); - dn_fib_rules_cleanup(); - - unregister_dnaddr_notifier(&dn_fib_dnaddr_notifier); -} - - -void __init dn_fib_init(void) -{ - dn_fib_table_init(); - dn_fib_rules_init(); - - register_dnaddr_notifier(&dn_fib_dnaddr_notifier); - - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWROUTE, - dn_fib_rtm_newroute, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE, - dn_fib_rtm_delroute, NULL, 0); -} diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c deleted file mode 100644 index 94b306f6d551..000000000000 --- a/net/decnet/dn_neigh.c +++ /dev/null @@ -1,605 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Neighbour Functions (Adjacency Database and - * On-Ethernet Cache) - * - * Author: Steve Whitehouse <SteveW@ACM.org> - * - * - * Changes: - * Steve Whitehouse : Fixed router listing routine - * Steve Whitehouse : Added error_report functions - * Steve Whitehouse : Added default router detection - * Steve Whitehouse : Hop counts in outgoing messages - * Steve Whitehouse : Fixed src/dst in outgoing messages so - * forwarding now stands a good chance of - * working. - * Steve Whitehouse : Fixed neighbour states (for now anyway). - * Steve Whitehouse : Made error_report functions dummies. This - * is not the right place to return skbs. - * Steve Whitehouse : Convert to seq_file - * - */ - -#include <linux/net.h> -#include <linux/module.h> -#include <linux/socket.h> -#include <linux/if_arp.h> -#include <linux/slab.h> -#include <linux/if_ether.h> -#include <linux/init.h> -#include <linux/proc_fs.h> -#include <linux/string.h> -#include <linux/netfilter_decnet.h> -#include <linux/spinlock.h> -#include <linux/seq_file.h> -#include <linux/rcupdate.h> -#include <linux/jhash.h> -#include <linux/atomic.h> -#include <net/net_namespace.h> -#include <net/neighbour.h> -#include <net/dst.h> -#include <net/flow.h> -#include <net/dn.h> -#include <net/dn_dev.h> -#include <net/dn_neigh.h> -#include <net/dn_route.h> - -static int dn_neigh_construct(struct neighbour *); -static void dn_neigh_error_report(struct neighbour *, struct sk_buff *); -static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb); - -/* - * Operations for adding the link layer header. - */ -static const struct neigh_ops dn_neigh_ops = { - .family = AF_DECnet, - .error_report = dn_neigh_error_report, - .output = dn_neigh_output, - .connected_output = dn_neigh_output, -}; - -static u32 dn_neigh_hash(const void *pkey, - const struct net_device *dev, - __u32 *hash_rnd) -{ - return jhash_2words(*(__u16 *)pkey, 0, hash_rnd[0]); -} - -static bool dn_key_eq(const struct neighbour *neigh, const void *pkey) -{ - return neigh_key_eq16(neigh, pkey); -} - -struct neigh_table dn_neigh_table = { - .family = PF_DECnet, - .entry_size = NEIGH_ENTRY_SIZE(sizeof(struct dn_neigh)), - .key_len = sizeof(__le16), - .protocol = cpu_to_be16(ETH_P_DNA_RT), - .hash = dn_neigh_hash, - .key_eq = dn_key_eq, - .constructor = dn_neigh_construct, - .id = "dn_neigh_cache", - .parms ={ - .tbl = &dn_neigh_table, - .reachable_time = 30 * HZ, - .data = { - [NEIGH_VAR_MCAST_PROBES] = 0, - [NEIGH_VAR_UCAST_PROBES] = 0, - [NEIGH_VAR_APP_PROBES] = 0, - [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, - [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, - [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, - [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, - [NEIGH_VAR_PROXY_QLEN] = 0, - [NEIGH_VAR_ANYCAST_DELAY] = 0, - [NEIGH_VAR_PROXY_DELAY] = 0, - [NEIGH_VAR_LOCKTIME] = 1 * HZ, - }, - }, - .gc_interval = 30 * HZ, - .gc_thresh1 = 128, - .gc_thresh2 = 512, - .gc_thresh3 = 1024, -}; - -static int dn_neigh_construct(struct neighbour *neigh) -{ - struct net_device *dev = neigh->dev; - struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n); - struct dn_dev *dn_db; - struct neigh_parms *parms; - - rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); - if (dn_db == NULL) { - rcu_read_unlock(); - return -EINVAL; - } - - parms = dn_db->neigh_parms; - if (!parms) { - rcu_read_unlock(); - return -EINVAL; - } - - __neigh_parms_put(neigh->parms); - neigh->parms = neigh_parms_clone(parms); - rcu_read_unlock(); - - neigh->ops = &dn_neigh_ops; - neigh->nud_state = NUD_NOARP; - neigh->output = neigh->ops->connected_output; - - if ((dev->type == ARPHRD_IPGRE) || (dev->flags & IFF_POINTOPOINT)) - memcpy(neigh->ha, dev->broadcast, dev->addr_len); - else if ((dev->type == ARPHRD_ETHER) || (dev->type == ARPHRD_LOOPBACK)) - dn_dn2eth(neigh->ha, dn->addr); - else { - net_dbg_ratelimited("Trying to create neigh for hw %d\n", - dev->type); - return -EINVAL; - } - - /* - * Make an estimate of the remote block size by assuming that its - * two less then the device mtu, which it true for ethernet (and - * other things which support long format headers) since there is - * an extra length field (of 16 bits) which isn't part of the - * ethernet headers and which the DECnet specs won't admit is part - * of the DECnet routing headers either. - * - * If we over estimate here its no big deal, the NSP negotiations - * will prevent us from sending packets which are too large for the - * remote node to handle. In any case this figure is normally updated - * by a hello message in most cases. - */ - dn->blksize = dev->mtu - 2; - - return 0; -} - -static void dn_neigh_error_report(struct neighbour *neigh, struct sk_buff *skb) -{ - printk(KERN_DEBUG "dn_neigh_error_report: called\n"); - kfree_skb(skb); -} - -static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct dn_route *rt = (struct dn_route *)dst; - struct net_device *dev = neigh->dev; - char mac_addr[ETH_ALEN]; - unsigned int seq; - int err; - - dn_dn2eth(mac_addr, rt->rt_local_src); - do { - seq = read_seqbegin(&neigh->ha_lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, mac_addr, skb->len); - } while (read_seqretry(&neigh->ha_lock, seq)); - - if (err >= 0) - err = dev_queue_xmit(skb); - else { - kfree_skb(skb); - err = -EINVAL; - } - return err; -} - -static int dn_neigh_output_packet(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct dn_route *rt = (struct dn_route *)dst; - struct neighbour *neigh = rt->n; - - return neigh->output(neigh, skb); -} - -/* - * For talking to broadcast devices: Ethernet & PPP - */ -static int dn_long_output(struct neighbour *neigh, struct sock *sk, - struct sk_buff *skb) -{ - struct net_device *dev = neigh->dev; - int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3; - unsigned char *data; - struct dn_long_packet *lp; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - - if (skb_headroom(skb) < headroom) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); - if (skb2 == NULL) { - net_crit_ratelimited("dn_long_output: no memory\n"); - kfree_skb(skb); - return -ENOBUFS; - } - consume_skb(skb); - skb = skb2; - net_info_ratelimited("dn_long_output: Increasing headroom\n"); - } - - data = skb_push(skb, sizeof(struct dn_long_packet) + 3); - lp = (struct dn_long_packet *)(data+3); - - *((__le16 *)data) = cpu_to_le16(skb->len - 2); - *(data + 2) = 1 | DN_RT_F_PF; /* Padding */ - - lp->msgflg = DN_RT_PKT_LONG|(cb->rt_flags&(DN_RT_F_IE|DN_RT_F_RQR|DN_RT_F_RTS)); - lp->d_area = lp->d_subarea = 0; - dn_dn2eth(lp->d_id, cb->dst); - lp->s_area = lp->s_subarea = 0; - dn_dn2eth(lp->s_id, cb->src); - lp->nl2 = 0; - lp->visit_ct = cb->hops & 0x3f; - lp->s_class = 0; - lp->pt = 0; - - skb_reset_network_header(skb); - - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, - &init_net, sk, skb, NULL, neigh->dev, - dn_neigh_output_packet); -} - -/* - * For talking to pointopoint and multidrop devices: DDCMP and X.25 - */ -static int dn_short_output(struct neighbour *neigh, struct sock *sk, - struct sk_buff *skb) -{ - struct net_device *dev = neigh->dev; - int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; - struct dn_short_packet *sp; - unsigned char *data; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - - if (skb_headroom(skb) < headroom) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); - if (skb2 == NULL) { - net_crit_ratelimited("dn_short_output: no memory\n"); - kfree_skb(skb); - return -ENOBUFS; - } - consume_skb(skb); - skb = skb2; - net_info_ratelimited("dn_short_output: Increasing headroom\n"); - } - - data = skb_push(skb, sizeof(struct dn_short_packet) + 2); - *((__le16 *)data) = cpu_to_le16(skb->len - 2); - sp = (struct dn_short_packet *)(data+2); - - sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS)); - sp->dstnode = cb->dst; - sp->srcnode = cb->src; - sp->forward = cb->hops & 0x3f; - - skb_reset_network_header(skb); - - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, - &init_net, sk, skb, NULL, neigh->dev, - dn_neigh_output_packet); -} - -/* - * For talking to DECnet phase III nodes - * Phase 3 output is the same as short output, execpt that - * it clears the area bits before transmission. - */ -static int dn_phase3_output(struct neighbour *neigh, struct sock *sk, - struct sk_buff *skb) -{ - struct net_device *dev = neigh->dev; - int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; - struct dn_short_packet *sp; - unsigned char *data; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - if (skb_headroom(skb) < headroom) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); - if (skb2 == NULL) { - net_crit_ratelimited("dn_phase3_output: no memory\n"); - kfree_skb(skb); - return -ENOBUFS; - } - consume_skb(skb); - skb = skb2; - net_info_ratelimited("dn_phase3_output: Increasing headroom\n"); - } - - data = skb_push(skb, sizeof(struct dn_short_packet) + 2); - *((__le16 *)data) = cpu_to_le16(skb->len - 2); - sp = (struct dn_short_packet *)(data + 2); - - sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS)); - sp->dstnode = cb->dst & cpu_to_le16(0x03ff); - sp->srcnode = cb->src & cpu_to_le16(0x03ff); - sp->forward = cb->hops & 0x3f; - - skb_reset_network_header(skb); - - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, - &init_net, sk, skb, NULL, neigh->dev, - dn_neigh_output_packet); -} - -int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct dn_route *rt = (struct dn_route *) dst; - struct neighbour *neigh = rt->n; - struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n); - struct dn_dev *dn_db; - bool use_long; - - rcu_read_lock(); - dn_db = rcu_dereference(neigh->dev->dn_ptr); - if (dn_db == NULL) { - rcu_read_unlock(); - return -EINVAL; - } - use_long = dn_db->use_long; - rcu_read_unlock(); - - if (dn->flags & DN_NDFLAG_P3) - return dn_phase3_output(neigh, sk, skb); - if (use_long) - return dn_long_output(neigh, sk, skb); - else - return dn_short_output(neigh, sk, skb); -} - -/* - * Unfortunately, the neighbour code uses the device in its hash - * function, so we don't get any advantage from it. This function - * basically does a neigh_lookup(), but without comparing the device - * field. This is required for the On-Ethernet cache - */ - -/* - * Pointopoint link receives a hello message - */ -void dn_neigh_pointopoint_hello(struct sk_buff *skb) -{ - kfree_skb(skb); -} - -/* - * Ethernet router hello message received - */ -int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data; - - struct neighbour *neigh; - struct dn_neigh *dn; - struct dn_dev *dn_db; - __le16 src; - - src = dn_eth2dn(msg->id); - - neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); - - dn = container_of(neigh, struct dn_neigh, n); - - if (neigh) { - write_lock(&neigh->lock); - - neigh->used = jiffies; - dn_db = rcu_dereference(neigh->dev->dn_ptr); - - if (!(neigh->nud_state & NUD_PERMANENT)) { - neigh->updated = jiffies; - - if (neigh->dev->type == ARPHRD_ETHER) - memcpy(neigh->ha, ð_hdr(skb)->h_source, ETH_ALEN); - - dn->blksize = le16_to_cpu(msg->blksize); - dn->priority = msg->priority; - - dn->flags &= ~DN_NDFLAG_P3; - - switch (msg->iinfo & DN_RT_INFO_TYPE) { - case DN_RT_INFO_L1RT: - dn->flags &=~DN_NDFLAG_R2; - dn->flags |= DN_NDFLAG_R1; - break; - case DN_RT_INFO_L2RT: - dn->flags |= DN_NDFLAG_R2; - } - } - - /* Only use routers in our area */ - if ((le16_to_cpu(src)>>10) == (le16_to_cpu((decnet_address))>>10)) { - if (!dn_db->router) { - dn_db->router = neigh_clone(neigh); - } else { - if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) - neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); - } - } - write_unlock(&neigh->lock); - neigh_release(neigh); - } - - kfree_skb(skb); - return 0; -} - -/* - * Endnode hello message received - */ -int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data; - struct neighbour *neigh; - struct dn_neigh *dn; - __le16 src; - - src = dn_eth2dn(msg->id); - - neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); - - dn = container_of(neigh, struct dn_neigh, n); - - if (neigh) { - write_lock(&neigh->lock); - - neigh->used = jiffies; - - if (!(neigh->nud_state & NUD_PERMANENT)) { - neigh->updated = jiffies; - - if (neigh->dev->type == ARPHRD_ETHER) - memcpy(neigh->ha, ð_hdr(skb)->h_source, ETH_ALEN); - dn->flags &= ~(DN_NDFLAG_R1 | DN_NDFLAG_R2); - dn->blksize = le16_to_cpu(msg->blksize); - dn->priority = 0; - } - - write_unlock(&neigh->lock); - neigh_release(neigh); - } - - kfree_skb(skb); - return 0; -} - -static char *dn_find_slot(char *base, int max, int priority) -{ - int i; - unsigned char *min = NULL; - - base += 6; /* skip first id */ - - for(i = 0; i < max; i++) { - if (!min || (*base < *min)) - min = base; - base += 7; /* find next priority */ - } - - if (!min) - return NULL; - - return (*min < priority) ? (min - 6) : NULL; -} - -struct elist_cb_state { - struct net_device *dev; - unsigned char *ptr; - unsigned char *rs; - int t, n; -}; - -static void neigh_elist_cb(struct neighbour *neigh, void *_info) -{ - struct elist_cb_state *s = _info; - struct dn_neigh *dn; - - if (neigh->dev != s->dev) - return; - - dn = container_of(neigh, struct dn_neigh, n); - if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2))) - return; - - if (s->t == s->n) - s->rs = dn_find_slot(s->ptr, s->n, dn->priority); - else - s->t++; - if (s->rs == NULL) - return; - - dn_dn2eth(s->rs, dn->addr); - s->rs += 6; - *(s->rs) = neigh->nud_state & NUD_CONNECTED ? 0x80 : 0x0; - *(s->rs) |= dn->priority; - s->rs++; -} - -int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n) -{ - struct elist_cb_state state; - - state.dev = dev; - state.t = 0; - state.n = n; - state.ptr = ptr; - state.rs = ptr; - - neigh_for_each(&dn_neigh_table, neigh_elist_cb, &state); - - return state.t; -} - - -#ifdef CONFIG_PROC_FS - -static inline void dn_neigh_format_entry(struct seq_file *seq, - struct neighbour *n) -{ - struct dn_neigh *dn = container_of(n, struct dn_neigh, n); - char buf[DN_ASCBUF_LEN]; - - read_lock(&n->lock); - seq_printf(seq, "%-7s %s%s%s %02x %02d %07ld %-8s\n", - dn_addr2asc(le16_to_cpu(dn->addr), buf), - (dn->flags&DN_NDFLAG_R1) ? "1" : "-", - (dn->flags&DN_NDFLAG_R2) ? "2" : "-", - (dn->flags&DN_NDFLAG_P3) ? "3" : "-", - dn->n.nud_state, - refcount_read(&dn->n.refcnt), - dn->blksize, - (dn->n.dev) ? dn->n.dev->name : "?"); - read_unlock(&n->lock); -} - -static int dn_neigh_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) { - seq_puts(seq, "Addr Flags State Use Blksize Dev\n"); - } else { - dn_neigh_format_entry(seq, v); - } - - return 0; -} - -static void *dn_neigh_seq_start(struct seq_file *seq, loff_t *pos) -{ - return neigh_seq_start(seq, pos, &dn_neigh_table, - NEIGH_SEQ_NEIGH_ONLY); -} - -static const struct seq_operations dn_neigh_seq_ops = { - .start = dn_neigh_seq_start, - .next = neigh_seq_next, - .stop = neigh_seq_stop, - .show = dn_neigh_seq_show, -}; -#endif - -void __init dn_neigh_init(void) -{ - neigh_table_init(NEIGH_DN_TABLE, &dn_neigh_table); - proc_create_net("decnet_neigh", 0444, init_net.proc_net, - &dn_neigh_seq_ops, sizeof(struct neigh_seq_state)); -} - -void __exit dn_neigh_cleanup(void) -{ - remove_proc_entry("decnet_neigh", init_net.proc_net); - neigh_table_clear(NEIGH_DN_TABLE, &dn_neigh_table); -} diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c deleted file mode 100644 index e4161e0c86aa..000000000000 --- a/net/decnet/dn_nsp_in.c +++ /dev/null @@ -1,906 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Network Services Protocol (Input) - * - * Author: Eduardo Marcelo Serrat <emserrat@geocities.com> - * - * Changes: - * - * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from - * original dn_nsp.c. - * Steve Whitehouse: Updated to work with my new routing architecture. - * Steve Whitehouse: Add changes from Eduardo Serrat's patches. - * Steve Whitehouse: Put all ack handling code in a common routine. - * Steve Whitehouse: Put other common bits into dn_nsp_rx() - * Steve Whitehouse: More checks on skb->len to catch bogus packets - * Fixed various race conditions and possible nasties. - * Steve Whitehouse: Now handles returned conninit frames. - * David S. Miller: New socket locking - * Steve Whitehouse: Fixed lockup when socket filtering was enabled. - * Paul Koning: Fix to push CC sockets into RUN when acks are - * received. - * Steve Whitehouse: - * Patrick Caulfield: Checking conninits for correctness & sending of error - * responses. - * Steve Whitehouse: Added backlog congestion level return codes. - * Patrick Caulfield: - * Steve Whitehouse: Added flow control support (outbound) - * Steve Whitehouse: Prepare for nonlinear skbs - */ - -/****************************************************************************** - (c) 1995-1998 E.M. Serrat emserrat@geocities.com - -*******************************************************************************/ - -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <linux/string.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/inet.h> -#include <linux/route.h> -#include <linux/slab.h> -#include <net/sock.h> -#include <net/tcp_states.h> -#include <linux/fcntl.h> -#include <linux/mm.h> -#include <linux/termios.h> -#include <linux/interrupt.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> -#include <linux/init.h> -#include <linux/poll.h> -#include <linux/netfilter_decnet.h> -#include <net/neighbour.h> -#include <net/dst.h> -#include <net/dn.h> -#include <net/dn_nsp.h> -#include <net/dn_dev.h> -#include <net/dn_route.h> - -extern int decnet_log_martians; - -static void dn_log_martian(struct sk_buff *skb, const char *msg) -{ - if (decnet_log_martians) { - char *devname = skb->dev ? skb->dev->name : "???"; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - net_info_ratelimited("DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n", - msg, devname, - le16_to_cpu(cb->src), - le16_to_cpu(cb->dst), - le16_to_cpu(cb->src_port), - le16_to_cpu(cb->dst_port)); - } -} - -/* - * For this function we've flipped the cross-subchannel bit - * if the message is an otherdata or linkservice message. Thus - * we can use it to work out what to update. - */ -static void dn_ack(struct sock *sk, struct sk_buff *skb, unsigned short ack) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short type = ((ack >> 12) & 0x0003); - int wakeup = 0; - - switch (type) { - case 0: /* ACK - Data */ - if (dn_after(ack, scp->ackrcv_dat)) { - scp->ackrcv_dat = ack & 0x0fff; - wakeup |= dn_nsp_check_xmit_queue(sk, skb, - &scp->data_xmit_queue, - ack); - } - break; - case 1: /* NAK - Data */ - break; - case 2: /* ACK - OtherData */ - if (dn_after(ack, scp->ackrcv_oth)) { - scp->ackrcv_oth = ack & 0x0fff; - wakeup |= dn_nsp_check_xmit_queue(sk, skb, - &scp->other_xmit_queue, - ack); - } - break; - case 3: /* NAK - OtherData */ - break; - } - - if (wakeup && !sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); -} - -/* - * This function is a universal ack processor. - */ -static int dn_process_ack(struct sock *sk, struct sk_buff *skb, int oth) -{ - __le16 *ptr = (__le16 *)skb->data; - int len = 0; - unsigned short ack; - - if (skb->len < 2) - return len; - - if ((ack = le16_to_cpu(*ptr)) & 0x8000) { - skb_pull(skb, 2); - ptr++; - len += 2; - if ((ack & 0x4000) == 0) { - if (oth) - ack ^= 0x2000; - dn_ack(sk, skb, ack); - } - } - - if (skb->len < 2) - return len; - - if ((ack = le16_to_cpu(*ptr)) & 0x8000) { - skb_pull(skb, 2); - len += 2; - if ((ack & 0x4000) == 0) { - if (oth) - ack ^= 0x2000; - dn_ack(sk, skb, ack); - } - } - - return len; -} - - -/** - * dn_check_idf - Check an image data field format is correct. - * @pptr: Pointer to pointer to image data - * @len: Pointer to length of image data - * @max: The maximum allowed length of the data in the image data field - * @follow_on: Check that this many bytes exist beyond the end of the image data - * - * Returns: 0 if ok, -1 on error - */ -static inline int dn_check_idf(unsigned char **pptr, int *len, unsigned char max, unsigned char follow_on) -{ - unsigned char *ptr = *pptr; - unsigned char flen = *ptr++; - - (*len)--; - if (flen > max) - return -1; - if ((flen + follow_on) > *len) - return -1; - - *len -= flen; - *pptr = ptr + flen; - return 0; -} - -/* - * Table of reason codes to pass back to node which sent us a badly - * formed message, plus text messages for the log. A zero entry in - * the reason field means "don't reply" otherwise a disc init is sent with - * the specified reason code. - */ -static struct { - unsigned short reason; - const char *text; -} ci_err_table[] = { - { 0, "CI: Truncated message" }, - { NSP_REASON_ID, "CI: Destination username error" }, - { NSP_REASON_ID, "CI: Destination username type" }, - { NSP_REASON_US, "CI: Source username error" }, - { 0, "CI: Truncated at menuver" }, - { 0, "CI: Truncated before access or user data" }, - { NSP_REASON_IO, "CI: Access data format error" }, - { NSP_REASON_IO, "CI: User data format error" } -}; - -/* - * This function uses a slightly different lookup method - * to find its sockets, since it searches on object name/number - * rather than port numbers. Various tests are done to ensure that - * the incoming data is in the correct format before it is queued to - * a socket. - */ -static struct sock *dn_find_listener(struct sk_buff *skb, unsigned short *reason) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct nsp_conn_init_msg *msg = (struct nsp_conn_init_msg *)skb->data; - struct sockaddr_dn dstaddr; - struct sockaddr_dn srcaddr; - unsigned char type = 0; - int dstlen; - int srclen; - unsigned char *ptr; - int len; - int err = 0; - unsigned char menuver; - - memset(&dstaddr, 0, sizeof(struct sockaddr_dn)); - memset(&srcaddr, 0, sizeof(struct sockaddr_dn)); - - /* - * 1. Decode & remove message header - */ - cb->src_port = msg->srcaddr; - cb->dst_port = msg->dstaddr; - cb->services = msg->services; - cb->info = msg->info; - cb->segsize = le16_to_cpu(msg->segsize); - - if (!pskb_may_pull(skb, sizeof(*msg))) - goto err_out; - - skb_pull(skb, sizeof(*msg)); - - len = skb->len; - ptr = skb->data; - - /* - * 2. Check destination end username format - */ - dstlen = dn_username2sockaddr(ptr, len, &dstaddr, &type); - err++; - if (dstlen < 0) - goto err_out; - - err++; - if (type > 1) - goto err_out; - - len -= dstlen; - ptr += dstlen; - - /* - * 3. Check source end username format - */ - srclen = dn_username2sockaddr(ptr, len, &srcaddr, &type); - err++; - if (srclen < 0) - goto err_out; - - len -= srclen; - ptr += srclen; - err++; - if (len < 1) - goto err_out; - - menuver = *ptr; - ptr++; - len--; - - /* - * 4. Check that optional data actually exists if menuver says it does - */ - err++; - if ((menuver & (DN_MENUVER_ACC | DN_MENUVER_USR)) && (len < 1)) - goto err_out; - - /* - * 5. Check optional access data format - */ - err++; - if (menuver & DN_MENUVER_ACC) { - if (dn_check_idf(&ptr, &len, 39, 1)) - goto err_out; - if (dn_check_idf(&ptr, &len, 39, 1)) - goto err_out; - if (dn_check_idf(&ptr, &len, 39, (menuver & DN_MENUVER_USR) ? 1 : 0)) - goto err_out; - } - - /* - * 6. Check optional user data format - */ - err++; - if (menuver & DN_MENUVER_USR) { - if (dn_check_idf(&ptr, &len, 16, 0)) - goto err_out; - } - - /* - * 7. Look up socket based on destination end username - */ - return dn_sklist_find_listener(&dstaddr); -err_out: - dn_log_martian(skb, ci_err_table[err].text); - *reason = ci_err_table[err].reason; - return NULL; -} - - -static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb) -{ - if (sk_acceptq_is_full(sk)) { - kfree_skb(skb); - return; - } - - sk->sk_ack_backlog++; - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_state_change(sk); -} - -static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct dn_scp *scp = DN_SK(sk); - unsigned char *ptr; - - if (skb->len < 4) - goto out; - - ptr = skb->data; - cb->services = *ptr++; - cb->info = *ptr++; - cb->segsize = le16_to_cpu(*(__le16 *)ptr); - - if ((scp->state == DN_CI) || (scp->state == DN_CD)) { - scp->persist = 0; - scp->addrrem = cb->src_port; - sk->sk_state = TCP_ESTABLISHED; - scp->state = DN_RUN; - scp->services_rem = cb->services; - scp->info_rem = cb->info; - scp->segsize_rem = cb->segsize; - - if ((scp->services_rem & NSP_FC_MASK) == NSP_FC_NONE) - scp->max_window = decnet_no_fc_max_cwnd; - - if (skb->len > 0) { - u16 dlen = *skb->data; - if ((dlen <= 16) && (dlen <= skb->len)) { - scp->conndata_in.opt_optl = cpu_to_le16(dlen); - skb_copy_from_linear_data_offset(skb, 1, - scp->conndata_in.opt_data, dlen); - } - } - dn_nsp_send_link(sk, DN_NOCHANGE, 0); - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - } - -out: - kfree_skb(skb); -} - -static void dn_nsp_conn_ack(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->state == DN_CI) { - scp->state = DN_CD; - scp->persist = 0; - } - - kfree_skb(skb); -} - -static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned short reason; - - if (skb->len < 2) - goto out; - - reason = le16_to_cpu(*(__le16 *)skb->data); - skb_pull(skb, 2); - - scp->discdata_in.opt_status = cpu_to_le16(reason); - scp->discdata_in.opt_optl = 0; - memset(scp->discdata_in.opt_data, 0, 16); - - if (skb->len > 0) { - u16 dlen = *skb->data; - if ((dlen <= 16) && (dlen <= skb->len)) { - scp->discdata_in.opt_optl = cpu_to_le16(dlen); - skb_copy_from_linear_data_offset(skb, 1, scp->discdata_in.opt_data, dlen); - } - } - - scp->addrrem = cb->src_port; - sk->sk_state = TCP_CLOSE; - - switch (scp->state) { - case DN_CI: - case DN_CD: - scp->state = DN_RJ; - sk->sk_err = ECONNREFUSED; - break; - case DN_RUN: - sk->sk_shutdown |= SHUTDOWN_MASK; - scp->state = DN_DN; - break; - case DN_DI: - scp->state = DN_DIC; - break; - } - - if (!sock_flag(sk, SOCK_DEAD)) { - if (sk->sk_socket->state != SS_UNCONNECTED) - sk->sk_socket->state = SS_DISCONNECTING; - sk->sk_state_change(sk); - } - - /* - * It appears that its possible for remote machines to send disc - * init messages with no port identifier if we are in the CI and - * possibly also the CD state. Obviously we shouldn't reply with - * a message if we don't know what the end point is. - */ - if (scp->addrrem) { - dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC); - } - scp->persist_fxn = dn_destroy_timer; - scp->persist = dn_nsp_persist(sk); - -out: - kfree_skb(skb); -} - -/* - * disc_conf messages are also called no_resources or no_link - * messages depending upon the "reason" field. - */ -static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short reason; - - if (skb->len != 2) - goto out; - - reason = le16_to_cpu(*(__le16 *)skb->data); - - sk->sk_state = TCP_CLOSE; - - switch (scp->state) { - case DN_CI: - scp->state = DN_NR; - break; - case DN_DR: - if (reason == NSP_REASON_DC) - scp->state = DN_DRC; - if (reason == NSP_REASON_NL) - scp->state = DN_CN; - break; - case DN_DI: - scp->state = DN_DIC; - break; - case DN_RUN: - sk->sk_shutdown |= SHUTDOWN_MASK; - /* fall through */ - case DN_CC: - scp->state = DN_CN; - } - - if (!sock_flag(sk, SOCK_DEAD)) { - if (sk->sk_socket->state != SS_UNCONNECTED) - sk->sk_socket->state = SS_DISCONNECTING; - sk->sk_state_change(sk); - } - - scp->persist_fxn = dn_destroy_timer; - scp->persist = dn_nsp_persist(sk); - -out: - kfree_skb(skb); -} - -static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short segnum; - unsigned char lsflags; - signed char fcval; - int wake_up = 0; - char *ptr = skb->data; - unsigned char fctype = scp->services_rem & NSP_FC_MASK; - - if (skb->len != 4) - goto out; - - segnum = le16_to_cpu(*(__le16 *)ptr); - ptr += 2; - lsflags = *(unsigned char *)ptr++; - fcval = *ptr; - - /* - * Here we ignore erronous packets which should really - * should cause a connection abort. It is not critical - * for now though. - */ - if (lsflags & 0xf8) - goto out; - - if (seq_next(scp->numoth_rcv, segnum)) { - seq_add(&scp->numoth_rcv, 1); - switch(lsflags & 0x04) { /* FCVAL INT */ - case 0x00: /* Normal Request */ - switch(lsflags & 0x03) { /* FCVAL MOD */ - case 0x00: /* Request count */ - if (fcval < 0) { - unsigned char p_fcval = -fcval; - if ((scp->flowrem_dat > p_fcval) && - (fctype == NSP_FC_SCMC)) { - scp->flowrem_dat -= p_fcval; - } - } else if (fcval > 0) { - scp->flowrem_dat += fcval; - wake_up = 1; - } - break; - case 0x01: /* Stop outgoing data */ - scp->flowrem_sw = DN_DONTSEND; - break; - case 0x02: /* Ok to start again */ - scp->flowrem_sw = DN_SEND; - dn_nsp_output(sk); - wake_up = 1; - } - break; - case 0x04: /* Interrupt Request */ - if (fcval > 0) { - scp->flowrem_oth += fcval; - wake_up = 1; - } - break; - } - if (wake_up && !sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - } - - dn_nsp_send_oth_ack(sk); - -out: - kfree_skb(skb); -} - -/* - * Copy of sock_queue_rcv_skb (from sock.h) without - * bh_lock_sock() (its already held when this is called) which - * also allows data and other data to be queued to a socket. - */ -static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue) -{ - int err; - - /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces - number of warnings when compiling with -W --ANK - */ - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned int)sk->sk_rcvbuf) { - err = -ENOMEM; - goto out; - } - - err = sk_filter(sk, skb); - if (err) - goto out; - - skb_set_owner_r(skb, sk); - skb_queue_tail(queue, skb); - - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); -out: - return err; -} - -static void dn_nsp_otherdata(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short segnum; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - int queued = 0; - - if (skb->len < 2) - goto out; - - cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data); - skb_pull(skb, 2); - - if (seq_next(scp->numoth_rcv, segnum)) { - - if (dn_queue_skb(sk, skb, SIGURG, &scp->other_receive_queue) == 0) { - seq_add(&scp->numoth_rcv, 1); - scp->other_report = 0; - queued = 1; - } - } - - dn_nsp_send_oth_ack(sk); -out: - if (!queued) - kfree_skb(skb); -} - -static void dn_nsp_data(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - unsigned short segnum; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct dn_scp *scp = DN_SK(sk); - - if (skb->len < 2) - goto out; - - cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data); - skb_pull(skb, 2); - - if (seq_next(scp->numdat_rcv, segnum)) { - if (dn_queue_skb(sk, skb, SIGIO, &sk->sk_receive_queue) == 0) { - seq_add(&scp->numdat_rcv, 1); - queued = 1; - } - - if ((scp->flowloc_sw == DN_SEND) && dn_congested(sk)) { - scp->flowloc_sw = DN_DONTSEND; - dn_nsp_send_link(sk, DN_DONTSEND, 0); - } - } - - dn_nsp_send_data_ack(sk); -out: - if (!queued) - kfree_skb(skb); -} - -/* - * If one of our conninit messages is returned, this function - * deals with it. It puts the socket into the NO_COMMUNICATION - * state. - */ -static void dn_returned_conn_init(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->state == DN_CI) { - scp->state = DN_NC; - sk->sk_state = TCP_CLOSE; - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - } - - kfree_skb(skb); -} - -static int dn_nsp_no_socket(struct sk_buff *skb, unsigned short reason) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - int ret = NET_RX_DROP; - - /* Must not reply to returned packets */ - if (cb->rt_flags & DN_RT_F_RTS) - goto out; - - if ((reason != NSP_REASON_OK) && ((cb->nsp_flags & 0x0c) == 0x08)) { - switch (cb->nsp_flags & 0x70) { - case 0x10: - case 0x60: /* (Retransmitted) Connect Init */ - dn_nsp_return_disc(skb, NSP_DISCINIT, reason); - ret = NET_RX_SUCCESS; - break; - case 0x20: /* Connect Confirm */ - dn_nsp_return_disc(skb, NSP_DISCCONF, reason); - ret = NET_RX_SUCCESS; - break; - } - } - -out: - kfree_skb(skb); - return ret; -} - -static int dn_nsp_rx_packet(struct net *net, struct sock *sk2, - struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct sock *sk = NULL; - unsigned char *ptr = (unsigned char *)skb->data; - unsigned short reason = NSP_REASON_NL; - - if (!pskb_may_pull(skb, 2)) - goto free_out; - - skb_reset_transport_header(skb); - cb->nsp_flags = *ptr++; - - if (decnet_debug_level & 2) - printk(KERN_DEBUG "dn_nsp_rx: Message type 0x%02x\n", (int)cb->nsp_flags); - - if (cb->nsp_flags & 0x83) - goto free_out; - - /* - * Filter out conninits and useless packet types - */ - if ((cb->nsp_flags & 0x0c) == 0x08) { - switch (cb->nsp_flags & 0x70) { - case 0x00: /* NOP */ - case 0x70: /* Reserved */ - case 0x50: /* Reserved, Phase II node init */ - goto free_out; - case 0x10: - case 0x60: - if (unlikely(cb->rt_flags & DN_RT_F_RTS)) - goto free_out; - sk = dn_find_listener(skb, &reason); - goto got_it; - } - } - - if (!pskb_may_pull(skb, 3)) - goto free_out; - - /* - * Grab the destination address. - */ - cb->dst_port = *(__le16 *)ptr; - cb->src_port = 0; - ptr += 2; - - /* - * If not a connack, grab the source address too. - */ - if (pskb_may_pull(skb, 5)) { - cb->src_port = *(__le16 *)ptr; - ptr += 2; - skb_pull(skb, 5); - } - - /* - * Returned packets... - * Swap src & dst and look up in the normal way. - */ - if (unlikely(cb->rt_flags & DN_RT_F_RTS)) { - swap(cb->dst_port, cb->src_port); - swap(cb->dst, cb->src); - } - - /* - * Find the socket to which this skb is destined. - */ - sk = dn_find_by_skb(skb); -got_it: - if (sk != NULL) { - struct dn_scp *scp = DN_SK(sk); - - /* Reset backoff */ - scp->nsp_rxtshift = 0; - - /* - * We linearize everything except data segments here. - */ - if (cb->nsp_flags & ~0x60) { - if (unlikely(skb_linearize(skb))) - goto free_out; - } - - return sk_receive_skb(sk, skb, 0); - } - - return dn_nsp_no_socket(skb, reason); - -free_out: - kfree_skb(skb); - return NET_RX_DROP; -} - -int dn_nsp_rx(struct sk_buff *skb) -{ - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, - &init_net, NULL, skb, skb->dev, NULL, - dn_nsp_rx_packet); -} - -/* - * This is the main receive routine for sockets. It is called - * from the above when the socket is not busy, and also from - * sock_release() when there is a backlog queued up. - */ -int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - if (cb->rt_flags & DN_RT_F_RTS) { - if (cb->nsp_flags == 0x18 || cb->nsp_flags == 0x68) - dn_returned_conn_init(sk, skb); - else - kfree_skb(skb); - return NET_RX_SUCCESS; - } - - /* - * Control packet. - */ - if ((cb->nsp_flags & 0x0c) == 0x08) { - switch (cb->nsp_flags & 0x70) { - case 0x10: - case 0x60: - dn_nsp_conn_init(sk, skb); - break; - case 0x20: - dn_nsp_conn_conf(sk, skb); - break; - case 0x30: - dn_nsp_disc_init(sk, skb); - break; - case 0x40: - dn_nsp_disc_conf(sk, skb); - break; - } - - } else if (cb->nsp_flags == 0x24) { - /* - * Special for connacks, 'cos they don't have - * ack data or ack otherdata info. - */ - dn_nsp_conn_ack(sk, skb); - } else { - int other = 1; - - /* both data and ack frames can kick a CC socket into RUN */ - if ((scp->state == DN_CC) && !sock_flag(sk, SOCK_DEAD)) { - scp->state = DN_RUN; - sk->sk_state = TCP_ESTABLISHED; - sk->sk_state_change(sk); - } - - if ((cb->nsp_flags & 0x1c) == 0) - other = 0; - if (cb->nsp_flags == 0x04) - other = 0; - - /* - * Read out ack data here, this applies equally - * to data, other data, link serivce and both - * ack data and ack otherdata. - */ - dn_process_ack(sk, skb, other); - - /* - * If we've some sort of data here then call a - * suitable routine for dealing with it, otherwise - * the packet is an ack and can be discarded. - */ - if ((cb->nsp_flags & 0x0c) == 0) { - - if (scp->state != DN_RUN) - goto free_out; - - switch (cb->nsp_flags) { - case 0x10: /* LS */ - dn_nsp_linkservice(sk, skb); - break; - case 0x30: /* OD */ - dn_nsp_otherdata(sk, skb); - break; - default: - dn_nsp_data(sk, skb); - } - - } else { /* Ack, chuck it out here */ -free_out: - kfree_skb(skb); - } - } - - return NET_RX_SUCCESS; -} diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c deleted file mode 100644 index 00f2ed721ec1..000000000000 --- a/net/decnet/dn_nsp_out.c +++ /dev/null @@ -1,695 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Network Services Protocol (Output) - * - * Author: Eduardo Marcelo Serrat <emserrat@geocities.com> - * - * Changes: - * - * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from - * original dn_nsp.c. - * Steve Whitehouse: Updated to work with my new routing architecture. - * Steve Whitehouse: Added changes from Eduardo Serrat's patches. - * Steve Whitehouse: Now conninits have the "return" bit set. - * Steve Whitehouse: Fixes to check alloc'd skbs are non NULL! - * Moved output state machine into one function - * Steve Whitehouse: New output state machine - * Paul Koning: Connect Confirm message fix. - * Eduardo Serrat: Fix to stop dn_nsp_do_disc() sending malformed packets. - * Steve Whitehouse: dn_nsp_output() and friends needed a spring clean - * Steve Whitehouse: Moved dn_nsp_send() in here from route.h - */ - -/****************************************************************************** - (c) 1995-1998 E.M. Serrat emserrat@geocities.com - -*******************************************************************************/ - -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <linux/string.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/inet.h> -#include <linux/route.h> -#include <linux/slab.h> -#include <net/sock.h> -#include <linux/fcntl.h> -#include <linux/mm.h> -#include <linux/termios.h> -#include <linux/interrupt.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> -#include <linux/init.h> -#include <linux/poll.h> -#include <linux/if_packet.h> -#include <net/neighbour.h> -#include <net/dst.h> -#include <net/flow.h> -#include <net/dn.h> -#include <net/dn_nsp.h> -#include <net/dn_dev.h> -#include <net/dn_route.h> - - -static int nsp_backoff[NSP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; - -static void dn_nsp_send(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - struct dn_scp *scp = DN_SK(sk); - struct dst_entry *dst; - struct flowidn fld; - - skb_reset_transport_header(skb); - scp->stamp = jiffies; - - dst = sk_dst_check(sk, 0); - if (dst) { -try_again: - skb_dst_set(skb, dst); - dst_output(&init_net, skb->sk, skb); - return; - } - - memset(&fld, 0, sizeof(fld)); - fld.flowidn_oif = sk->sk_bound_dev_if; - fld.saddr = dn_saddr2dn(&scp->addr); - fld.daddr = dn_saddr2dn(&scp->peer); - dn_sk_ports_copy(&fld, scp); - fld.flowidn_proto = DNPROTO_NSP; - if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, 0) == 0) { - dst = sk_dst_get(sk); - sk->sk_route_caps = dst->dev->features; - goto try_again; - } - - sk->sk_err = EHOSTUNREACH; - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); -} - - -/* - * If sk == NULL, then we assume that we are supposed to be making - * a routing layer skb. If sk != NULL, then we are supposed to be - * creating an skb for the NSP layer. - * - * The eventual aim is for each socket to have a cached header size - * for its outgoing packets, and to set hdr from this when sk != NULL. - */ -struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri) -{ - struct sk_buff *skb; - int hdr = 64; - - if ((skb = alloc_skb(size + hdr, pri)) == NULL) - return NULL; - - skb->protocol = htons(ETH_P_DNA_RT); - skb->pkt_type = PACKET_OUTGOING; - - if (sk) - skb_set_owner_w(skb, sk); - - skb_reserve(skb, hdr); - - return skb; -} - -/* - * Calculate persist timer based upon the smoothed round - * trip time and the variance. Backoff according to the - * nsp_backoff[] array. - */ -unsigned long dn_nsp_persist(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1; - - t *= nsp_backoff[scp->nsp_rxtshift]; - - if (t < HZ) t = HZ; - if (t > (600*HZ)) t = (600*HZ); - - if (scp->nsp_rxtshift < NSP_MAXRXTSHIFT) - scp->nsp_rxtshift++; - - /* printk(KERN_DEBUG "rxtshift %lu, t=%lu\n", scp->nsp_rxtshift, t); */ - - return t; -} - -/* - * This is called each time we get an estimate for the rtt - * on the link. - */ -static void dn_nsp_rtt(struct sock *sk, long rtt) -{ - struct dn_scp *scp = DN_SK(sk); - long srtt = (long)scp->nsp_srtt; - long rttvar = (long)scp->nsp_rttvar; - long delta; - - /* - * If the jiffies clock flips over in the middle of timestamp - * gathering this value might turn out negative, so we make sure - * that is it always positive here. - */ - if (rtt < 0) - rtt = -rtt; - /* - * Add new rtt to smoothed average - */ - delta = ((rtt << 3) - srtt); - srtt += (delta >> 3); - if (srtt >= 1) - scp->nsp_srtt = (unsigned long)srtt; - else - scp->nsp_srtt = 1; - - /* - * Add new rtt varience to smoothed varience - */ - delta >>= 1; - rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2); - if (rttvar >= 1) - scp->nsp_rttvar = (unsigned long)rttvar; - else - scp->nsp_rttvar = 1; - - /* printk(KERN_DEBUG "srtt=%lu rttvar=%lu\n", scp->nsp_srtt, scp->nsp_rttvar); */ -} - -/** - * dn_nsp_clone_and_send - Send a data packet by cloning it - * @skb: The packet to clone and transmit - * @gfp: memory allocation flag - * - * Clone a queued data or other data packet and transmit it. - * - * Returns: The number of times the packet has been sent previously - */ -static inline unsigned int dn_nsp_clone_and_send(struct sk_buff *skb, - gfp_t gfp) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct sk_buff *skb2; - int ret = 0; - - if ((skb2 = skb_clone(skb, gfp)) != NULL) { - ret = cb->xmit_count; - cb->xmit_count++; - cb->stamp = jiffies; - skb2->sk = skb->sk; - dn_nsp_send(skb2); - } - - return ret; -} - -/** - * dn_nsp_output - Try and send something from socket queues - * @sk: The socket whose queues are to be investigated - * - * Try and send the packet on the end of the data and other data queues. - * Other data gets priority over data, and if we retransmit a packet we - * reduce the window by dividing it in two. - * - */ -void dn_nsp_output(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb; - unsigned int reduce_win = 0; - - /* - * First we check for otherdata/linkservice messages - */ - if ((skb = skb_peek(&scp->other_xmit_queue)) != NULL) - reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC); - - /* - * If we may not send any data, we don't. - * If we are still trying to get some other data down the - * channel, we don't try and send any data. - */ - if (reduce_win || (scp->flowrem_sw != DN_SEND)) - goto recalc_window; - - if ((skb = skb_peek(&scp->data_xmit_queue)) != NULL) - reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC); - - /* - * If we've sent any frame more than once, we cut the - * send window size in half. There is always a minimum - * window size of one available. - */ -recalc_window: - if (reduce_win) { - scp->snd_window >>= 1; - if (scp->snd_window < NSP_MIN_WINDOW) - scp->snd_window = NSP_MIN_WINDOW; - } -} - -int dn_nsp_xmit_timeout(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - dn_nsp_output(sk); - - if (!skb_queue_empty(&scp->data_xmit_queue) || - !skb_queue_empty(&scp->other_xmit_queue)) - scp->persist = dn_nsp_persist(sk); - - return 0; -} - -static inline __le16 *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len) -{ - unsigned char *ptr = skb_push(skb, len); - - BUG_ON(len < 5); - - *ptr++ = msgflag; - *((__le16 *)ptr) = scp->addrrem; - ptr += 2; - *((__le16 *)ptr) = scp->addrloc; - ptr += 2; - return (__le16 __force *)ptr; -} - -static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short acknum = scp->numdat_rcv & 0x0FFF; - unsigned short ackcrs = scp->numoth_rcv & 0x0FFF; - __le16 *ptr; - - BUG_ON(hlen < 9); - - scp->ackxmt_dat = acknum; - scp->ackxmt_oth = ackcrs; - acknum |= 0x8000; - ackcrs |= 0x8000; - - /* If this is an "other data/ack" message, swap acknum and ackcrs */ - if (other) - swap(acknum, ackcrs); - - /* Set "cross subchannel" bit in ackcrs */ - ackcrs |= 0x2000; - - ptr = dn_mk_common_header(scp, skb, msgflag, hlen); - - *ptr++ = cpu_to_le16(acknum); - *ptr++ = cpu_to_le16(ackcrs); - - return ptr; -} - -static __le16 *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth) -{ - struct dn_scp *scp = DN_SK(sk); - struct dn_skb_cb *cb = DN_SKB_CB(skb); - __le16 *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth); - - if (unlikely(oth)) { - cb->segnum = scp->numoth; - seq_add(&scp->numoth, 1); - } else { - cb->segnum = scp->numdat; - seq_add(&scp->numdat, 1); - } - *(ptr++) = cpu_to_le16(cb->segnum); - - return ptr; -} - -void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, - gfp_t gfp, int oth) -{ - struct dn_scp *scp = DN_SK(sk); - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1; - - cb->xmit_count = 0; - dn_nsp_mk_data_header(sk, skb, oth); - - /* - * Slow start: If we have been idle for more than - * one RTT, then reset window to min size. - */ - if ((jiffies - scp->stamp) > t) - scp->snd_window = NSP_MIN_WINDOW; - - if (oth) - skb_queue_tail(&scp->other_xmit_queue, skb); - else - skb_queue_tail(&scp->data_xmit_queue, skb); - - if (scp->flowrem_sw != DN_SEND) - return; - - dn_nsp_clone_and_send(skb, gfp); -} - - -int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *q, unsigned short acknum) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb2, *n, *ack = NULL; - int wakeup = 0; - int try_retrans = 0; - unsigned long reftime = cb->stamp; - unsigned long pkttime; - unsigned short xmit_count; - unsigned short segnum; - - skb_queue_walk_safe(q, skb2, n) { - struct dn_skb_cb *cb2 = DN_SKB_CB(skb2); - - if (dn_before_or_equal(cb2->segnum, acknum)) - ack = skb2; - - /* printk(KERN_DEBUG "ack: %s %04x %04x\n", ack ? "ACK" : "SKIP", (int)cb2->segnum, (int)acknum); */ - - if (ack == NULL) - continue; - - /* printk(KERN_DEBUG "check_xmit_queue: %04x, %d\n", acknum, cb2->xmit_count); */ - - /* Does _last_ packet acked have xmit_count > 1 */ - try_retrans = 0; - /* Remember to wake up the sending process */ - wakeup = 1; - /* Keep various statistics */ - pkttime = cb2->stamp; - xmit_count = cb2->xmit_count; - segnum = cb2->segnum; - /* Remove and drop ack'ed packet */ - skb_unlink(ack, q); - kfree_skb(ack); - ack = NULL; - - /* - * We don't expect to see acknowledgements for packets we - * haven't sent yet. - */ - WARN_ON(xmit_count == 0); - - /* - * If the packet has only been sent once, we can use it - * to calculate the RTT and also open the window a little - * further. - */ - if (xmit_count == 1) { - if (dn_equal(segnum, acknum)) - dn_nsp_rtt(sk, (long)(pkttime - reftime)); - - if (scp->snd_window < scp->max_window) - scp->snd_window++; - } - - /* - * Packet has been sent more than once. If this is the last - * packet to be acknowledged then we want to send the next - * packet in the send queue again (assumes the remote host does - * go-back-N error control). - */ - if (xmit_count > 1) - try_retrans = 1; - } - - if (try_retrans) - dn_nsp_output(sk); - - return wakeup; -} - -void dn_nsp_send_data_ack(struct sock *sk) -{ - struct sk_buff *skb = NULL; - - if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL) - return; - - skb_reserve(skb, 9); - dn_mk_ack_header(sk, skb, 0x04, 9, 0); - dn_nsp_send(skb); -} - -void dn_nsp_send_oth_ack(struct sock *sk) -{ - struct sk_buff *skb = NULL; - - if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL) - return; - - skb_reserve(skb, 9); - dn_mk_ack_header(sk, skb, 0x14, 9, 1); - dn_nsp_send(skb); -} - - -void dn_send_conn_ack (struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb = NULL; - struct nsp_conn_ack_msg *msg; - - if ((skb = dn_alloc_skb(sk, 3, sk->sk_allocation)) == NULL) - return; - - msg = skb_put(skb, 3); - msg->msgflg = 0x24; - msg->dstaddr = scp->addrrem; - - dn_nsp_send(skb); -} - -static int dn_nsp_retrans_conn_conf(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->state == DN_CC) - dn_send_conn_conf(sk, GFP_ATOMIC); - - return 0; -} - -void dn_send_conn_conf(struct sock *sk, gfp_t gfp) -{ - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb = NULL; - struct nsp_conn_init_msg *msg; - __u8 len = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); - - if ((skb = dn_alloc_skb(sk, 50 + len, gfp)) == NULL) - return; - - msg = skb_put(skb, sizeof(*msg)); - msg->msgflg = 0x28; - msg->dstaddr = scp->addrrem; - msg->srcaddr = scp->addrloc; - msg->services = scp->services_loc; - msg->info = scp->info_loc; - msg->segsize = cpu_to_le16(scp->segsize_loc); - - skb_put_u8(skb, len); - - if (len > 0) - skb_put_data(skb, scp->conndata_out.opt_data, len); - - - dn_nsp_send(skb); - - scp->persist = dn_nsp_persist(sk); - scp->persist_fxn = dn_nsp_retrans_conn_conf; -} - - -static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, - unsigned short reason, gfp_t gfp, - struct dst_entry *dst, - int ddl, unsigned char *dd, __le16 rem, __le16 loc) -{ - struct sk_buff *skb = NULL; - int size = 7 + ddl + ((msgflg == NSP_DISCINIT) ? 1 : 0); - unsigned char *msg; - - if ((dst == NULL) || (rem == 0)) { - net_dbg_ratelimited("DECnet: dn_nsp_do_disc: BUG! Please report this to SteveW@ACM.org rem=%u dst=%p\n", - le16_to_cpu(rem), dst); - return; - } - - if ((skb = dn_alloc_skb(sk, size, gfp)) == NULL) - return; - - msg = skb_put(skb, size); - *msg++ = msgflg; - *(__le16 *)msg = rem; - msg += 2; - *(__le16 *)msg = loc; - msg += 2; - *(__le16 *)msg = cpu_to_le16(reason); - msg += 2; - if (msgflg == NSP_DISCINIT) - *msg++ = ddl; - - if (ddl) { - memcpy(msg, dd, ddl); - } - - /* - * This doesn't go via the dn_nsp_send() function since we need - * to be able to send disc packets out which have no socket - * associations. - */ - skb_dst_set(skb, dst_clone(dst)); - dst_output(&init_net, skb->sk, skb); -} - - -void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg, - unsigned short reason, gfp_t gfp) -{ - struct dn_scp *scp = DN_SK(sk); - int ddl = 0; - - if (msgflg == NSP_DISCINIT) - ddl = le16_to_cpu(scp->discdata_out.opt_optl); - - if (reason == 0) - reason = le16_to_cpu(scp->discdata_out.opt_status); - - dn_nsp_do_disc(sk, msgflg, reason, gfp, __sk_dst_get(sk), ddl, - scp->discdata_out.opt_data, scp->addrrem, scp->addrloc); -} - - -void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg, - unsigned short reason) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - int ddl = 0; - gfp_t gfp = GFP_ATOMIC; - - dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb_dst(skb), ddl, - NULL, cb->src_port, cb->dst_port); -} - - -void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval) -{ - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb; - unsigned char *ptr; - gfp_t gfp = GFP_ATOMIC; - - if ((skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, gfp)) == NULL) - return; - - skb_reserve(skb, DN_MAX_NSP_DATA_HEADER); - ptr = skb_put(skb, 2); - DN_SKB_CB(skb)->nsp_flags = 0x10; - *ptr++ = lsflags; - *ptr = fcval; - - dn_nsp_queue_xmit(sk, skb, gfp, 1); - - scp->persist = dn_nsp_persist(sk); - scp->persist_fxn = dn_nsp_xmit_timeout; -} - -static int dn_nsp_retrans_conninit(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->state == DN_CI) - dn_nsp_send_conninit(sk, NSP_RCI); - - return 0; -} - -void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) -{ - struct dn_scp *scp = DN_SK(sk); - struct nsp_conn_init_msg *msg; - unsigned char aux; - unsigned char menuver; - struct dn_skb_cb *cb; - unsigned char type = 1; - gfp_t allocation = (msgflg == NSP_CI) ? sk->sk_allocation : GFP_ATOMIC; - struct sk_buff *skb = dn_alloc_skb(sk, 200, allocation); - - if (!skb) - return; - - cb = DN_SKB_CB(skb); - msg = skb_put(skb, sizeof(*msg)); - - msg->msgflg = msgflg; - msg->dstaddr = 0x0000; /* Remote Node will assign it*/ - - msg->srcaddr = scp->addrloc; - msg->services = scp->services_loc; /* Requested flow control */ - msg->info = scp->info_loc; /* Version Number */ - msg->segsize = cpu_to_le16(scp->segsize_loc); /* Max segment size */ - - if (scp->peer.sdn_objnum) - type = 0; - - skb_put(skb, dn_sockaddr2username(&scp->peer, - skb_tail_pointer(skb), type)); - skb_put(skb, dn_sockaddr2username(&scp->addr, - skb_tail_pointer(skb), 2)); - - menuver = DN_MENUVER_ACC | DN_MENUVER_USR; - if (scp->peer.sdn_flags & SDF_PROXY) - menuver |= DN_MENUVER_PRX; - if (scp->peer.sdn_flags & SDF_UICPROXY) - menuver |= DN_MENUVER_UIC; - - skb_put_u8(skb, menuver); /* Menu Version */ - - aux = scp->accessdata.acc_userl; - skb_put_u8(skb, aux); - if (aux > 0) - skb_put_data(skb, scp->accessdata.acc_user, aux); - - aux = scp->accessdata.acc_passl; - skb_put_u8(skb, aux); - if (aux > 0) - skb_put_data(skb, scp->accessdata.acc_pass, aux); - - aux = scp->accessdata.acc_accl; - skb_put_u8(skb, aux); - if (aux > 0) - skb_put_data(skb, scp->accessdata.acc_acc, aux); - - aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); - skb_put_u8(skb, aux); - if (aux > 0) - skb_put_data(skb, scp->conndata_out.opt_data, aux); - - scp->persist = dn_nsp_persist(sk); - scp->persist_fxn = dn_nsp_retrans_conninit; - - cb->rt_flags = DN_RT_F_RQR; - - dn_nsp_send(skb); -} diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c deleted file mode 100644 index 08c3dc45f1a4..000000000000 --- a/net/decnet/dn_route.c +++ /dev/null @@ -1,1921 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Routing Functions (Endnode and Router) - * - * Authors: Steve Whitehouse <SteveW@ACM.org> - * Eduardo Marcelo Serrat <emserrat@geocities.com> - * - * Changes: - * Steve Whitehouse : Fixes to allow "intra-ethernet" and - * "return-to-sender" bits on outgoing - * packets. - * Steve Whitehouse : Timeouts for cached routes. - * Steve Whitehouse : Use dst cache for input routes too. - * Steve Whitehouse : Fixed error values in dn_send_skb. - * Steve Whitehouse : Rework routing functions to better fit - * DECnet routing design - * Alexey Kuznetsov : New SMP locking - * Steve Whitehouse : More SMP locking changes & dn_cache_dump() - * Steve Whitehouse : Prerouting NF hook, now really is prerouting. - * Fixed possible skb leak in rtnetlink funcs. - * Steve Whitehouse : Dave Miller's dynamic hash table sizing and - * Alexey Kuznetsov's finer grained locking - * from ipv4/route.c. - * Steve Whitehouse : Routing is now starting to look like a - * sensible set of code now, mainly due to - * my copying the IPv4 routing code. The - * hooks here are modified and will continue - * to evolve for a while. - * Steve Whitehouse : Real SMP at last :-) Also new netfilter - * stuff. Look out raw sockets your days - * are numbered! - * Steve Whitehouse : Added return-to-sender functions. Added - * backlog congestion level return codes. - * Steve Whitehouse : Fixed bug where routes were set up with - * no ref count on net devices. - * Steve Whitehouse : RCU for the route cache - * Steve Whitehouse : Preparations for the flow cache - * Steve Whitehouse : Prepare for nonlinear skbs - */ - -/****************************************************************************** - (c) 1995-1998 E.M. Serrat emserrat@geocities.com - -*******************************************************************************/ - -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/kernel.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/inet.h> -#include <linux/route.h> -#include <linux/in_route.h> -#include <linux/slab.h> -#include <net/sock.h> -#include <linux/mm.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/init.h> -#include <linux/rtnetlink.h> -#include <linux/string.h> -#include <linux/netfilter_decnet.h> -#include <linux/rcupdate.h> -#include <linux/times.h> -#include <linux/export.h> -#include <asm/errno.h> -#include <net/net_namespace.h> -#include <net/netlink.h> -#include <net/neighbour.h> -#include <net/dst.h> -#include <net/flow.h> -#include <net/fib_rules.h> -#include <net/dn.h> -#include <net/dn_dev.h> -#include <net/dn_nsp.h> -#include <net/dn_route.h> -#include <net/dn_neigh.h> -#include <net/dn_fib.h> - -struct dn_rt_hash_bucket -{ - struct dn_route __rcu *chain; - spinlock_t lock; -}; - -extern struct neigh_table dn_neigh_table; - - -static unsigned char dn_hiord_addr[6] = {0xAA,0x00,0x04,0x00,0x00,0x00}; - -static const int dn_rt_min_delay = 2 * HZ; -static const int dn_rt_max_delay = 10 * HZ; -static const int dn_rt_mtu_expires = 10 * 60 * HZ; - -static unsigned long dn_rt_deadline; - -static int dn_dst_gc(struct dst_ops *ops); -static struct dst_entry *dn_dst_check(struct dst_entry *, __u32); -static unsigned int dn_dst_default_advmss(const struct dst_entry *dst); -static unsigned int dn_dst_mtu(const struct dst_entry *dst); -static void dn_dst_destroy(struct dst_entry *); -static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); -static void dn_dst_link_failure(struct sk_buff *); -static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb , u32 mtu, - bool confirm_neigh); -static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb); -static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr); -static int dn_route_input(struct sk_buff *); -static void dn_run_flush(struct timer_list *unused); - -static struct dn_rt_hash_bucket *dn_rt_hash_table; -static unsigned int dn_rt_hash_mask; - -static struct timer_list dn_route_timer; -static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush); -int decnet_dst_gc_interval = 2; - -static struct dst_ops dn_dst_ops = { - .family = PF_DECnet, - .gc_thresh = 128, - .gc = dn_dst_gc, - .check = dn_dst_check, - .default_advmss = dn_dst_default_advmss, - .mtu = dn_dst_mtu, - .cow_metrics = dst_cow_metrics_generic, - .destroy = dn_dst_destroy, - .ifdown = dn_dst_ifdown, - .negative_advice = dn_dst_negative_advice, - .link_failure = dn_dst_link_failure, - .update_pmtu = dn_dst_update_pmtu, - .redirect = dn_dst_redirect, - .neigh_lookup = dn_dst_neigh_lookup, -}; - -static void dn_dst_destroy(struct dst_entry *dst) -{ - struct dn_route *rt = (struct dn_route *) dst; - - if (rt->n) - neigh_release(rt->n); - dst_destroy_metrics_generic(dst); -} - -static void dn_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how) -{ - if (how) { - struct dn_route *rt = (struct dn_route *) dst; - struct neighbour *n = rt->n; - - if (n && n->dev == dev) { - n->dev = dev_net(dev)->loopback_dev; - dev_hold(n->dev); - dev_put(dev); - } - } -} - -static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) -{ - __u16 tmp = (__u16 __force)(src ^ dst); - tmp ^= (tmp >> 3); - tmp ^= (tmp >> 5); - tmp ^= (tmp >> 10); - return dn_rt_hash_mask & (unsigned int)tmp; -} - -static void dn_dst_check_expire(struct timer_list *unused) -{ - int i; - struct dn_route *rt; - struct dn_route __rcu **rtp; - unsigned long now = jiffies; - unsigned long expire = 120 * HZ; - - for (i = 0; i <= dn_rt_hash_mask; i++) { - rtp = &dn_rt_hash_table[i].chain; - - spin_lock(&dn_rt_hash_table[i].lock); - while ((rt = rcu_dereference_protected(*rtp, - lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { - if (atomic_read(&rt->dst.__refcnt) > 1 || - (now - rt->dst.lastuse) < expire) { - rtp = &rt->dn_next; - continue; - } - *rtp = rt->dn_next; - rt->dn_next = NULL; - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - } - spin_unlock(&dn_rt_hash_table[i].lock); - - if ((jiffies - now) > 0) - break; - } - - mod_timer(&dn_route_timer, now + decnet_dst_gc_interval * HZ); -} - -static int dn_dst_gc(struct dst_ops *ops) -{ - struct dn_route *rt; - struct dn_route __rcu **rtp; - int i; - unsigned long now = jiffies; - unsigned long expire = 10 * HZ; - - for (i = 0; i <= dn_rt_hash_mask; i++) { - - spin_lock_bh(&dn_rt_hash_table[i].lock); - rtp = &dn_rt_hash_table[i].chain; - - while ((rt = rcu_dereference_protected(*rtp, - lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { - if (atomic_read(&rt->dst.__refcnt) > 1 || - (now - rt->dst.lastuse) < expire) { - rtp = &rt->dn_next; - continue; - } - *rtp = rt->dn_next; - rt->dn_next = NULL; - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - break; - } - spin_unlock_bh(&dn_rt_hash_table[i].lock); - } - - return 0; -} - -/* - * The decnet standards don't impose a particular minimum mtu, what they - * do insist on is that the routing layer accepts a datagram of at least - * 230 bytes long. Here we have to subtract the routing header length from - * 230 to get the minimum acceptable mtu. If there is no neighbour, then we - * assume the worst and use a long header size. - * - * We update both the mtu and the advertised mss (i.e. the segment size we - * advertise to the other end). - */ -static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu, - bool confirm_neigh) -{ - struct dn_route *rt = (struct dn_route *) dst; - struct neighbour *n = rt->n; - u32 min_mtu = 230; - struct dn_dev *dn; - - dn = n ? rcu_dereference_raw(n->dev->dn_ptr) : NULL; - - if (dn && dn->use_long == 0) - min_mtu -= 6; - else - min_mtu -= 21; - - if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= min_mtu) { - if (!(dst_metric_locked(dst, RTAX_MTU))) { - dst_metric_set(dst, RTAX_MTU, mtu); - dst_set_expires(dst, dn_rt_mtu_expires); - } - if (!(dst_metric_locked(dst, RTAX_ADVMSS))) { - u32 mss = mtu - DN_MAX_NSP_DATA_HEADER; - u32 existing_mss = dst_metric_raw(dst, RTAX_ADVMSS); - if (!existing_mss || existing_mss > mss) - dst_metric_set(dst, RTAX_ADVMSS, mss); - } - } -} - -static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - -/* - * When a route has been marked obsolete. (e.g. routing cache flush) - */ -static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie) -{ - return NULL; -} - -static struct dst_entry *dn_dst_negative_advice(struct dst_entry *dst) -{ - dst_release(dst); - return NULL; -} - -static void dn_dst_link_failure(struct sk_buff *skb) -{ -} - -static inline int compare_keys(struct flowidn *fl1, struct flowidn *fl2) -{ - return ((fl1->daddr ^ fl2->daddr) | - (fl1->saddr ^ fl2->saddr) | - (fl1->flowidn_mark ^ fl2->flowidn_mark) | - (fl1->flowidn_scope ^ fl2->flowidn_scope) | - (fl1->flowidn_oif ^ fl2->flowidn_oif) | - (fl1->flowidn_iif ^ fl2->flowidn_iif)) == 0; -} - -static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_route **rp) -{ - struct dn_route *rth; - struct dn_route __rcu **rthp; - unsigned long now = jiffies; - - rthp = &dn_rt_hash_table[hash].chain; - - spin_lock_bh(&dn_rt_hash_table[hash].lock); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) { - if (compare_keys(&rth->fld, &rt->fld)) { - /* Put it first */ - *rthp = rth->dn_next; - rcu_assign_pointer(rth->dn_next, - dn_rt_hash_table[hash].chain); - rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); - - dst_hold_and_use(&rth->dst, now); - spin_unlock_bh(&dn_rt_hash_table[hash].lock); - - dst_release_immediate(&rt->dst); - *rp = rth; - return 0; - } - rthp = &rth->dn_next; - } - - rcu_assign_pointer(rt->dn_next, dn_rt_hash_table[hash].chain); - rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); - - dst_hold_and_use(&rt->dst, now); - spin_unlock_bh(&dn_rt_hash_table[hash].lock); - *rp = rt; - return 0; -} - -static void dn_run_flush(struct timer_list *unused) -{ - int i; - struct dn_route *rt, *next; - - for (i = 0; i < dn_rt_hash_mask; i++) { - spin_lock_bh(&dn_rt_hash_table[i].lock); - - if ((rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL)) == NULL) - goto nothing_to_declare; - - for(; rt; rt = next) { - next = rcu_dereference_raw(rt->dn_next); - RCU_INIT_POINTER(rt->dn_next, NULL); - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - } - -nothing_to_declare: - spin_unlock_bh(&dn_rt_hash_table[i].lock); - } -} - -static DEFINE_SPINLOCK(dn_rt_flush_lock); - -void dn_rt_cache_flush(int delay) -{ - unsigned long now = jiffies; - int user_mode = !in_interrupt(); - - if (delay < 0) - delay = dn_rt_min_delay; - - spin_lock_bh(&dn_rt_flush_lock); - - if (del_timer(&dn_rt_flush_timer) && delay > 0 && dn_rt_deadline) { - long tmo = (long)(dn_rt_deadline - now); - - if (user_mode && tmo < dn_rt_max_delay - dn_rt_min_delay) - tmo = 0; - - if (delay > tmo) - delay = tmo; - } - - if (delay <= 0) { - spin_unlock_bh(&dn_rt_flush_lock); - dn_run_flush(NULL); - return; - } - - if (dn_rt_deadline == 0) - dn_rt_deadline = now + dn_rt_max_delay; - - dn_rt_flush_timer.expires = now + delay; - add_timer(&dn_rt_flush_timer); - spin_unlock_bh(&dn_rt_flush_lock); -} - -/** - * dn_return_short - Return a short packet to its sender - * @skb: The packet to return - * - */ -static int dn_return_short(struct sk_buff *skb) -{ - struct dn_skb_cb *cb; - unsigned char *ptr; - __le16 *src; - __le16 *dst; - - /* Add back headers */ - skb_push(skb, skb->data - skb_network_header(skb)); - - if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) - return NET_RX_DROP; - - cb = DN_SKB_CB(skb); - /* Skip packet length and point to flags */ - ptr = skb->data + 2; - *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS; - - dst = (__le16 *)ptr; - ptr += 2; - src = (__le16 *)ptr; - ptr += 2; - *ptr = 0; /* Zero hop count */ - - swap(*src, *dst); - - skb->pkt_type = PACKET_OUTGOING; - dn_rt_finish_output(skb, NULL, NULL); - return NET_RX_SUCCESS; -} - -/** - * dn_return_long - Return a long packet to its sender - * @skb: The long format packet to return - * - */ -static int dn_return_long(struct sk_buff *skb) -{ - struct dn_skb_cb *cb; - unsigned char *ptr; - unsigned char *src_addr, *dst_addr; - unsigned char tmp[ETH_ALEN]; - - /* Add back all headers */ - skb_push(skb, skb->data - skb_network_header(skb)); - - if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) - return NET_RX_DROP; - - cb = DN_SKB_CB(skb); - /* Ignore packet length and point to flags */ - ptr = skb->data + 2; - - /* Skip padding */ - if (*ptr & DN_RT_F_PF) { - char padlen = (*ptr & ~DN_RT_F_PF); - ptr += padlen; - } - - *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS; - ptr += 2; - dst_addr = ptr; - ptr += 8; - src_addr = ptr; - ptr += 6; - *ptr = 0; /* Zero hop count */ - - /* Swap source and destination */ - memcpy(tmp, src_addr, ETH_ALEN); - memcpy(src_addr, dst_addr, ETH_ALEN); - memcpy(dst_addr, tmp, ETH_ALEN); - - skb->pkt_type = PACKET_OUTGOING; - dn_rt_finish_output(skb, dst_addr, src_addr); - return NET_RX_SUCCESS; -} - -/** - * dn_route_rx_packet - Try and find a route for an incoming packet - * @skb: The packet to find a route for - * - * Returns: result of input function if route is found, error code otherwise - */ -static int dn_route_rx_packet(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dn_skb_cb *cb; - int err; - - if ((err = dn_route_input(skb)) == 0) - return dst_input(skb); - - cb = DN_SKB_CB(skb); - if (decnet_debug_level & 4) { - char *devname = skb->dev ? skb->dev->name : "???"; - - printk(KERN_DEBUG - "DECnet: dn_route_rx_packet: rt_flags=0x%02x dev=%s len=%d src=0x%04hx dst=0x%04hx err=%d type=%d\n", - (int)cb->rt_flags, devname, skb->len, - le16_to_cpu(cb->src), le16_to_cpu(cb->dst), - err, skb->pkt_type); - } - - if ((skb->pkt_type == PACKET_HOST) && (cb->rt_flags & DN_RT_F_RQR)) { - switch (cb->rt_flags & DN_RT_PKT_MSK) { - case DN_RT_PKT_SHORT: - return dn_return_short(skb); - case DN_RT_PKT_LONG: - return dn_return_long(skb); - } - } - - kfree_skb(skb); - return NET_RX_DROP; -} - -static int dn_route_rx_long(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned char *ptr = skb->data; - - if (!pskb_may_pull(skb, 21)) /* 20 for long header, 1 for shortest nsp */ - goto drop_it; - - skb_pull(skb, 20); - skb_reset_transport_header(skb); - - /* Destination info */ - ptr += 2; - cb->dst = dn_eth2dn(ptr); - if (memcmp(ptr, dn_hiord_addr, 4) != 0) - goto drop_it; - ptr += 6; - - - /* Source info */ - ptr += 2; - cb->src = dn_eth2dn(ptr); - if (memcmp(ptr, dn_hiord_addr, 4) != 0) - goto drop_it; - ptr += 6; - /* Other junk */ - ptr++; - cb->hops = *ptr++; /* Visit Count */ - - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, - &init_net, NULL, skb, skb->dev, NULL, - dn_route_rx_packet); - -drop_it: - kfree_skb(skb); - return NET_RX_DROP; -} - - - -static int dn_route_rx_short(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned char *ptr = skb->data; - - if (!pskb_may_pull(skb, 6)) /* 5 for short header + 1 for shortest nsp */ - goto drop_it; - - skb_pull(skb, 5); - skb_reset_transport_header(skb); - - cb->dst = *(__le16 *)ptr; - ptr += 2; - cb->src = *(__le16 *)ptr; - ptr += 2; - cb->hops = *ptr & 0x3f; - - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, - &init_net, NULL, skb, skb->dev, NULL, - dn_route_rx_packet); - -drop_it: - kfree_skb(skb); - return NET_RX_DROP; -} - -static int dn_route_discard(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - /* - * I know we drop the packet here, but thats considered success in - * this case - */ - kfree_skb(skb); - return NET_RX_SUCCESS; -} - -static int dn_route_ptp_hello(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - dn_dev_hello(skb); - dn_neigh_pointopoint_hello(skb); - return NET_RX_SUCCESS; -} - -int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) -{ - struct dn_skb_cb *cb; - unsigned char flags = 0; - __u16 len = le16_to_cpu(*(__le16 *)skb->data); - struct dn_dev *dn = rcu_dereference(dev->dn_ptr); - unsigned char padlen = 0; - - if (!net_eq(dev_net(dev), &init_net)) - goto dump_it; - - if (dn == NULL) - goto dump_it; - - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) - goto out; - - if (!pskb_may_pull(skb, 3)) - goto dump_it; - - skb_pull(skb, 2); - - if (len > skb->len) - goto dump_it; - - skb_trim(skb, len); - - flags = *skb->data; - - cb = DN_SKB_CB(skb); - cb->stamp = jiffies; - cb->iif = dev->ifindex; - - /* - * If we have padding, remove it. - */ - if (flags & DN_RT_F_PF) { - padlen = flags & ~DN_RT_F_PF; - if (!pskb_may_pull(skb, padlen + 1)) - goto dump_it; - skb_pull(skb, padlen); - flags = *skb->data; - } - - skb_reset_network_header(skb); - - /* - * Weed out future version DECnet - */ - if (flags & DN_RT_F_VER) - goto dump_it; - - cb->rt_flags = flags; - - if (decnet_debug_level & 1) - printk(KERN_DEBUG - "dn_route_rcv: got 0x%02x from %s [%d %d %d]\n", - (int)flags, (dev) ? dev->name : "???", len, skb->len, - padlen); - - if (flags & DN_RT_PKT_CNTL) { - if (unlikely(skb_linearize(skb))) - goto dump_it; - - switch (flags & DN_RT_CNTL_MSK) { - case DN_RT_PKT_INIT: - dn_dev_init_pkt(skb); - break; - case DN_RT_PKT_VERI: - dn_dev_veri_pkt(skb); - break; - } - - if (dn->parms.state != DN_DEV_S_RU) - goto dump_it; - - switch (flags & DN_RT_CNTL_MSK) { - case DN_RT_PKT_HELO: - return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - &init_net, NULL, skb, skb->dev, NULL, - dn_route_ptp_hello); - - case DN_RT_PKT_L1RT: - case DN_RT_PKT_L2RT: - return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE, - &init_net, NULL, skb, skb->dev, NULL, - dn_route_discard); - case DN_RT_PKT_ERTH: - return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - &init_net, NULL, skb, skb->dev, NULL, - dn_neigh_router_hello); - - case DN_RT_PKT_EEDH: - return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - &init_net, NULL, skb, skb->dev, NULL, - dn_neigh_endnode_hello); - } - } else { - if (dn->parms.state != DN_DEV_S_RU) - goto dump_it; - - skb_pull(skb, 1); /* Pull flags */ - - switch (flags & DN_RT_PKT_MSK) { - case DN_RT_PKT_LONG: - return dn_route_rx_long(skb); - case DN_RT_PKT_SHORT: - return dn_route_rx_short(skb); - } - } - -dump_it: - kfree_skb(skb); -out: - return NET_RX_DROP; -} - -static int dn_output(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct dn_route *rt = (struct dn_route *)dst; - struct net_device *dev = dst->dev; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - int err = -EINVAL; - - if (rt->n == NULL) - goto error; - - skb->dev = dev; - - cb->src = rt->rt_saddr; - cb->dst = rt->rt_daddr; - - /* - * Always set the Intra-Ethernet bit on all outgoing packets - * originated on this node. Only valid flag from upper layers - * is return-to-sender-requested. Set hop count to 0 too. - */ - cb->rt_flags &= ~DN_RT_F_RQR; - cb->rt_flags |= DN_RT_F_IE; - cb->hops = 0; - - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, - &init_net, sk, skb, NULL, dev, - dn_to_neigh_output); - -error: - net_dbg_ratelimited("dn_output: This should not happen\n"); - - kfree_skb(skb); - - return err; -} - -static int dn_forward(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct dst_entry *dst = skb_dst(skb); - struct dn_dev *dn_db = rcu_dereference(dst->dev->dn_ptr); - struct dn_route *rt; - int header_len; - struct net_device *dev = skb->dev; - - if (skb->pkt_type != PACKET_HOST) - goto drop; - - /* Ensure that we have enough space for headers */ - rt = (struct dn_route *)skb_dst(skb); - header_len = dn_db->use_long ? 21 : 6; - if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+header_len)) - goto drop; - - /* - * Hop count exceeded. - */ - if (++cb->hops > 30) - goto drop; - - skb->dev = rt->dst.dev; - - /* - * If packet goes out same interface it came in on, then set - * the Intra-Ethernet bit. This has no effect for short - * packets, so we don't need to test for them here. - */ - cb->rt_flags &= ~DN_RT_F_IE; - if (rt->rt_flags & RTCF_DOREDIRECT) - cb->rt_flags |= DN_RT_F_IE; - - return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, - &init_net, NULL, skb, dev, skb->dev, - dn_to_neigh_output); - -drop: - kfree_skb(skb); - return NET_RX_DROP; -} - -/* - * Used to catch bugs. This should never normally get - * called. - */ -static int dn_rt_bug_out(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n", - le16_to_cpu(cb->src), le16_to_cpu(cb->dst)); - - kfree_skb(skb); - - return NET_RX_DROP; -} - -static int dn_rt_bug(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n", - le16_to_cpu(cb->src), le16_to_cpu(cb->dst)); - - kfree_skb(skb); - - return NET_RX_DROP; -} - -static unsigned int dn_dst_default_advmss(const struct dst_entry *dst) -{ - return dn_mss_from_pmtu(dst->dev, dst_mtu(dst)); -} - -static unsigned int dn_dst_mtu(const struct dst_entry *dst) -{ - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - - return mtu ? : dst->dev->mtu; -} - -static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) -{ - return __neigh_lookup_errno(&dn_neigh_table, daddr, dst->dev); -} - -static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) -{ - struct dn_fib_info *fi = res->fi; - struct net_device *dev = rt->dst.dev; - unsigned int mss_metric; - struct neighbour *n; - - if (fi) { - if (DN_FIB_RES_GW(*res) && - DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - rt->rt_gateway = DN_FIB_RES_GW(*res); - dst_init_metrics(&rt->dst, fi->fib_metrics, true); - } - rt->rt_type = res->type; - - if (dev != NULL && rt->n == NULL) { - n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev); - if (IS_ERR(n)) - return PTR_ERR(n); - rt->n = n; - } - - if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu) - dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu); - mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS); - if (mss_metric) { - unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst)); - if (mss_metric > mss) - dst_metric_set(&rt->dst, RTAX_ADVMSS, mss); - } - return 0; -} - -static inline int dn_match_addr(__le16 addr1, __le16 addr2) -{ - __u16 tmp = le16_to_cpu(addr1) ^ le16_to_cpu(addr2); - int match = 16; - while(tmp) { - tmp >>= 1; - match--; - } - return match; -} - -static __le16 dnet_select_source(const struct net_device *dev, __le16 daddr, int scope) -{ - __le16 saddr = 0; - struct dn_dev *dn_db; - struct dn_ifaddr *ifa; - int best_match = 0; - int ret; - - rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); - for (ifa = rcu_dereference(dn_db->ifa_list); - ifa != NULL; - ifa = rcu_dereference(ifa->ifa_next)) { - if (ifa->ifa_scope > scope) - continue; - if (!daddr) { - saddr = ifa->ifa_local; - break; - } - ret = dn_match_addr(daddr, ifa->ifa_local); - if (ret > best_match) - saddr = ifa->ifa_local; - if (best_match == 0) - saddr = ifa->ifa_local; - } - rcu_read_unlock(); - - return saddr; -} - -static inline __le16 __dn_fib_res_prefsrc(struct dn_fib_res *res) -{ - return dnet_select_source(DN_FIB_RES_DEV(*res), DN_FIB_RES_GW(*res), res->scope); -} - -static inline __le16 dn_fib_rules_map_destination(__le16 daddr, struct dn_fib_res *res) -{ - __le16 mask = dnet_make_mask(res->prefixlen); - return (daddr&~mask)|res->fi->fib_nh->nh_gw; -} - -static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *oldflp, int try_hard) -{ - struct flowidn fld = { - .daddr = oldflp->daddr, - .saddr = oldflp->saddr, - .flowidn_scope = RT_SCOPE_UNIVERSE, - .flowidn_mark = oldflp->flowidn_mark, - .flowidn_iif = LOOPBACK_IFINDEX, - .flowidn_oif = oldflp->flowidn_oif, - }; - struct dn_route *rt = NULL; - struct net_device *dev_out = NULL, *dev; - struct neighbour *neigh = NULL; - unsigned int hash; - unsigned int flags = 0; - struct dn_fib_res res = { .fi = NULL, .type = RTN_UNICAST }; - int err; - int free_res = 0; - __le16 gateway = 0; - - if (decnet_debug_level & 16) - printk(KERN_DEBUG - "dn_route_output_slow: dst=%04x src=%04x mark=%d" - " iif=%d oif=%d\n", le16_to_cpu(oldflp->daddr), - le16_to_cpu(oldflp->saddr), - oldflp->flowidn_mark, LOOPBACK_IFINDEX, - oldflp->flowidn_oif); - - /* If we have an output interface, verify its a DECnet device */ - if (oldflp->flowidn_oif) { - dev_out = dev_get_by_index(&init_net, oldflp->flowidn_oif); - err = -ENODEV; - if (dev_out && dev_out->dn_ptr == NULL) { - dev_put(dev_out); - dev_out = NULL; - } - if (dev_out == NULL) - goto out; - } - - /* If we have a source address, verify that its a local address */ - if (oldflp->saddr) { - err = -EADDRNOTAVAIL; - - if (dev_out) { - if (dn_dev_islocal(dev_out, oldflp->saddr)) - goto source_ok; - dev_put(dev_out); - goto out; - } - rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - if (!dev->dn_ptr) - continue; - if (!dn_dev_islocal(dev, oldflp->saddr)) - continue; - if ((dev->flags & IFF_LOOPBACK) && - oldflp->daddr && - !dn_dev_islocal(dev, oldflp->daddr)) - continue; - - dev_out = dev; - break; - } - rcu_read_unlock(); - if (dev_out == NULL) - goto out; - dev_hold(dev_out); -source_ok: - ; - } - - /* No destination? Assume its local */ - if (!fld.daddr) { - fld.daddr = fld.saddr; - - if (dev_out) - dev_put(dev_out); - err = -EINVAL; - dev_out = init_net.loopback_dev; - if (!dev_out->dn_ptr) - goto out; - err = -EADDRNOTAVAIL; - dev_hold(dev_out); - if (!fld.daddr) { - fld.daddr = - fld.saddr = dnet_select_source(dev_out, 0, - RT_SCOPE_HOST); - if (!fld.daddr) - goto out; - } - fld.flowidn_oif = LOOPBACK_IFINDEX; - res.type = RTN_LOCAL; - goto make_route; - } - - if (decnet_debug_level & 16) - printk(KERN_DEBUG - "dn_route_output_slow: initial checks complete." - " dst=%04x src=%04x oif=%d try_hard=%d\n", - le16_to_cpu(fld.daddr), le16_to_cpu(fld.saddr), - fld.flowidn_oif, try_hard); - - /* - * N.B. If the kernel is compiled without router support then - * dn_fib_lookup() will evaluate to non-zero so this if () block - * will always be executed. - */ - err = -ESRCH; - if (try_hard || (err = dn_fib_lookup(&fld, &res)) != 0) { - struct dn_dev *dn_db; - if (err != -ESRCH) - goto out; - /* - * Here the fallback is basically the standard algorithm for - * routing in endnodes which is described in the DECnet routing - * docs - * - * If we are not trying hard, look in neighbour cache. - * The result is tested to ensure that if a specific output - * device/source address was requested, then we honour that - * here - */ - if (!try_hard) { - neigh = neigh_lookup_nodev(&dn_neigh_table, &init_net, &fld.daddr); - if (neigh) { - if ((oldflp->flowidn_oif && - (neigh->dev->ifindex != oldflp->flowidn_oif)) || - (oldflp->saddr && - (!dn_dev_islocal(neigh->dev, - oldflp->saddr)))) { - neigh_release(neigh); - neigh = NULL; - } else { - if (dev_out) - dev_put(dev_out); - if (dn_dev_islocal(neigh->dev, fld.daddr)) { - dev_out = init_net.loopback_dev; - res.type = RTN_LOCAL; - } else { - dev_out = neigh->dev; - } - dev_hold(dev_out); - goto select_source; - } - } - } - - /* Not there? Perhaps its a local address */ - if (dev_out == NULL) - dev_out = dn_dev_get_default(); - err = -ENODEV; - if (dev_out == NULL) - goto out; - dn_db = rcu_dereference_raw(dev_out->dn_ptr); - if (!dn_db) - goto e_inval; - /* Possible improvement - check all devices for local addr */ - if (dn_dev_islocal(dev_out, fld.daddr)) { - dev_put(dev_out); - dev_out = init_net.loopback_dev; - dev_hold(dev_out); - res.type = RTN_LOCAL; - goto select_source; - } - /* Not local either.... try sending it to the default router */ - neigh = neigh_clone(dn_db->router); - BUG_ON(neigh && neigh->dev != dev_out); - - /* Ok then, we assume its directly connected and move on */ -select_source: - if (neigh) - gateway = ((struct dn_neigh *)neigh)->addr; - if (gateway == 0) - gateway = fld.daddr; - if (fld.saddr == 0) { - fld.saddr = dnet_select_source(dev_out, gateway, - res.type == RTN_LOCAL ? - RT_SCOPE_HOST : - RT_SCOPE_LINK); - if (fld.saddr == 0 && res.type != RTN_LOCAL) - goto e_addr; - } - fld.flowidn_oif = dev_out->ifindex; - goto make_route; - } - free_res = 1; - - if (res.type == RTN_NAT) - goto e_inval; - - if (res.type == RTN_LOCAL) { - if (!fld.saddr) - fld.saddr = fld.daddr; - if (dev_out) - dev_put(dev_out); - dev_out = init_net.loopback_dev; - dev_hold(dev_out); - if (!dev_out->dn_ptr) - goto e_inval; - fld.flowidn_oif = dev_out->ifindex; - if (res.fi) - dn_fib_info_put(res.fi); - res.fi = NULL; - goto make_route; - } - - if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0) - dn_fib_select_multipath(&fld, &res); - - /* - * We could add some logic to deal with default routes here and - * get rid of some of the special casing above. - */ - - if (!fld.saddr) - fld.saddr = DN_FIB_RES_PREFSRC(res); - - if (dev_out) - dev_put(dev_out); - dev_out = DN_FIB_RES_DEV(res); - dev_hold(dev_out); - fld.flowidn_oif = dev_out->ifindex; - gateway = DN_FIB_RES_GW(res); - -make_route: - if (dev_out->flags & IFF_LOOPBACK) - flags |= RTCF_LOCAL; - - rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST); - if (rt == NULL) - goto e_nobufs; - - rt->dn_next = NULL; - memset(&rt->fld, 0, sizeof(rt->fld)); - rt->fld.saddr = oldflp->saddr; - rt->fld.daddr = oldflp->daddr; - rt->fld.flowidn_oif = oldflp->flowidn_oif; - rt->fld.flowidn_iif = 0; - rt->fld.flowidn_mark = oldflp->flowidn_mark; - - rt->rt_saddr = fld.saddr; - rt->rt_daddr = fld.daddr; - rt->rt_gateway = gateway ? gateway : fld.daddr; - rt->rt_local_src = fld.saddr; - - rt->rt_dst_map = fld.daddr; - rt->rt_src_map = fld.saddr; - - rt->n = neigh; - neigh = NULL; - - rt->dst.lastuse = jiffies; - rt->dst.output = dn_output; - rt->dst.input = dn_rt_bug; - rt->rt_flags = flags; - if (flags & RTCF_LOCAL) - rt->dst.input = dn_nsp_rx; - - err = dn_rt_set_next_hop(rt, &res); - if (err) - goto e_neighbour; - - hash = dn_hash(rt->fld.saddr, rt->fld.daddr); - /* dn_insert_route() increments dst->__refcnt */ - dn_insert_route(rt, hash, (struct dn_route **)pprt); - -done: - if (neigh) - neigh_release(neigh); - if (free_res) - dn_fib_res_put(&res); - if (dev_out) - dev_put(dev_out); -out: - return err; - -e_addr: - err = -EADDRNOTAVAIL; - goto done; -e_inval: - err = -EINVAL; - goto done; -e_nobufs: - err = -ENOBUFS; - goto done; -e_neighbour: - dst_release_immediate(&rt->dst); - goto e_nobufs; -} - - -/* - * N.B. The flags may be moved into the flowi at some future stage. - */ -static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *flp, int flags) -{ - unsigned int hash = dn_hash(flp->saddr, flp->daddr); - struct dn_route *rt = NULL; - - if (!(flags & MSG_TRYHARD)) { - rcu_read_lock_bh(); - for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt; - rt = rcu_dereference_bh(rt->dn_next)) { - if ((flp->daddr == rt->fld.daddr) && - (flp->saddr == rt->fld.saddr) && - (flp->flowidn_mark == rt->fld.flowidn_mark) && - dn_is_output_route(rt) && - (rt->fld.flowidn_oif == flp->flowidn_oif)) { - dst_hold_and_use(&rt->dst, jiffies); - rcu_read_unlock_bh(); - *pprt = &rt->dst; - return 0; - } - } - rcu_read_unlock_bh(); - } - - return dn_route_output_slow(pprt, flp, flags); -} - -static int dn_route_output_key(struct dst_entry **pprt, struct flowidn *flp, int flags) -{ - int err; - - err = __dn_route_output_key(pprt, flp, flags); - if (err == 0 && flp->flowidn_proto) { - *pprt = xfrm_lookup(&init_net, *pprt, - flowidn_to_flowi(flp), NULL, 0); - if (IS_ERR(*pprt)) { - err = PTR_ERR(*pprt); - *pprt = NULL; - } - } - return err; -} - -int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *fl, struct sock *sk, int flags) -{ - int err; - - err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD); - if (err == 0 && fl->flowidn_proto) { - *pprt = xfrm_lookup(&init_net, *pprt, - flowidn_to_flowi(fl), sk, 0); - if (IS_ERR(*pprt)) { - err = PTR_ERR(*pprt); - *pprt = NULL; - } - } - return err; -} - -static int dn_route_input_slow(struct sk_buff *skb) -{ - struct dn_route *rt = NULL; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct net_device *in_dev = skb->dev; - struct net_device *out_dev = NULL; - struct dn_dev *dn_db; - struct neighbour *neigh = NULL; - unsigned int hash; - int flags = 0; - __le16 gateway = 0; - __le16 local_src = 0; - struct flowidn fld = { - .daddr = cb->dst, - .saddr = cb->src, - .flowidn_scope = RT_SCOPE_UNIVERSE, - .flowidn_mark = skb->mark, - .flowidn_iif = skb->dev->ifindex, - }; - struct dn_fib_res res = { .fi = NULL, .type = RTN_UNREACHABLE }; - int err = -EINVAL; - int free_res = 0; - - dev_hold(in_dev); - - if ((dn_db = rcu_dereference(in_dev->dn_ptr)) == NULL) - goto out; - - /* Zero source addresses are not allowed */ - if (fld.saddr == 0) - goto out; - - /* - * In this case we've just received a packet from a source - * outside ourselves pretending to come from us. We don't - * allow it any further to prevent routing loops, spoofing and - * other nasties. Loopback packets already have the dst attached - * so this only affects packets which have originated elsewhere. - */ - err = -ENOTUNIQ; - if (dn_dev_islocal(in_dev, cb->src)) - goto out; - - err = dn_fib_lookup(&fld, &res); - if (err) { - if (err != -ESRCH) - goto out; - /* - * Is the destination us ? - */ - if (!dn_dev_islocal(in_dev, cb->dst)) - goto e_inval; - - res.type = RTN_LOCAL; - } else { - __le16 src_map = fld.saddr; - free_res = 1; - - out_dev = DN_FIB_RES_DEV(res); - if (out_dev == NULL) { - net_crit_ratelimited("Bug in dn_route_input_slow() No output device\n"); - goto e_inval; - } - dev_hold(out_dev); - - if (res.r) - src_map = fld.saddr; /* no NAT support for now */ - - gateway = DN_FIB_RES_GW(res); - if (res.type == RTN_NAT) { - fld.daddr = dn_fib_rules_map_destination(fld.daddr, &res); - dn_fib_res_put(&res); - free_res = 0; - if (dn_fib_lookup(&fld, &res)) - goto e_inval; - free_res = 1; - if (res.type != RTN_UNICAST) - goto e_inval; - flags |= RTCF_DNAT; - gateway = fld.daddr; - } - fld.saddr = src_map; - } - - switch(res.type) { - case RTN_UNICAST: - /* - * Forwarding check here, we only check for forwarding - * being turned off, if you want to only forward intra - * area, its up to you to set the routing tables up - * correctly. - */ - if (dn_db->parms.forwarding == 0) - goto e_inval; - - if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0) - dn_fib_select_multipath(&fld, &res); - - /* - * Check for out_dev == in_dev. We use the RTCF_DOREDIRECT - * flag as a hint to set the intra-ethernet bit when - * forwarding. If we've got NAT in operation, we don't do - * this optimisation. - */ - if (out_dev == in_dev && !(flags & RTCF_NAT)) - flags |= RTCF_DOREDIRECT; - - local_src = DN_FIB_RES_PREFSRC(res); - - case RTN_BLACKHOLE: - case RTN_UNREACHABLE: - break; - case RTN_LOCAL: - flags |= RTCF_LOCAL; - fld.saddr = cb->dst; - fld.daddr = cb->src; - - /* Routing tables gave us a gateway */ - if (gateway) - goto make_route; - - /* Packet was intra-ethernet, so we know its on-link */ - if (cb->rt_flags & DN_RT_F_IE) { - gateway = cb->src; - goto make_route; - } - - /* Use the default router if there is one */ - neigh = neigh_clone(dn_db->router); - if (neigh) { - gateway = ((struct dn_neigh *)neigh)->addr; - goto make_route; - } - - /* Close eyes and pray */ - gateway = cb->src; - goto make_route; - default: - goto e_inval; - } - -make_route: - rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST); - if (rt == NULL) - goto e_nobufs; - - rt->dn_next = NULL; - memset(&rt->fld, 0, sizeof(rt->fld)); - rt->rt_saddr = fld.saddr; - rt->rt_daddr = fld.daddr; - rt->rt_gateway = fld.daddr; - if (gateway) - rt->rt_gateway = gateway; - rt->rt_local_src = local_src ? local_src : rt->rt_saddr; - - rt->rt_dst_map = fld.daddr; - rt->rt_src_map = fld.saddr; - - rt->fld.saddr = cb->src; - rt->fld.daddr = cb->dst; - rt->fld.flowidn_oif = 0; - rt->fld.flowidn_iif = in_dev->ifindex; - rt->fld.flowidn_mark = fld.flowidn_mark; - - rt->n = neigh; - rt->dst.lastuse = jiffies; - rt->dst.output = dn_rt_bug_out; - switch (res.type) { - case RTN_UNICAST: - rt->dst.input = dn_forward; - break; - case RTN_LOCAL: - rt->dst.output = dn_output; - rt->dst.input = dn_nsp_rx; - rt->dst.dev = in_dev; - flags |= RTCF_LOCAL; - break; - default: - case RTN_UNREACHABLE: - case RTN_BLACKHOLE: - rt->dst.input = dst_discard; - } - rt->rt_flags = flags; - - err = dn_rt_set_next_hop(rt, &res); - if (err) - goto e_neighbour; - - hash = dn_hash(rt->fld.saddr, rt->fld.daddr); - /* dn_insert_route() increments dst->__refcnt */ - dn_insert_route(rt, hash, &rt); - skb_dst_set(skb, &rt->dst); - -done: - if (neigh) - neigh_release(neigh); - if (free_res) - dn_fib_res_put(&res); - dev_put(in_dev); - if (out_dev) - dev_put(out_dev); -out: - return err; - -e_inval: - err = -EINVAL; - goto done; - -e_nobufs: - err = -ENOBUFS; - goto done; - -e_neighbour: - dst_release_immediate(&rt->dst); - goto done; -} - -static int dn_route_input(struct sk_buff *skb) -{ - struct dn_route *rt; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned int hash = dn_hash(cb->src, cb->dst); - - if (skb_dst(skb)) - return 0; - - rcu_read_lock(); - for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL; - rt = rcu_dereference(rt->dn_next)) { - if ((rt->fld.saddr == cb->src) && - (rt->fld.daddr == cb->dst) && - (rt->fld.flowidn_oif == 0) && - (rt->fld.flowidn_mark == skb->mark) && - (rt->fld.flowidn_iif == cb->iif)) { - dst_hold_and_use(&rt->dst, jiffies); - rcu_read_unlock(); - skb_dst_set(skb, (struct dst_entry *)rt); - return 0; - } - } - rcu_read_unlock(); - - return dn_route_input_slow(skb); -} - -static int dn_rt_fill_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, int nowait, unsigned int flags) -{ - struct dn_route *rt = (struct dn_route *)skb_dst(skb); - struct rtmsg *r; - struct nlmsghdr *nlh; - long expires; - - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - r->rtm_family = AF_DECnet; - r->rtm_dst_len = 16; - r->rtm_src_len = 0; - r->rtm_tos = 0; - r->rtm_table = RT_TABLE_MAIN; - r->rtm_type = rt->rt_type; - r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; - r->rtm_scope = RT_SCOPE_UNIVERSE; - r->rtm_protocol = RTPROT_UNSPEC; - - if (rt->rt_flags & RTCF_NOTIFY) - r->rtm_flags |= RTM_F_NOTIFY; - - if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN) < 0 || - nla_put_le16(skb, RTA_DST, rt->rt_daddr) < 0) - goto errout; - - if (rt->fld.saddr) { - r->rtm_src_len = 16; - if (nla_put_le16(skb, RTA_SRC, rt->fld.saddr) < 0) - goto errout; - } - if (rt->dst.dev && - nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex) < 0) - goto errout; - - /* - * Note to self - change this if input routes reverse direction when - * they deal only with inputs and not with replies like they do - * currently. - */ - if (nla_put_le16(skb, RTA_PREFSRC, rt->rt_local_src) < 0) - goto errout; - - if (rt->rt_daddr != rt->rt_gateway && - nla_put_le16(skb, RTA_GATEWAY, rt->rt_gateway) < 0) - goto errout; - - if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) - goto errout; - - expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, - rt->dst.error) < 0) - goto errout; - - if (dn_is_input_route(rt) && - nla_put_u32(skb, RTA_IIF, rt->fld.flowidn_iif) < 0) - goto errout; - - nlmsg_end(skb, nlh); - return 0; - -errout: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - -const struct nla_policy rtm_dn_policy[RTA_MAX + 1] = { - [RTA_DST] = { .type = NLA_U16 }, - [RTA_SRC] = { .type = NLA_U16 }, - [RTA_IIF] = { .type = NLA_U32 }, - [RTA_OIF] = { .type = NLA_U32 }, - [RTA_GATEWAY] = { .type = NLA_U16 }, - [RTA_PRIORITY] = { .type = NLA_U32 }, - [RTA_PREFSRC] = { .type = NLA_U16 }, - [RTA_METRICS] = { .type = NLA_NESTED }, - [RTA_MULTIPATH] = { .type = NLA_NESTED }, - [RTA_TABLE] = { .type = NLA_U32 }, - [RTA_MARK] = { .type = NLA_U32 }, -}; - -/* - * This is called by both endnodes and routers now. - */ -static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(in_skb->sk); - struct rtmsg *rtm = nlmsg_data(nlh); - struct dn_route *rt = NULL; - struct dn_skb_cb *cb; - int err; - struct sk_buff *skb; - struct flowidn fld; - struct nlattr *tb[RTA_MAX+1]; - - if (!net_eq(net, &init_net)) - return -EINVAL; - - err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_dn_policy, extack); - if (err < 0) - return err; - - memset(&fld, 0, sizeof(fld)); - fld.flowidn_proto = DNPROTO_NSP; - - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb == NULL) - return -ENOBUFS; - skb_reset_mac_header(skb); - cb = DN_SKB_CB(skb); - - if (tb[RTA_SRC]) - fld.saddr = nla_get_le16(tb[RTA_SRC]); - - if (tb[RTA_DST]) - fld.daddr = nla_get_le16(tb[RTA_DST]); - - if (tb[RTA_IIF]) - fld.flowidn_iif = nla_get_u32(tb[RTA_IIF]); - - if (fld.flowidn_iif) { - struct net_device *dev; - dev = __dev_get_by_index(&init_net, fld.flowidn_iif); - if (!dev || !dev->dn_ptr) { - kfree_skb(skb); - return -ENODEV; - } - skb->protocol = htons(ETH_P_DNA_RT); - skb->dev = dev; - cb->src = fld.saddr; - cb->dst = fld.daddr; - local_bh_disable(); - err = dn_route_input(skb); - local_bh_enable(); - memset(cb, 0, sizeof(struct dn_skb_cb)); - rt = (struct dn_route *)skb_dst(skb); - if (!err && -rt->dst.error) - err = rt->dst.error; - } else { - if (tb[RTA_OIF]) - fld.flowidn_oif = nla_get_u32(tb[RTA_OIF]); - - err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0); - } - - skb->dev = NULL; - if (err) - goto out_free; - skb_dst_set(skb, &rt->dst); - if (rtm->rtm_flags & RTM_F_NOTIFY) - rt->rt_flags |= RTCF_NOTIFY; - - err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); - if (err < 0) { - err = -EMSGSIZE; - goto out_free; - } - - return rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).portid); - -out_free: - kfree_skb(skb); - return err; -} - -/* - * For routers, this is called from dn_fib_dump, but for endnodes its - * called directly from the rtnetlink dispatch table. - */ -int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - struct dn_route *rt; - int h, s_h; - int idx, s_idx; - struct rtmsg *rtm; - - if (!net_eq(net, &init_net)) - return 0; - - if (nlmsg_len(cb->nlh) < sizeof(struct rtmsg)) - return -EINVAL; - - rtm = nlmsg_data(cb->nlh); - if (!(rtm->rtm_flags & RTM_F_CLONED)) - return 0; - - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - for(h = 0; h <= dn_rt_hash_mask; h++) { - if (h < s_h) - continue; - if (h > s_h) - s_idx = 0; - rcu_read_lock_bh(); - for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0; - rt; - rt = rcu_dereference_bh(rt->dn_next), idx++) { - if (idx < s_idx) - continue; - skb_dst_set(skb, dst_clone(&rt->dst)); - if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - 1, NLM_F_MULTI) < 0) { - skb_dst_drop(skb); - rcu_read_unlock_bh(); - goto done; - } - skb_dst_drop(skb); - } - rcu_read_unlock_bh(); - } - -done: - cb->args[0] = h; - cb->args[1] = idx; - return skb->len; -} - -#ifdef CONFIG_PROC_FS -struct dn_rt_cache_iter_state { - int bucket; -}; - -static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq) -{ - struct dn_route *rt = NULL; - struct dn_rt_cache_iter_state *s = seq->private; - - for(s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) { - rcu_read_lock_bh(); - rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain); - if (rt) - break; - rcu_read_unlock_bh(); - } - return rt; -} - -static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_route *rt) -{ - struct dn_rt_cache_iter_state *s = seq->private; - - rt = rcu_dereference_bh(rt->dn_next); - while (!rt) { - rcu_read_unlock_bh(); - if (--s->bucket < 0) - break; - rcu_read_lock_bh(); - rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain); - } - return rt; -} - -static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos) -{ - struct dn_route *rt = dn_rt_cache_get_first(seq); - - if (rt) { - while(*pos && (rt = dn_rt_cache_get_next(seq, rt))) - --*pos; - } - return *pos ? NULL : rt; -} - -static void *dn_rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct dn_route *rt = dn_rt_cache_get_next(seq, v); - ++*pos; - return rt; -} - -static void dn_rt_cache_seq_stop(struct seq_file *seq, void *v) -{ - if (v) - rcu_read_unlock_bh(); -} - -static int dn_rt_cache_seq_show(struct seq_file *seq, void *v) -{ - struct dn_route *rt = v; - char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN]; - - seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n", - rt->dst.dev ? rt->dst.dev->name : "*", - dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1), - dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2), - atomic_read(&rt->dst.__refcnt), - rt->dst.__use, 0); - return 0; -} - -static const struct seq_operations dn_rt_cache_seq_ops = { - .start = dn_rt_cache_seq_start, - .next = dn_rt_cache_seq_next, - .stop = dn_rt_cache_seq_stop, - .show = dn_rt_cache_seq_show, -}; -#endif /* CONFIG_PROC_FS */ - -void __init dn_route_init(void) -{ - int i, goal, order; - - dn_dst_ops.kmem_cachep = - kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - dst_entries_init(&dn_dst_ops); - timer_setup(&dn_route_timer, dn_dst_check_expire, 0); - dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; - add_timer(&dn_route_timer); - - goal = totalram_pages() >> (26 - PAGE_SHIFT); - - for(order = 0; (1UL << order) < goal; order++) - /* NOTHING */; - - /* - * Only want 1024 entries max, since the table is very, very unlikely - * to be larger than that. - */ - while(order && ((((1UL << order) * PAGE_SIZE) / - sizeof(struct dn_rt_hash_bucket)) >= 2048)) - order--; - - do { - dn_rt_hash_mask = (1UL << order) * PAGE_SIZE / - sizeof(struct dn_rt_hash_bucket); - while(dn_rt_hash_mask & (dn_rt_hash_mask - 1)) - dn_rt_hash_mask--; - dn_rt_hash_table = (struct dn_rt_hash_bucket *) - __get_free_pages(GFP_ATOMIC, order); - } while (dn_rt_hash_table == NULL && --order > 0); - - if (!dn_rt_hash_table) - panic("Failed to allocate DECnet route cache hash table\n"); - - printk(KERN_INFO - "DECnet: Routing cache hash table of %u buckets, %ldKbytes\n", - dn_rt_hash_mask, - (long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024); - - dn_rt_hash_mask--; - for(i = 0; i <= dn_rt_hash_mask; i++) { - spin_lock_init(&dn_rt_hash_table[i].lock); - dn_rt_hash_table[i].chain = NULL; - } - - dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1); - - proc_create_seq_private("decnet_cache", 0444, init_net.proc_net, - &dn_rt_cache_seq_ops, - sizeof(struct dn_rt_cache_iter_state), NULL); - -#ifdef CONFIG_DECNET_ROUTER - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE, - dn_cache_getroute, dn_fib_dump, 0); -#else - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE, - dn_cache_getroute, dn_cache_dump, 0); -#endif -} - -void __exit dn_route_cleanup(void) -{ - del_timer(&dn_route_timer); - dn_run_flush(NULL); - - remove_proc_entry("decnet_cache", init_net.proc_net); - dst_entries_destroy(&dn_dst_ops); -} diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c deleted file mode 100644 index 4a4e3c17740c..000000000000 --- a/net/decnet/dn_rules.c +++ /dev/null @@ -1,258 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Routing Forwarding Information Base (Rules) - * - * Author: Steve Whitehouse <SteveW@ACM.org> - * Mostly copied from Alexey Kuznetsov's ipv4/fib_rules.c - * - * - * Changes: - * Steve Whitehouse <steve@chygwyn.com> - * Updated for Thomas Graf's generic rules - * - */ -#include <linux/net.h> -#include <linux/init.h> -#include <linux/netlink.h> -#include <linux/rtnetlink.h> -#include <linux/netdevice.h> -#include <linux/spinlock.h> -#include <linux/list.h> -#include <linux/rcupdate.h> -#include <linux/export.h> -#include <net/neighbour.h> -#include <net/dst.h> -#include <net/flow.h> -#include <net/fib_rules.h> -#include <net/dn.h> -#include <net/dn_fib.h> -#include <net/dn_neigh.h> -#include <net/dn_dev.h> -#include <net/dn_route.h> - -static struct fib_rules_ops *dn_fib_rules_ops; - -struct dn_fib_rule -{ - struct fib_rule common; - unsigned char dst_len; - unsigned char src_len; - __le16 src; - __le16 srcmask; - __le16 dst; - __le16 dstmask; - __le16 srcmap; - u8 flags; -}; - - -int dn_fib_lookup(struct flowidn *flp, struct dn_fib_res *res) -{ - struct fib_lookup_arg arg = { - .result = res, - }; - int err; - - err = fib_rules_lookup(dn_fib_rules_ops, - flowidn_to_flowi(flp), 0, &arg); - res->r = arg.rule; - - return err; -} - -static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp, - int flags, struct fib_lookup_arg *arg) -{ - struct flowidn *fld = &flp->u.dn; - int err = -EAGAIN; - struct dn_fib_table *tbl; - - switch(rule->action) { - case FR_ACT_TO_TBL: - break; - - case FR_ACT_UNREACHABLE: - err = -ENETUNREACH; - goto errout; - - case FR_ACT_PROHIBIT: - err = -EACCES; - goto errout; - - case FR_ACT_BLACKHOLE: - default: - err = -EINVAL; - goto errout; - } - - tbl = dn_fib_get_table(rule->table, 0); - if (tbl == NULL) - goto errout; - - err = tbl->lookup(tbl, fld, (struct dn_fib_res *)arg->result); - if (err > 0) - err = -EAGAIN; -errout: - return err; -} - -static const struct nla_policy dn_fib_rule_policy[FRA_MAX+1] = { - FRA_GENERIC_POLICY, -}; - -static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) -{ - struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - struct flowidn *fld = &fl->u.dn; - __le16 daddr = fld->daddr; - __le16 saddr = fld->saddr; - - if (((saddr ^ r->src) & r->srcmask) || - ((daddr ^ r->dst) & r->dstmask)) - return 0; - - return 1; -} - -static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh, - struct nlattr **tb, - struct netlink_ext_ack *extack) -{ - int err = -EINVAL; - struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - - if (frh->tos) { - NL_SET_ERR_MSG(extack, "Invalid tos value"); - goto errout; - } - - if (rule->table == RT_TABLE_UNSPEC) { - if (rule->action == FR_ACT_TO_TBL) { - struct dn_fib_table *table; - - table = dn_fib_empty_table(); - if (table == NULL) { - err = -ENOBUFS; - goto errout; - } - - rule->table = table->n; - } - } - - if (frh->src_len) - r->src = nla_get_le16(tb[FRA_SRC]); - - if (frh->dst_len) - r->dst = nla_get_le16(tb[FRA_DST]); - - r->src_len = frh->src_len; - r->srcmask = dnet_make_mask(r->src_len); - r->dst_len = frh->dst_len; - r->dstmask = dnet_make_mask(r->dst_len); - err = 0; -errout: - return err; -} - -static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, - struct nlattr **tb) -{ - struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - - if (frh->src_len && (r->src_len != frh->src_len)) - return 0; - - if (frh->dst_len && (r->dst_len != frh->dst_len)) - return 0; - - if (frh->src_len && (r->src != nla_get_le16(tb[FRA_SRC]))) - return 0; - - if (frh->dst_len && (r->dst != nla_get_le16(tb[FRA_DST]))) - return 0; - - return 1; -} - -unsigned int dnet_addr_type(__le16 addr) -{ - struct flowidn fld = { .daddr = addr }; - struct dn_fib_res res; - unsigned int ret = RTN_UNICAST; - struct dn_fib_table *tb = dn_fib_get_table(RT_TABLE_LOCAL, 0); - - res.r = NULL; - - if (tb) { - if (!tb->lookup(tb, &fld, &res)) { - ret = res.type; - dn_fib_res_put(&res); - } - } - return ret; -} - -static int dn_fib_rule_fill(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh) -{ - struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - - frh->dst_len = r->dst_len; - frh->src_len = r->src_len; - frh->tos = 0; - - if ((r->dst_len && - nla_put_le16(skb, FRA_DST, r->dst)) || - (r->src_len && - nla_put_le16(skb, FRA_SRC, r->src))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -ENOBUFS; -} - -static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops) -{ - dn_rt_cache_flush(-1); -} - -static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = { - .family = AF_DECnet, - .rule_size = sizeof(struct dn_fib_rule), - .addr_size = sizeof(u16), - .action = dn_fib_rule_action, - .match = dn_fib_rule_match, - .configure = dn_fib_rule_configure, - .compare = dn_fib_rule_compare, - .fill = dn_fib_rule_fill, - .flush_cache = dn_fib_rule_flush_cache, - .nlgroup = RTNLGRP_DECnet_RULE, - .policy = dn_fib_rule_policy, - .owner = THIS_MODULE, - .fro_net = &init_net, -}; - -void __init dn_fib_rules_init(void) -{ - dn_fib_rules_ops = - fib_rules_register(&dn_fib_rules_ops_template, &init_net); - BUG_ON(IS_ERR(dn_fib_rules_ops)); - BUG_ON(fib_default_rule_add(dn_fib_rules_ops, 0x7fff, - RT_TABLE_MAIN, 0)); -} - -void __exit dn_fib_rules_cleanup(void) -{ - rtnl_lock(); - fib_rules_unregister(dn_fib_rules_ops); - rtnl_unlock(); - rcu_barrier(); -} diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c deleted file mode 100644 index 33fefb0aebca..000000000000 --- a/net/decnet/dn_table.c +++ /dev/null @@ -1,929 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Routing Forwarding Information Base (Routing Tables) - * - * Author: Steve Whitehouse <SteveW@ACM.org> - * Mostly copied from the IPv4 routing code - * - * - * Changes: - * - */ -#include <linux/string.h> -#include <linux/net.h> -#include <linux/socket.h> -#include <linux/slab.h> -#include <linux/sockios.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/netdevice.h> -#include <linux/timer.h> -#include <linux/spinlock.h> -#include <linux/atomic.h> -#include <linux/uaccess.h> -#include <linux/route.h> /* RTF_xxx */ -#include <net/neighbour.h> -#include <net/netlink.h> -#include <net/tcp.h> -#include <net/dst.h> -#include <net/flow.h> -#include <net/fib_rules.h> -#include <net/dn.h> -#include <net/dn_route.h> -#include <net/dn_fib.h> -#include <net/dn_neigh.h> -#include <net/dn_dev.h> - -struct dn_zone -{ - struct dn_zone *dz_next; - struct dn_fib_node **dz_hash; - int dz_nent; - int dz_divisor; - u32 dz_hashmask; -#define DZ_HASHMASK(dz) ((dz)->dz_hashmask) - int dz_order; - __le16 dz_mask; -#define DZ_MASK(dz) ((dz)->dz_mask) -}; - -struct dn_hash -{ - struct dn_zone *dh_zones[17]; - struct dn_zone *dh_zone_list; -}; - -#define dz_key_0(key) ((key).datum = 0) - -#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ - for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) - -#define endfor_nexthops(fi) } - -#define DN_MAX_DIVISOR 1024 -#define DN_S_ZOMBIE 1 -#define DN_S_ACCESSED 2 - -#define DN_FIB_SCAN(f, fp) \ -for( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next) - -#define DN_FIB_SCAN_KEY(f, fp, key) \ -for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next) - -#define RT_TABLE_MIN 1 -#define DN_FIB_TABLE_HASHSZ 256 -static struct hlist_head dn_fib_table_hash[DN_FIB_TABLE_HASHSZ]; -static DEFINE_RWLOCK(dn_fib_tables_lock); - -static struct kmem_cache *dn_hash_kmem __read_mostly; -static int dn_fib_hash_zombies; - -static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz) -{ - u16 h = le16_to_cpu(key.datum)>>(16 - dz->dz_order); - h ^= (h >> 10); - h ^= (h >> 6); - h &= DZ_HASHMASK(dz); - return *(dn_fib_idx_t *)&h; -} - -static inline dn_fib_key_t dz_key(__le16 dst, struct dn_zone *dz) -{ - dn_fib_key_t k; - k.datum = dst & DZ_MASK(dz); - return k; -} - -static inline struct dn_fib_node **dn_chain_p(dn_fib_key_t key, struct dn_zone *dz) -{ - return &dz->dz_hash[dn_hash(key, dz).datum]; -} - -static inline struct dn_fib_node *dz_chain(dn_fib_key_t key, struct dn_zone *dz) -{ - return dz->dz_hash[dn_hash(key, dz).datum]; -} - -static inline int dn_key_eq(dn_fib_key_t a, dn_fib_key_t b) -{ - return a.datum == b.datum; -} - -static inline int dn_key_leq(dn_fib_key_t a, dn_fib_key_t b) -{ - return a.datum <= b.datum; -} - -static inline void dn_rebuild_zone(struct dn_zone *dz, - struct dn_fib_node **old_ht, - int old_divisor) -{ - struct dn_fib_node *f, **fp, *next; - int i; - - for(i = 0; i < old_divisor; i++) { - for(f = old_ht[i]; f; f = next) { - next = f->fn_next; - for(fp = dn_chain_p(f->fn_key, dz); - *fp && dn_key_leq((*fp)->fn_key, f->fn_key); - fp = &(*fp)->fn_next) - /* NOTHING */; - f->fn_next = *fp; - *fp = f; - } - } -} - -static void dn_rehash_zone(struct dn_zone *dz) -{ - struct dn_fib_node **ht, **old_ht; - int old_divisor, new_divisor; - u32 new_hashmask; - - old_divisor = dz->dz_divisor; - - switch (old_divisor) { - case 16: - new_divisor = 256; - new_hashmask = 0xFF; - break; - default: - printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! %d\n", - old_divisor); - /* fall through */ - case 256: - new_divisor = 1024; - new_hashmask = 0x3FF; - break; - } - - ht = kcalloc(new_divisor, sizeof(struct dn_fib_node*), GFP_KERNEL); - if (ht == NULL) - return; - - write_lock_bh(&dn_fib_tables_lock); - old_ht = dz->dz_hash; - dz->dz_hash = ht; - dz->dz_hashmask = new_hashmask; - dz->dz_divisor = new_divisor; - dn_rebuild_zone(dz, old_ht, old_divisor); - write_unlock_bh(&dn_fib_tables_lock); - kfree(old_ht); -} - -static void dn_free_node(struct dn_fib_node *f) -{ - dn_fib_release_info(DN_FIB_INFO(f)); - kmem_cache_free(dn_hash_kmem, f); -} - - -static struct dn_zone *dn_new_zone(struct dn_hash *table, int z) -{ - int i; - struct dn_zone *dz = kzalloc(sizeof(struct dn_zone), GFP_KERNEL); - if (!dz) - return NULL; - - if (z) { - dz->dz_divisor = 16; - dz->dz_hashmask = 0x0F; - } else { - dz->dz_divisor = 1; - dz->dz_hashmask = 0; - } - - dz->dz_hash = kcalloc(dz->dz_divisor, sizeof(struct dn_fib_node *), GFP_KERNEL); - if (!dz->dz_hash) { - kfree(dz); - return NULL; - } - - dz->dz_order = z; - dz->dz_mask = dnet_make_mask(z); - - for(i = z + 1; i <= 16; i++) - if (table->dh_zones[i]) - break; - - write_lock_bh(&dn_fib_tables_lock); - if (i>16) { - dz->dz_next = table->dh_zone_list; - table->dh_zone_list = dz; - } else { - dz->dz_next = table->dh_zones[i]->dz_next; - table->dh_zones[i]->dz_next = dz; - } - table->dh_zones[z] = dz; - write_unlock_bh(&dn_fib_tables_lock); - return dz; -} - - -static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct nlattr *attrs[], struct dn_fib_info *fi) -{ - struct rtnexthop *nhp; - int nhlen; - - if (attrs[RTA_PRIORITY] && - nla_get_u32(attrs[RTA_PRIORITY]) != fi->fib_priority) - return 1; - - if (attrs[RTA_OIF] || attrs[RTA_GATEWAY]) { - if ((!attrs[RTA_OIF] || nla_get_u32(attrs[RTA_OIF]) == fi->fib_nh->nh_oif) && - (!attrs[RTA_GATEWAY] || nla_get_le16(attrs[RTA_GATEWAY]) != fi->fib_nh->nh_gw)) - return 0; - return 1; - } - - if (!attrs[RTA_MULTIPATH]) - return 0; - - nhp = nla_data(attrs[RTA_MULTIPATH]); - nhlen = nla_len(attrs[RTA_MULTIPATH]); - - for_nexthops(fi) { - int attrlen = nhlen - sizeof(struct rtnexthop); - __le16 gw; - - if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) - return -EINVAL; - if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif) - return 1; - if (attrlen) { - struct nlattr *gw_attr; - - gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY); - gw = gw_attr ? nla_get_le16(gw_attr) : 0; - - if (gw && gw != nh->nh_gw) - return 1; - } - nhp = RTNH_NEXT(nhp); - } endfor_nexthops(fi); - - return 0; -} - -static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi) -{ - size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) - + nla_total_size(4) /* RTA_TABLE */ - + nla_total_size(2) /* RTA_DST */ - + nla_total_size(4) /* RTA_PRIORITY */ - + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ - - /* space for nested metrics */ - payload += nla_total_size((RTAX_MAX * nla_total_size(4))); - - if (fi->fib_nhs) { - /* Also handles the special case fib_nhs == 1 */ - - /* each nexthop is packed in an attribute */ - size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); - - /* may contain a gateway attribute */ - nhsize += nla_total_size(4); - - /* all nexthops are packed in a nested attribute */ - payload += nla_total_size(fi->fib_nhs * nhsize); - } - - return payload; -} - -static int dn_fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, void *dst, int dst_len, - struct dn_fib_info *fi, unsigned int flags) -{ - struct rtmsg *rtm; - struct nlmsghdr *nlh; - - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); - if (!nlh) - return -EMSGSIZE; - - rtm = nlmsg_data(nlh); - rtm->rtm_family = AF_DECnet; - rtm->rtm_dst_len = dst_len; - rtm->rtm_src_len = 0; - rtm->rtm_tos = 0; - rtm->rtm_table = tb_id; - rtm->rtm_flags = fi->fib_flags; - rtm->rtm_scope = scope; - rtm->rtm_type = type; - rtm->rtm_protocol = fi->fib_protocol; - - if (nla_put_u32(skb, RTA_TABLE, tb_id) < 0) - goto errout; - - if (rtm->rtm_dst_len && - nla_put(skb, RTA_DST, 2, dst) < 0) - goto errout; - - if (fi->fib_priority && - nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority) < 0) - goto errout; - - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) - goto errout; - - if (fi->fib_nhs == 1) { - if (fi->fib_nh->nh_gw && - nla_put_le16(skb, RTA_GATEWAY, fi->fib_nh->nh_gw) < 0) - goto errout; - - if (fi->fib_nh->nh_oif && - nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif) < 0) - goto errout; - } - - if (fi->fib_nhs > 1) { - struct rtnexthop *nhp; - struct nlattr *mp_head; - - mp_head = nla_nest_start_noflag(skb, RTA_MULTIPATH); - if (!mp_head) - goto errout; - - for_nexthops(fi) { - if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) - goto errout; - - nhp->rtnh_flags = nh->nh_flags & 0xFF; - nhp->rtnh_hops = nh->nh_weight - 1; - nhp->rtnh_ifindex = nh->nh_oif; - - if (nh->nh_gw && - nla_put_le16(skb, RTA_GATEWAY, nh->nh_gw) < 0) - goto errout; - - nhp->rtnh_len = skb_tail_pointer(skb) - (unsigned char *)nhp; - } endfor_nexthops(fi); - - nla_nest_end(skb, mp_head); - } - - nlmsg_end(skb, nlh); - return 0; - -errout: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - - -static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id, - struct nlmsghdr *nlh, struct netlink_skb_parms *req) -{ - struct sk_buff *skb; - u32 portid = req ? req->portid : 0; - int err = -ENOBUFS; - - skb = nlmsg_new(dn_fib_nlmsg_size(DN_FIB_INFO(f)), GFP_KERNEL); - if (skb == NULL) - goto errout; - - err = dn_fib_dump_info(skb, portid, nlh->nlmsg_seq, event, tb_id, - f->fn_type, f->fn_scope, &f->fn_key, z, - DN_FIB_INFO(f), 0); - if (err < 0) { - /* -EMSGSIZE implies BUG in dn_fib_nlmsg_size() */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(skb); - goto errout; - } - rtnl_notify(skb, &init_net, portid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); - return; -errout: - if (err < 0) - rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err); -} - -static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb, - struct netlink_callback *cb, - struct dn_fib_table *tb, - struct dn_zone *dz, - struct dn_fib_node *f) -{ - int i, s_i; - - s_i = cb->args[4]; - for(i = 0; f; i++, f = f->fn_next) { - if (i < s_i) - continue; - if (f->fn_state & DN_S_ZOMBIE) - continue; - if (dn_fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->n, - (f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type, - f->fn_scope, &f->fn_key, dz->dz_order, - f->fn_info, NLM_F_MULTI) < 0) { - cb->args[4] = i; - return -1; - } - } - cb->args[4] = i; - return skb->len; -} - -static __inline__ int dn_hash_dump_zone(struct sk_buff *skb, - struct netlink_callback *cb, - struct dn_fib_table *tb, - struct dn_zone *dz) -{ - int h, s_h; - - s_h = cb->args[3]; - for(h = 0; h < dz->dz_divisor; h++) { - if (h < s_h) - continue; - if (h > s_h) - memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); - if (dz->dz_hash == NULL || dz->dz_hash[h] == NULL) - continue; - if (dn_hash_dump_bucket(skb, cb, tb, dz, dz->dz_hash[h]) < 0) { - cb->args[3] = h; - return -1; - } - } - cb->args[3] = h; - return skb->len; -} - -static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) -{ - int m, s_m; - struct dn_zone *dz; - struct dn_hash *table = (struct dn_hash *)tb->data; - - s_m = cb->args[2]; - read_lock(&dn_fib_tables_lock); - for(dz = table->dh_zone_list, m = 0; dz; dz = dz->dz_next, m++) { - if (m < s_m) - continue; - if (m > s_m) - memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); - - if (dn_hash_dump_zone(skb, cb, tb, dz) < 0) { - cb->args[2] = m; - read_unlock(&dn_fib_tables_lock); - return -1; - } - } - read_unlock(&dn_fib_tables_lock); - cb->args[2] = m; - - return skb->len; -} - -int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - unsigned int h, s_h; - unsigned int e = 0, s_e; - struct dn_fib_table *tb; - int dumped = 0; - - if (!net_eq(net, &init_net)) - return 0; - - if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && - ((struct rtmsg *)nlmsg_data(cb->nlh))->rtm_flags&RTM_F_CLONED) - return dn_cache_dump(skb, cb); - - s_h = cb->args[0]; - s_e = cb->args[1]; - - for (h = s_h; h < DN_FIB_TABLE_HASHSZ; h++, s_h = 0) { - e = 0; - hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist) { - if (e < s_e) - goto next; - if (dumped) - memset(&cb->args[2], 0, sizeof(cb->args) - - 2 * sizeof(cb->args[0])); - if (tb->dump(tb, skb, cb) < 0) - goto out; - dumped = 1; -next: - e++; - } - } -out: - cb->args[1] = e; - cb->args[0] = h; - - return skb->len; -} - -static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[], - struct nlmsghdr *n, struct netlink_skb_parms *req) -{ - struct dn_hash *table = (struct dn_hash *)tb->data; - struct dn_fib_node *new_f, *f, **fp, **del_fp; - struct dn_zone *dz; - struct dn_fib_info *fi; - int z = r->rtm_dst_len; - int type = r->rtm_type; - dn_fib_key_t key; - int err; - - if (z > 16) - return -EINVAL; - - dz = table->dh_zones[z]; - if (!dz && !(dz = dn_new_zone(table, z))) - return -ENOBUFS; - - dz_key_0(key); - if (attrs[RTA_DST]) { - __le16 dst = nla_get_le16(attrs[RTA_DST]); - if (dst & ~DZ_MASK(dz)) - return -EINVAL; - key = dz_key(dst, dz); - } - - if ((fi = dn_fib_create_info(r, attrs, n, &err)) == NULL) - return err; - - if (dz->dz_nent > (dz->dz_divisor << 2) && - dz->dz_divisor > DN_MAX_DIVISOR && - (z==16 || (1<<z) > dz->dz_divisor)) - dn_rehash_zone(dz); - - fp = dn_chain_p(key, dz); - - DN_FIB_SCAN(f, fp) { - if (dn_key_leq(key, f->fn_key)) - break; - } - - del_fp = NULL; - - if (f && (f->fn_state & DN_S_ZOMBIE) && - dn_key_eq(f->fn_key, key)) { - del_fp = fp; - fp = &f->fn_next; - f = *fp; - goto create; - } - - DN_FIB_SCAN_KEY(f, fp, key) { - if (fi->fib_priority <= DN_FIB_INFO(f)->fib_priority) - break; - } - - if (f && dn_key_eq(f->fn_key, key) && - fi->fib_priority == DN_FIB_INFO(f)->fib_priority) { - struct dn_fib_node **ins_fp; - - err = -EEXIST; - if (n->nlmsg_flags & NLM_F_EXCL) - goto out; - - if (n->nlmsg_flags & NLM_F_REPLACE) { - del_fp = fp; - fp = &f->fn_next; - f = *fp; - goto replace; - } - - ins_fp = fp; - err = -EEXIST; - - DN_FIB_SCAN_KEY(f, fp, key) { - if (fi->fib_priority != DN_FIB_INFO(f)->fib_priority) - break; - if (f->fn_type == type && - f->fn_scope == r->rtm_scope && - DN_FIB_INFO(f) == fi) - goto out; - } - - if (!(n->nlmsg_flags & NLM_F_APPEND)) { - fp = ins_fp; - f = *fp; - } - } - -create: - err = -ENOENT; - if (!(n->nlmsg_flags & NLM_F_CREATE)) - goto out; - -replace: - err = -ENOBUFS; - new_f = kmem_cache_zalloc(dn_hash_kmem, GFP_KERNEL); - if (new_f == NULL) - goto out; - - new_f->fn_key = key; - new_f->fn_type = type; - new_f->fn_scope = r->rtm_scope; - DN_FIB_INFO(new_f) = fi; - - new_f->fn_next = f; - write_lock_bh(&dn_fib_tables_lock); - *fp = new_f; - write_unlock_bh(&dn_fib_tables_lock); - dz->dz_nent++; - - if (del_fp) { - f = *del_fp; - write_lock_bh(&dn_fib_tables_lock); - *del_fp = f->fn_next; - write_unlock_bh(&dn_fib_tables_lock); - - if (!(f->fn_state & DN_S_ZOMBIE)) - dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req); - if (f->fn_state & DN_S_ACCESSED) - dn_rt_cache_flush(-1); - dn_free_node(f); - dz->dz_nent--; - } else { - dn_rt_cache_flush(-1); - } - - dn_rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->n, n, req); - - return 0; -out: - dn_fib_release_info(fi); - return err; -} - - -static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[], - struct nlmsghdr *n, struct netlink_skb_parms *req) -{ - struct dn_hash *table = (struct dn_hash*)tb->data; - struct dn_fib_node **fp, **del_fp, *f; - int z = r->rtm_dst_len; - struct dn_zone *dz; - dn_fib_key_t key; - int matched; - - - if (z > 16) - return -EINVAL; - - if ((dz = table->dh_zones[z]) == NULL) - return -ESRCH; - - dz_key_0(key); - if (attrs[RTA_DST]) { - __le16 dst = nla_get_le16(attrs[RTA_DST]); - if (dst & ~DZ_MASK(dz)) - return -EINVAL; - key = dz_key(dst, dz); - } - - fp = dn_chain_p(key, dz); - - DN_FIB_SCAN(f, fp) { - if (dn_key_eq(f->fn_key, key)) - break; - if (dn_key_leq(key, f->fn_key)) - return -ESRCH; - } - - matched = 0; - del_fp = NULL; - DN_FIB_SCAN_KEY(f, fp, key) { - struct dn_fib_info *fi = DN_FIB_INFO(f); - - if (f->fn_state & DN_S_ZOMBIE) - return -ESRCH; - - matched++; - - if (del_fp == NULL && - (!r->rtm_type || f->fn_type == r->rtm_type) && - (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) && - (!r->rtm_protocol || - fi->fib_protocol == r->rtm_protocol) && - dn_fib_nh_match(r, n, attrs, fi) == 0) - del_fp = fp; - } - - if (del_fp) { - f = *del_fp; - dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req); - - if (matched != 1) { - write_lock_bh(&dn_fib_tables_lock); - *del_fp = f->fn_next; - write_unlock_bh(&dn_fib_tables_lock); - - if (f->fn_state & DN_S_ACCESSED) - dn_rt_cache_flush(-1); - dn_free_node(f); - dz->dz_nent--; - } else { - f->fn_state |= DN_S_ZOMBIE; - if (f->fn_state & DN_S_ACCESSED) { - f->fn_state &= ~DN_S_ACCESSED; - dn_rt_cache_flush(-1); - } - if (++dn_fib_hash_zombies > 128) - dn_fib_flush(); - } - - return 0; - } - - return -ESRCH; -} - -static inline int dn_flush_list(struct dn_fib_node **fp, int z, struct dn_hash *table) -{ - int found = 0; - struct dn_fib_node *f; - - while((f = *fp) != NULL) { - struct dn_fib_info *fi = DN_FIB_INFO(f); - - if (fi && ((f->fn_state & DN_S_ZOMBIE) || (fi->fib_flags & RTNH_F_DEAD))) { - write_lock_bh(&dn_fib_tables_lock); - *fp = f->fn_next; - write_unlock_bh(&dn_fib_tables_lock); - - dn_free_node(f); - found++; - continue; - } - fp = &f->fn_next; - } - - return found; -} - -static int dn_fib_table_flush(struct dn_fib_table *tb) -{ - struct dn_hash *table = (struct dn_hash *)tb->data; - struct dn_zone *dz; - int found = 0; - - dn_fib_hash_zombies = 0; - for(dz = table->dh_zone_list; dz; dz = dz->dz_next) { - int i; - int tmp = 0; - for(i = dz->dz_divisor-1; i >= 0; i--) - tmp += dn_flush_list(&dz->dz_hash[i], dz->dz_order, table); - dz->dz_nent -= tmp; - found += tmp; - } - - return found; -} - -static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowidn *flp, struct dn_fib_res *res) -{ - int err; - struct dn_zone *dz; - struct dn_hash *t = (struct dn_hash *)tb->data; - - read_lock(&dn_fib_tables_lock); - for(dz = t->dh_zone_list; dz; dz = dz->dz_next) { - struct dn_fib_node *f; - dn_fib_key_t k = dz_key(flp->daddr, dz); - - for(f = dz_chain(k, dz); f; f = f->fn_next) { - if (!dn_key_eq(k, f->fn_key)) { - if (dn_key_leq(k, f->fn_key)) - break; - else - continue; - } - - f->fn_state |= DN_S_ACCESSED; - - if (f->fn_state&DN_S_ZOMBIE) - continue; - - if (f->fn_scope < flp->flowidn_scope) - continue; - - err = dn_fib_semantic_match(f->fn_type, DN_FIB_INFO(f), flp, res); - - if (err == 0) { - res->type = f->fn_type; - res->scope = f->fn_scope; - res->prefixlen = dz->dz_order; - goto out; - } - if (err < 0) - goto out; - } - } - err = 1; -out: - read_unlock(&dn_fib_tables_lock); - return err; -} - - -struct dn_fib_table *dn_fib_get_table(u32 n, int create) -{ - struct dn_fib_table *t; - unsigned int h; - - if (n < RT_TABLE_MIN) - return NULL; - - if (n > RT_TABLE_MAX) - return NULL; - - h = n & (DN_FIB_TABLE_HASHSZ - 1); - rcu_read_lock(); - hlist_for_each_entry_rcu(t, &dn_fib_table_hash[h], hlist) { - if (t->n == n) { - rcu_read_unlock(); - return t; - } - } - rcu_read_unlock(); - - if (!create) - return NULL; - - if (in_interrupt()) { - net_dbg_ratelimited("DECnet: BUG! Attempt to create routing table from interrupt\n"); - return NULL; - } - - t = kzalloc(sizeof(struct dn_fib_table) + sizeof(struct dn_hash), - GFP_KERNEL); - if (t == NULL) - return NULL; - - t->n = n; - t->insert = dn_fib_table_insert; - t->delete = dn_fib_table_delete; - t->lookup = dn_fib_table_lookup; - t->flush = dn_fib_table_flush; - t->dump = dn_fib_table_dump; - hlist_add_head_rcu(&t->hlist, &dn_fib_table_hash[h]); - - return t; -} - -struct dn_fib_table *dn_fib_empty_table(void) -{ - u32 id; - - for(id = RT_TABLE_MIN; id <= RT_TABLE_MAX; id++) - if (dn_fib_get_table(id, 0) == NULL) - return dn_fib_get_table(id, 1); - return NULL; -} - -void dn_fib_flush(void) -{ - int flushed = 0; - struct dn_fib_table *tb; - unsigned int h; - - for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) { - hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist) - flushed += tb->flush(tb); - } - - if (flushed) - dn_rt_cache_flush(-1); -} - -void __init dn_fib_table_init(void) -{ - dn_hash_kmem = kmem_cache_create("dn_fib_info_cache", - sizeof(struct dn_fib_info), - 0, SLAB_HWCACHE_ALIGN, - NULL); -} - -void __exit dn_fib_table_cleanup(void) -{ - struct dn_fib_table *t; - struct hlist_node *next; - unsigned int h; - - write_lock(&dn_fib_tables_lock); - for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) { - hlist_for_each_entry_safe(t, next, &dn_fib_table_hash[h], - hlist) { - hlist_del(&t->hlist); - kfree(t); - } - } - write_unlock(&dn_fib_tables_lock); -} diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c deleted file mode 100644 index aa4155875ca8..000000000000 --- a/net/decnet/dn_timer.c +++ /dev/null @@ -1,104 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Socket Timer Functions - * - * Author: Steve Whitehouse <SteveW@ACM.org> - * - * - * Changes: - * Steve Whitehouse : Made keepalive timer part of the same - * timer idea. - * Steve Whitehouse : Added checks for sk->sock_readers - * David S. Miller : New socket locking - * Steve Whitehouse : Timer grabs socket ref. - */ -#include <linux/net.h> -#include <linux/socket.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/timer.h> -#include <linux/spinlock.h> -#include <net/sock.h> -#include <linux/atomic.h> -#include <linux/jiffies.h> -#include <net/flow.h> -#include <net/dn.h> - -/* - * Slow timer is for everything else (n * 500mS) - */ - -#define SLOW_INTERVAL (HZ/2) - -static void dn_slow_timer(struct timer_list *t); - -void dn_start_slow_timer(struct sock *sk) -{ - timer_setup(&sk->sk_timer, dn_slow_timer, 0); - sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL); -} - -void dn_stop_slow_timer(struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} - -static void dn_slow_timer(struct timer_list *t) -{ - struct sock *sk = from_timer(sk, t, sk_timer); - struct dn_scp *scp = DN_SK(sk); - - bh_lock_sock(sk); - - if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 10); - goto out; - } - - /* - * The persist timer is the standard slow timer used for retransmits - * in both connection establishment and disconnection as well as - * in the RUN state. The different states are catered for by changing - * the function pointer in the socket. Setting the timer to a value - * of zero turns it off. We allow the persist_fxn to turn the - * timer off in a permant way by returning non-zero, so that - * timer based routines may remove sockets. This is why we have a - * sock_hold()/sock_put() around the timer to prevent the socket - * going away in the middle. - */ - if (scp->persist && scp->persist_fxn) { - if (scp->persist <= SLOW_INTERVAL) { - scp->persist = 0; - - if (scp->persist_fxn(sk)) - goto out; - } else { - scp->persist -= SLOW_INTERVAL; - } - } - - /* - * Check for keepalive timeout. After the other timer 'cos if - * the previous timer caused a retransmit, we don't need to - * do this. scp->stamp is the last time that we sent a packet. - * The keepalive function sends a link service packet to the - * other end. If it remains unacknowledged, the standard - * socket timers will eventually shut the socket down. Each - * time we do this, scp->stamp will be updated, thus - * we won't try and send another until scp->keepalive has passed - * since the last successful transmission. - */ - if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) { - if (time_after_eq(jiffies, scp->stamp + scp->keepalive)) - scp->keepalive_fxn(sk); - } - - sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL); -out: - bh_unlock_sock(sk); - sock_put(sk); -} diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig deleted file mode 100644 index 14ec4ef95fab..000000000000 --- a/net/decnet/netfilter/Kconfig +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# DECnet netfilter configuration -# - -menu "DECnet: Netfilter Configuration" - depends on DECNET && NETFILTER - depends on NETFILTER_ADVANCED - -config DECNET_NF_GRABULATOR - tristate "Routing message grabulator (for userland routing daemon)" - help - Enable this module if you want to use the userland DECnet routing - daemon. You will also need to enable routing support for DECnet - unless you just want to monitor routing messages from other nodes. - -endmenu diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile deleted file mode 100644 index 429c84289d0f..000000000000 --- a/net/decnet/netfilter/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Makefile for DECnet netfilter modules -# - -obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c deleted file mode 100644 index dc705769acc9..000000000000 --- a/net/decnet/netfilter/dn_rtmsg.c +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Routing Message Grabulator - * - * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/ - * - * Author: Steven Whitehouse <steve@chygwyn.com> - */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/netdevice.h> -#include <linux/netfilter.h> -#include <linux/spinlock.h> -#include <net/netlink.h> -#include <linux/netfilter_decnet.h> - -#include <net/sock.h> -#include <net/flow.h> -#include <net/dn.h> -#include <net/dn_route.h> - -static struct sock *dnrmg = NULL; - - -static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp) -{ - struct sk_buff *skb = NULL; - size_t size; - sk_buff_data_t old_tail; - struct nlmsghdr *nlh; - unsigned char *ptr; - struct nf_dn_rtmsg *rtm; - - size = NLMSG_ALIGN(rt_skb->len) + - NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg)); - skb = nlmsg_new(size, GFP_ATOMIC); - if (!skb) { - *errp = -ENOMEM; - return NULL; - } - old_tail = skb->tail; - nlh = nlmsg_put(skb, 0, 0, 0, size, 0); - if (!nlh) { - kfree_skb(skb); - *errp = -ENOMEM; - return NULL; - } - rtm = (struct nf_dn_rtmsg *)nlmsg_data(nlh); - rtm->nfdn_ifindex = rt_skb->dev->ifindex; - ptr = NFDN_RTMSG(rtm); - skb_copy_from_linear_data(rt_skb, ptr, rt_skb->len); - nlh->nlmsg_len = skb->tail - old_tail; - return skb; -} - -static void dnrmg_send_peer(struct sk_buff *skb) -{ - struct sk_buff *skb2; - int status = 0; - int group = 0; - unsigned char flags = *skb->data; - - switch (flags & DN_RT_CNTL_MSK) { - case DN_RT_PKT_L1RT: - group = DNRNG_NLGRP_L1; - break; - case DN_RT_PKT_L2RT: - group = DNRNG_NLGRP_L2; - break; - default: - return; - } - - skb2 = dnrmg_build_message(skb, &status); - if (skb2 == NULL) - return; - NETLINK_CB(skb2).dst_group = group; - netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC); -} - - -static unsigned int dnrmg_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - dnrmg_send_peer(skb); - return NF_ACCEPT; -} - - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err), NULL); return; } while (0) - -static inline void dnrmg_receive_user_skb(struct sk_buff *skb) -{ - struct nlmsghdr *nlh = nlmsg_hdr(skb); - - if (skb->len < sizeof(*nlh) || - nlh->nlmsg_len < sizeof(*nlh) || - skb->len < nlh->nlmsg_len) - return; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - RCV_SKB_FAIL(-EPERM); - - /* Eventually we might send routing messages too */ - - RCV_SKB_FAIL(-EINVAL); -} - -static const struct nf_hook_ops dnrmg_ops = { - .hook = dnrmg_hook, - .pf = NFPROTO_DECNET, - .hooknum = NF_DN_ROUTE, - .priority = NF_DN_PRI_DNRTMSG, -}; - -static int __init dn_rtmsg_init(void) -{ - int rv = 0; - struct netlink_kernel_cfg cfg = { - .groups = DNRNG_NLGRP_MAX, - .input = dnrmg_receive_user_skb, - }; - - dnrmg = netlink_kernel_create(&init_net, NETLINK_DNRTMSG, &cfg); - if (dnrmg == NULL) { - printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); - return -ENOMEM; - } - - rv = nf_register_net_hook(&init_net, &dnrmg_ops); - if (rv) { - netlink_kernel_release(dnrmg); - } - - return rv; -} - -static void __exit dn_rtmsg_fini(void) -{ - nf_unregister_net_hook(&init_net, &dnrmg_ops); - netlink_kernel_release(dnrmg); -} - - -MODULE_DESCRIPTION("DECnet Routing Message Grabulator"); -MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG); - -module_init(dn_rtmsg_init); -module_exit(dn_rtmsg_fini); diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c deleted file mode 100644 index 55bf64a22b59..000000000000 --- a/net/decnet/sysctl_net_decnet.c +++ /dev/null @@ -1,373 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet sysctl support functions - * - * Author: Steve Whitehouse <SteveW@ACM.org> - * - * - * Changes: - * Steve Whitehouse - C99 changes and default device handling - * Steve Whitehouse - Memory buffer settings, like the tcp ones - * - */ -#include <linux/mm.h> -#include <linux/sysctl.h> -#include <linux/fs.h> -#include <linux/netdevice.h> -#include <linux/string.h> -#include <net/neighbour.h> -#include <net/dst.h> -#include <net/flow.h> - -#include <linux/uaccess.h> - -#include <net/dn.h> -#include <net/dn_dev.h> -#include <net/dn_route.h> - - -int decnet_debug_level; -int decnet_time_wait = 30; -int decnet_dn_count = 1; -int decnet_di_count = 3; -int decnet_dr_count = 3; -int decnet_log_martians = 1; -int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; - -/* Reasonable defaults, I hope, based on tcp's defaults */ -long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; -int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; -int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; - -#ifdef CONFIG_SYSCTL -extern int decnet_dst_gc_interval; -static int min_decnet_time_wait[] = { 5 }; -static int max_decnet_time_wait[] = { 600 }; -static int min_state_count[] = { 1 }; -static int max_state_count[] = { NSP_MAXRXTSHIFT }; -static int min_decnet_dst_gc_interval[] = { 1 }; -static int max_decnet_dst_gc_interval[] = { 60 }; -static int min_decnet_no_fc_max_cwnd[] = { NSP_MIN_WINDOW }; -static int max_decnet_no_fc_max_cwnd[] = { NSP_MAX_WINDOW }; -static char node_name[7] = "???"; - -static struct ctl_table_header *dn_table_header = NULL; - -/* - * ctype.h :-) - */ -#define ISNUM(x) (((x) >= '0') && ((x) <= '9')) -#define ISLOWER(x) (((x) >= 'a') && ((x) <= 'z')) -#define ISUPPER(x) (((x) >= 'A') && ((x) <= 'Z')) -#define ISALPHA(x) (ISLOWER(x) || ISUPPER(x)) -#define INVALID_END_CHAR(x) (ISNUM(x) || ISALPHA(x)) - -static void strip_it(char *str) -{ - for(;;) { - switch (*str) { - case ' ': - case '\n': - case '\r': - case ':': - *str = 0; - /* Fallthrough */ - case 0: - return; - } - str++; - } -} - -/* - * Simple routine to parse an ascii DECnet address - * into a network order address. - */ -static int parse_addr(__le16 *addr, char *str) -{ - __u16 area, node; - - while(*str && !ISNUM(*str)) str++; - - if (*str == 0) - return -1; - - area = (*str++ - '0'); - if (ISNUM(*str)) { - area *= 10; - area += (*str++ - '0'); - } - - if (*str++ != '.') - return -1; - - if (!ISNUM(*str)) - return -1; - - node = *str++ - '0'; - if (ISNUM(*str)) { - node *= 10; - node += (*str++ - '0'); - } - if (ISNUM(*str)) { - node *= 10; - node += (*str++ - '0'); - } - if (ISNUM(*str)) { - node *= 10; - node += (*str++ - '0'); - } - - if ((node > 1023) || (area > 63)) - return -1; - - if (INVALID_END_CHAR(*str)) - return -1; - - *addr = cpu_to_le16((area << 10) | node); - - return 0; -} - -static int dn_node_address_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - char addr[DN_ASCBUF_LEN]; - size_t len; - __le16 dnaddr; - - if (!*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - if (write) { - len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); - - if (copy_from_user(addr, buffer, len)) - return -EFAULT; - - addr[len] = 0; - strip_it(addr); - - if (parse_addr(&dnaddr, addr)) - return -EINVAL; - - dn_dev_devices_off(); - - decnet_address = dnaddr; - - dn_dev_devices_on(); - - *ppos += len; - - return 0; - } - - dn_addr2asc(le16_to_cpu(decnet_address), addr); - len = strlen(addr); - addr[len++] = '\n'; - - if (len > *lenp) len = *lenp; - - if (copy_to_user(buffer, addr, len)) - return -EFAULT; - - *lenp = len; - *ppos += len; - - return 0; -} - -static int dn_def_dev_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - size_t len; - struct net_device *dev; - char devname[17]; - - if (!*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - if (write) { - if (*lenp > 16) - return -E2BIG; - - if (copy_from_user(devname, buffer, *lenp)) - return -EFAULT; - - devname[*lenp] = 0; - strip_it(devname); - - dev = dev_get_by_name(&init_net, devname); - if (dev == NULL) - return -ENODEV; - - if (dev->dn_ptr == NULL) { - dev_put(dev); - return -ENODEV; - } - - if (dn_dev_set_default(dev, 1)) { - dev_put(dev); - return -ENODEV; - } - *ppos += *lenp; - - return 0; - } - - dev = dn_dev_get_default(); - if (dev == NULL) { - *lenp = 0; - return 0; - } - - strcpy(devname, dev->name); - dev_put(dev); - len = strlen(devname); - devname[len++] = '\n'; - - if (len > *lenp) len = *lenp; - - if (copy_to_user(buffer, devname, len)) - return -EFAULT; - - *lenp = len; - *ppos += len; - - return 0; -} - -static struct ctl_table dn_table[] = { - { - .procname = "node_address", - .maxlen = 7, - .mode = 0644, - .proc_handler = dn_node_address_handler, - }, - { - .procname = "node_name", - .data = node_name, - .maxlen = 7, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "default_device", - .maxlen = 16, - .mode = 0644, - .proc_handler = dn_def_dev_handler, - }, - { - .procname = "time_wait", - .data = &decnet_time_wait, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_decnet_time_wait, - .extra2 = &max_decnet_time_wait - }, - { - .procname = "dn_count", - .data = &decnet_dn_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_state_count, - .extra2 = &max_state_count - }, - { - .procname = "di_count", - .data = &decnet_di_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_state_count, - .extra2 = &max_state_count - }, - { - .procname = "dr_count", - .data = &decnet_dr_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_state_count, - .extra2 = &max_state_count - }, - { - .procname = "dst_gc_interval", - .data = &decnet_dst_gc_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_decnet_dst_gc_interval, - .extra2 = &max_decnet_dst_gc_interval - }, - { - .procname = "no_fc_max_cwnd", - .data = &decnet_no_fc_max_cwnd, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_decnet_no_fc_max_cwnd, - .extra2 = &max_decnet_no_fc_max_cwnd - }, - { - .procname = "decnet_mem", - .data = &sysctl_decnet_mem, - .maxlen = sizeof(sysctl_decnet_mem), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax - }, - { - .procname = "decnet_rmem", - .data = &sysctl_decnet_rmem, - .maxlen = sizeof(sysctl_decnet_rmem), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "decnet_wmem", - .data = &sysctl_decnet_wmem, - .maxlen = sizeof(sysctl_decnet_wmem), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "debug", - .data = &decnet_debug_level, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; - -void dn_register_sysctl(void) -{ - dn_table_header = register_net_sysctl(&init_net, "net/decnet", dn_table); -} - -void dn_unregister_sysctl(void) -{ - unregister_net_sysctl_table(dn_table_header); -} - -#else /* CONFIG_SYSCTL */ -void dn_unregister_sysctl(void) -{ -} -void dn_register_sysctl(void) -{ -} - -#endif diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 73605bcbb385..7354c5db3a14 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -62,7 +62,8 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, if (!skb->dev) return NULL; - pskb_trim_rcsum(skb, skb->len - len); + if (pskb_trim_rcsum(skb, skb->len - len)) + return NULL; skb->offload_fwd_mark = true; diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 12f3ce52e62e..836a75030a52 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -48,8 +48,8 @@ static void sja1105_meta_unpack(const struct sk_buff *skb, * a unified unpacking command for both device series. */ packing(buf, &meta->tstamp, 31, 0, 4, UNPACK, 0); - packing(buf + 4, &meta->dmac_byte_4, 7, 0, 1, UNPACK, 0); - packing(buf + 5, &meta->dmac_byte_3, 7, 0, 1, UNPACK, 0); + packing(buf + 4, &meta->dmac_byte_3, 7, 0, 1, UNPACK, 0); + packing(buf + 5, &meta->dmac_byte_4, 7, 0, 1, UNPACK, 0); packing(buf + 6, &meta->source_port, 7, 0, 1, UNPACK, 0); packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0); } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9040fe55e0f5..2566fff87001 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -164,17 +164,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); - if (unlikely(!ether_addr_equal_64bits(eth->h_dest, - dev->dev_addr))) { - if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { - if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) - skb->pkt_type = PACKET_BROADCAST; - else - skb->pkt_type = PACKET_MULTICAST; - } else { - skb->pkt_type = PACKET_OTHERHOST; - } - } + eth_skb_pkt_type(skb, dev); /* * Some variants of DSA tagging don't have an ethertype field diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index bf3ecf792688..7073724fdfa6 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -198,17 +198,18 @@ static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, struct hsr_node *node_src) { bool was_multicast_frame; - int res; + int res, recv_len; was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST); hsr_addr_subst_source(node_src, skb); skb_pull(skb, ETH_HLEN); + recv_len = skb->len; res = netif_rx(skb); if (res == NET_RX_DROP) { dev->stats.rx_dropped++; } else { dev->stats.rx_packets++; - dev->stats.rx_bytes += skb->len; + dev->stats.rx_bytes += recv_len; if (was_multicast_frame) dev->stats.multicast++; } diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 4a9200729a32..1a0f447113f2 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -198,6 +198,10 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { + /* Check if skb contains hsr_ethhdr */ + if (skb->mac_len < sizeof(struct hsr_ethhdr)) + return NULL; + /* Use the existing sequence_nr from the tag as starting point * for filtering duplicate frames. */ @@ -269,9 +273,12 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); - list_del_rcu(&node_curr->mac_list); + if (!node_curr->removed) { + list_del_rcu(&node_curr->mac_list); + node_curr->removed = true; + kfree_rcu(node_curr, rcu_head); + } spin_unlock_bh(&hsr->list_lock); - kfree_rcu(node_curr, rcu_head); done: skb_push(skb, sizeof(struct hsrv1_ethhdr_sp)); @@ -436,9 +443,12 @@ void hsr_prune_nodes(struct timer_list *t) if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { hsr_nl_nodedown(hsr, node->macaddress_A); - list_del_rcu(&node->mac_list); - /* Note that we need to free this entry later: */ - kfree_rcu(node, rcu_head); + if (!node->removed) { + list_del_rcu(&node->mac_list); + node->removed = true; + /* Note that we need to free this entry later: */ + kfree_rcu(node, rcu_head); + } } } spin_unlock_bh(&hsr->list_lock); diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 0f0fa12b4329..01f4ef4ae494 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -56,6 +56,7 @@ struct hsr_node { unsigned long time_in[HSR_PT_PORTS]; bool time_in_stale[HSR_PT_PORTS]; u16 seq_out[HSR_PT_PORTS]; + bool removed; struct rcu_head rcu_head; }; diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 9e389accbfc7..ea627e532aab 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -113,14 +113,21 @@ static struct notifier_block hsr_nb = { static int __init hsr_init(void) { - int res; + int err; BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN); - register_netdevice_notifier(&hsr_nb); - res = hsr_netlink_init(); + err = register_netdevice_notifier(&hsr_nb); + if (err) + return err; + + err = hsr_netlink_init(); + if (err) { + unregister_netdevice_notifier(&hsr_nb); + return err; + } - return res; + return 0; } static void __exit hsr_exit(void) diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 9a675ba0bf0a..ce5f25c89dfa 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -201,8 +201,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) int err = 0; struct net_device *dev = NULL; - if (len < sizeof(*uaddr)) - return -EINVAL; + err = ieee802154_sockaddr_check_size(uaddr, len); + if (err < 0) + return err; uaddr = (struct sockaddr_ieee802154 *)_uaddr; if (uaddr->family != AF_IEEE802154) @@ -272,6 +273,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) err = -EMSGSIZE; goto out_dev; } + if (!size) { + err = 0; + goto out_dev; + } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; @@ -498,11 +503,14 @@ static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) ro->bound = 0; - if (len < sizeof(*addr)) + err = ieee802154_sockaddr_check_size(addr, len); + if (err < 0) goto out; - if (addr->family != AF_IEEE802154) + if (addr->family != AF_IEEE802154) { + err = -EINVAL; goto out; + } ieee802154_addr_from_sa(&haddr, &addr->addr); dev = ieee802154_get_dev(sock_net(sk), &haddr); @@ -569,8 +577,9 @@ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, struct dgram_sock *ro = dgram_sk(sk); int err = 0; - if (len < sizeof(*addr)) - return -EINVAL; + err = ieee802154_sockaddr_check_size(addr, len); + if (err < 0) + return err; if (addr->family != AF_IEEE802154) return -EINVAL; @@ -609,6 +618,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) struct ieee802154_mac_cb *cb; struct dgram_sock *ro = dgram_sk(sk); struct ieee802154_addr dst_addr; + DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); int hlen, tlen; int err; @@ -617,10 +627,20 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) return -EOPNOTSUPP; } - if (!ro->connected && !msg->msg_name) - return -EDESTADDRREQ; - else if (ro->connected && msg->msg_name) - return -EISCONN; + if (msg->msg_name) { + if (ro->connected) + return -EISCONN; + if (msg->msg_namelen < IEEE802154_MIN_NAMELEN) + return -EINVAL; + err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen); + if (err < 0) + return err; + ieee802154_addr_from_sa(&dst_addr, &daddr->addr); + } else { + if (!ro->connected) + return -EDESTADDRREQ; + dst_addr = ro->dst_addr; + } if (!ro->bound) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); @@ -656,16 +676,6 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) cb = mac_cb_init(skb); cb->type = IEEE802154_FC_TYPE_DATA; cb->ackreq = ro->want_ack; - - if (msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_ieee802154*, - daddr, msg->msg_name); - - ieee802154_addr_from_sa(&dst_addr, &daddr->addr); - } else { - dst_addr = ro->dst_addr; - } - cb->secen = ro->secen; cb->secen_override = ro->secen_override; cb->seclevel = ro->seclevel; diff --git a/net/ife/ife.c b/net/ife/ife.c index 13bbf8cb6a39..be05b690b9ef 100644 --- a/net/ife/ife.c +++ b/net/ife/ife.c @@ -82,6 +82,7 @@ void *ife_decode(struct sk_buff *skb, u16 *metalen) if (unlikely(!pskb_may_pull(skb, total_pull))) return NULL; + ifehdr = (struct ifeheadr *)(skb->data + skb->dev->hard_header_len); skb_set_mac_header(skb, total_pull); __skb_pull(skb, total_pull); *metalen = ifehdrln - IFE_METAHDRLEN; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index a926de2e42b5..f5af8c6b2f87 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -389,6 +389,16 @@ config INET_IPCOMP If unsure, say Y. +config INET_TABLE_PERTURB_ORDER + int "INET: Source port perturbation table size (as power of 2)" if EXPERT + default 16 + help + Source port perturbation table size (as power of 2) for + RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm. + + The default is almost always what you want. + Only change this if you know what you are doing. + config INET_XFRM_TUNNEL tristate select INET_TUNNEL diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d61ca7be6eda..d7ebee3c048d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -157,7 +157,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); - dst_release(sk->sk_rx_dst); + dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -578,6 +578,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; + sk->sk_wait_pending++; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -593,6 +594,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) } remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; + sk->sk_wait_pending--; return timeo; } @@ -874,7 +876,7 @@ int inet_shutdown(struct socket *sock, int how) EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ /* fall through */ default: - sk->sk_shutdown |= how; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how); if (sk->sk_prot->shutdown) sk->sk_prot->shutdown(sk, how); break; @@ -1570,10 +1572,12 @@ EXPORT_SYMBOL(inet_current_timestamp); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { - if (sk->sk_family == AF_INET) + unsigned int family = READ_ONCE(sk->sk_family); + + if (family == AF_INET) return ip_recv_error(sk, msg, len, addr_len); #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) + if (family == AF_INET6) return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); #endif return -EINVAL; @@ -1702,12 +1706,7 @@ static const struct net_protocol igmp_protocol = { }; #endif -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct net_protocol tcp_protocol = { - .early_demux = tcp_v4_early_demux, - .early_demux_handler = tcp_v4_early_demux, +static const struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, @@ -1715,12 +1714,7 @@ static struct net_protocol tcp_protocol = { .icmp_strict_tag_validation = 1, }; -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct net_protocol udp_protocol = { - .early_demux = udp_v4_early_demux, - .early_demux_handler = udp_v4_early_demux, +static const struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 4a8ad46397c0..ed00b233cee2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -351,14 +351,14 @@ static void __inet_del_ifa(struct in_device *in_dev, { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; - struct in_ifaddr *last_prim; + struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); - last_prim = rtnl_dereference(in_dev->ifa_list); + last_prim = ifap; if (in_dev->dead) goto no_promotions; @@ -372,7 +372,7 @@ static void __inet_del_ifa(struct in_device *in_dev, while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) - last_prim = ifa; + last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || @@ -436,9 +436,9 @@ no_promotions: rcu_assign_pointer(prev_prom->ifa_next, next_sec); - last_sec = rtnl_dereference(last_prim->ifa_next); + last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); - rcu_assign_pointer(last_prim->ifa_next, promote); + rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; @@ -1798,6 +1798,21 @@ done: return err; } +/* Combine dev_addr_genid and dev_base_seq to detect changes. + */ +static u32 inet_base_seq(const struct net *net) +{ + u32 res = atomic_read(&net->ipv4.dev_addr_genid) + + net->dev_base_seq; + + /* Must not return 0 (see nl_dump_check_consistent()). + * Chose a value far away from 0. + */ + if (!res) + res = 0x80000000; + return res; +} + static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; @@ -1849,8 +1864,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; head = &tgt_net->dev_index_head[h]; rcu_read_lock(); - cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^ - tgt_net->dev_base_seq; + cb->seq = inet_base_seq(tgt_net); hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; @@ -2249,8 +2263,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, idx = 0; head = &net->dev_index_head[h]; rcu_read_lock(); - cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ - net->dev_base_seq; + cb->seq = inet_base_seq(net); hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f555dd4bac65..9a8f0e36bbf9 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -567,7 +567,9 @@ static inline int esp_remove_trailer(struct sk_buff *skb) skb->csum = csum_block_sub(skb->csum, csumdiff, skb->len - trimlen); } - pskb_trim(skb, skb->len - trimlen); + ret = pskb_trim(skb, skb->len - trimlen); + if (unlikely(ret)) + return ret; ret = nexthdr[1]; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 8c0af30fb067..5cd219d7e746 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -283,6 +283,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ef3e7a3e3a29..c31003d8c22f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -399,7 +399,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; return ret; } if (no_addr) @@ -411,7 +411,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; } return ret; @@ -583,6 +583,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, cfg->fc_scope = RT_SCOPE_UNIVERSE; } + if (!cfg->fc_table) + cfg->fc_table = RT_TABLE_MAIN; + if (cmd == SIOCDELRT) return 0; @@ -840,6 +843,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, return -EINVAL; } + if (!cfg->fc_table) + cfg->fc_table = RT_TABLE_MAIN; + return 0; errout: return err; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 28da0443f3e9..2890dbe08d17 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/netlink.h> #include <linux/hash.h> +#include <linux/nospec.h> #include <net/arp.h> #include <net/ip.h> @@ -274,7 +275,8 @@ void fib_release_info(struct fib_info *fi) hlist_del(&nexthop_nh->nh_hash); } endfor_nexthops(fi) } - fi->fib_dead = 1; + /* Paired with READ_ONCE() from fib_table_lookup() */ + WRITE_ONCE(fi->fib_dead, 1); fib_info_put(fi); } spin_unlock_bh(&fib_info_lock); @@ -420,6 +422,7 @@ static struct fib_info *fib_find_info(struct fib_info *nfi) nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && nfi->fib_type == fi->fib_type && + nfi->fib_tb_id == fi->fib_tb_id && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && @@ -875,13 +878,15 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, return 1; } + if (fi->nh) { + if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp) + return 1; + return 0; + } + if (cfg->fc_oif || cfg->fc_gw_family) { struct fib_nh *nh; - /* cannot match on nexthop object attributes */ - if (fi->nh) - return 1; - nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, @@ -1006,6 +1011,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) if (type > RTAX_MAX) return false; + type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; @@ -1327,15 +1333,18 @@ __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { struct fib_nh *nh; + __be32 saddr; if (nhc->nhc_family != AF_INET) return inet_select_addr(nhc->nhc_dev, 0, scope); nh = container_of(nhc, struct fib_nh, nh_common); - nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); - nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); + saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); + + WRITE_ONCE(nh->nh_saddr, saddr); + WRITE_ONCE(nh->nh_saddr_genid, atomic_read(&net->ipv4.dev_addr_genid)); - return nh->nh_saddr; + return saddr; } __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) @@ -1349,8 +1358,9 @@ __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); - if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) - return nh->nh_saddr; + if (READ_ONCE(nh->nh_saddr_genid) == + atomic_read(&net->ipv4.dev_addr_genid)) + return READ_ONCE(nh->nh_saddr); } return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope); @@ -1581,6 +1591,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, link_it: ofi = fib_find_info(fi); if (ofi) { + /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); ofi->fib_treeref++; @@ -1618,6 +1629,7 @@ err_inval: failure: if (fi) { + /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index a1f830da4ad3..7f933ead3bf4 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1448,7 +1448,8 @@ found: } if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) continue; - if (fi->fib_dead) + /* Paired with WRITE_ONCE() in fib_release_info() */ + if (READ_ONCE(fi->fib_dead)) continue; if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index b44f51e404ae..ac82a4158b86 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -754,6 +754,11 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, room = 576; room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); + /* Guard against tiny mtu. We need to include at least one + * IP network header for this message to make any sense. + */ + if (room <= (int)sizeof(struct iphdr)) + goto ende; icmp_param.data_len = skb_in->len - icmp_param.offset; if (icmp_param.data_len > room) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 660b41040c77..715f99e76826 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -218,8 +218,10 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) int tv = prandom_u32() % max_delay; im->tm_running = 1; - if (!mod_timer(&im->timer, jiffies+tv+2)) - refcount_inc(&im->refcnt); + if (refcount_inc_not_zero(&im->refcnt)) { + if (mod_timer(&im->timer, jiffies + tv + 2)) + ip_ma_put(im); + } } static void igmp_gq_start_timer(struct in_device *in_dev) @@ -355,8 +357,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - unsigned int size = mtu; + unsigned int size; + size = min(mtu, IP_MAX_MTU); while (1) { skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); @@ -829,7 +832,7 @@ static void igmp_ifc_event(struct in_device *in_dev) struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv); + WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); igmp_ifc_start_timer(in_dev, 1); } @@ -1011,7 +1014,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, * received value was zero, use the default or statically * configured value. */ - in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; /* RFC3376, 8.3. Query Response Interval: @@ -1191,7 +1194,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1242,9 +1245,11 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) swap(im->tomb, pmc->tomb); swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) - psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: + READ_ONCE(net->ipv4.sysctl_igmp_qrv); } else { - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: + READ_ONCE(net->ipv4.sysctl_igmp_qrv); } in_dev_put(pmc->interface); kfree_pmc(pmc); @@ -1351,7 +1356,7 @@ static void igmp_group_added(struct ip_mc_list *im) if (in_dev->dead) return; - im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; + im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); @@ -1365,7 +1370,7 @@ static void igmp_group_added(struct ip_mc_list *im) * IN() to IN(A). */ if (im->sfmode == MCAST_EXCLUDE) - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); igmp_ifc_event(in_dev); #endif @@ -1756,7 +1761,7 @@ static void ip_mc_reset(struct in_device *in_dev) in_dev->mr_qi = IGMP_QUERY_INTERVAL; in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv); } #else static void ip_mc_reset(struct in_device *in_dev) @@ -1890,7 +1895,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1954,7 +1959,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -2133,7 +2138,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6cbf0db57ad0..091999dbef33 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -560,6 +560,20 @@ void inet_csk_clear_xmit_timers(struct sock *sk) } EXPORT_SYMBOL(inet_csk_clear_xmit_timers); +void inet_csk_clear_xmit_timers_sync(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* ongoing timer handlers need to acquire socket lock. */ + sock_not_owned_by_me(sk); + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; + + sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); + sk_stop_timer_sync(sk, &sk->sk_timer); +} + void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); @@ -824,6 +838,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); + newsk->sk_wait_pending = 0; inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; @@ -902,11 +917,25 @@ void inet_csk_prepare_forced_close(struct sock *sk) } EXPORT_SYMBOL(inet_csk_prepare_forced_close); +static int inet_ulp_can_listen(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ulp_ops) + return -EINVAL; + + return 0; +} + int inet_csk_listen_start(struct sock *sk, int backlog) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); - int err = -EADDRINUSE; + int err; + + err = inet_ulp_can_listen(sk); + if (unlikely(err)) + return err; reqsk_queue_alloc(&icsk->icsk_accept_queue); @@ -918,6 +947,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. */ + err = -EADDRINUSE; inet_sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d9bee15e36a5..e4f2790fd641 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -675,13 +675,13 @@ EXPORT_SYMBOL_GPL(inet_unhash); * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. + * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though - * attacks were since demonstrated, thus we use 65536 instead to really - * give more isolation and privacy, at the expense of 256kB of kernel - * memory. + * attacks were since demonstrated, thus we use 65536 by default instead + * to really give more isolation and privacy, at the expense of 256kB + * of kernel memory. */ -#define INET_TABLE_PERTURB_SHIFT 16 -#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) +#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, @@ -701,17 +701,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 index; if (port) { - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL, NULL); - spin_unlock_bh(&head->lock); - return 0; - } - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ + local_bh_disable(); ret = check_established(death_row, sk, port, NULL); local_bh_enable(); return ret; @@ -725,8 +715,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (likely(remaining > 1)) remaining &= ~1U; - net_get_random_once(table_perturb, - INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); + get_random_slow_once(table_perturb, + INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c411c87ae865..85cb44bfa3ba 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -254,12 +254,12 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); +/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) { - struct inet_timewait_sock *tw; - struct sock *sk; struct hlist_nulls_node *node; unsigned int slot; + struct sock *sk; for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; @@ -268,25 +268,35 @@ restart_rcu: rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { - if (sk->sk_state != TCP_TIME_WAIT) + int state = inet_sk_state_load(sk); + + if ((1 << state) & ~(TCPF_TIME_WAIT | + TCPF_NEW_SYN_RECV)) continue; - tw = inet_twsk(sk); - if ((tw->tw_family != family) || - refcount_read(&twsk_net(tw)->count)) + + if (sk->sk_family != family || + refcount_read(&sock_net(sk)->count)) continue; - if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) + if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (unlikely((tw->tw_family != family) || - refcount_read(&twsk_net(tw)->count))) { - inet_twsk_put(tw); + if (unlikely(sk->sk_family != family || + refcount_read(&sock_net(sk)->count))) { + sock_gen_put(sk); goto restart; } rcu_read_unlock(); local_bh_disable(); - inet_twsk_deschedule_put(tw); + if (state == TCP_TIME_WAIT) { + inet_twsk_deschedule_put(inet_twsk(sk)); + } else { + struct request_sock *req = inet_reqsk(sk); + + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, + req); + } local_bh_enable(); goto restart_rcu; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 52dbffb7bc2f..3aae46b84577 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -248,6 +248,15 @@ static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } +static bool is_erspan_type1(int gre_hdr_len) +{ + /* Both ERSPAN type I (version 0) and type II (version 1) use + * protocol 0x88BE, but the type I has only 4-byte GRE header, + * while type II has 8-byte. + */ + return gre_hdr_len == 4; +} + static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int gre_hdr_len) { @@ -262,17 +271,31 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int len; itn = net_generic(net, erspan_net_id); - iph = ip_hdr(skb); - ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); - ver = ershdr->ver; + if (is_erspan_type1(gre_hdr_len)) { + ver = 0; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + } else { + if (unlikely(!pskb_may_pull(skb, + gre_hdr_len + sizeof(*ershdr)))) + return PACKET_REJECT; - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_KEY, - iph->saddr, iph->daddr, tpi->key); + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = ershdr->ver; + iph = ip_hdr(skb); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_KEY, + iph->saddr, iph->daddr, tpi->key); + } if (tunnel) { - len = gre_hdr_len + erspan_hdr_len(ver); + if (is_erspan_type1(gre_hdr_len)) + len = gre_hdr_len; + else + len = gre_hdr_len + erspan_hdr_len(ver); + if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; @@ -437,7 +460,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, flags, proto, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -475,7 +498,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -525,7 +548,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) truncate = true; } - nhoff = skb_network_header(skb) - skb_mac_header(skb); + nhoff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP) && (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; @@ -534,7 +557,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) int thoff; if (skb_transport_header_was_set(skb)) - thoff = skb_transport_header(skb) - skb_mac_header(skb); + thoff = skb_transport_offset(skb); else thoff = nhoff + sizeof(struct ipv6hdr); if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) @@ -557,7 +580,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) } gre_build_header(skb, 8, TUNNEL_SEQ, - proto, 0, htonl(tunnel->o_seqno++)); + proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno))); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -607,15 +630,18 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { + int pull_len = tunnel->hlen + sizeof(struct iphdr); + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; - /* Pull skb since ip_tunnel_xmit() needs skb->data pointing - * to gre header. - */ - skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); + if (!pskb_network_may_pull(skb, pull_len)) + goto free_skb; + + /* ip_tunnel_xmit() needs skb->data pointing to gre header. */ + skb_pull(skb, pull_len); skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && @@ -667,7 +693,10 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - if (tunnel->erspan_ver == 1) { + if (tunnel->erspan_ver == 0) { + proto = htons(ETH_P_ERSPAN); + tunnel->parms.o_flags &= ~TUNNEL_SEQ; + } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); @@ -1077,7 +1106,11 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], if (ret) return ret; - /* ERSPAN should only have GRE sequence and key flag */ + if (data[IFLA_GRE_ERSPAN_VER] && + nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) + return 0; + + /* ERSPAN type II/III should only have GRE sequence and key flag */ if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (data[IFLA_GRE_IFLAGS]) @@ -1185,7 +1218,7 @@ static int erspan_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (t->erspan_ver != 1 && t->erspan_ver != 2) + if (t->erspan_ver > 2) return -EINVAL; } @@ -1270,7 +1303,11 @@ static int erspan_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - tunnel->tun_hlen = 8; + if (tunnel->erspan_ver == 0) + tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */ + else + tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */ + tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->erspan_ver); @@ -1467,24 +1504,6 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; __be16 o_flags = p->o_flags; - if (t->erspan_ver == 1 || t->erspan_ver == 2) { - if (!t->collect_md) - o_flags |= TUNNEL_KEY; - - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) - goto nla_put_failure; - - if (t->erspan_ver == 1) { - if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) - goto nla_put_failure; - } else { - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) - goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) - goto nla_put_failure; - } - } - if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, gre_tnl_flags_to_gre_flags(p->i_flags)) || @@ -1525,6 +1544,34 @@ nla_put_failure: return -EMSGSIZE; } +static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + if (t->erspan_ver <= 2) { + if (t->erspan_ver != 0 && !t->collect_md) + t->parms.o_flags |= TUNNEL_KEY; + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) + goto nla_put_failure; + } else if (t->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } + } + + return ipgre_fill_info(skb, dev); + +nla_put_failure: + return -EMSGSIZE; +} + static void erspan_setup(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); @@ -1603,7 +1650,7 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { .changelink = erspan_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, - .fill_info = ipgre_fill_info, + .fill_info = erspan_fill_info, .get_link_net = ip_tunnel_get_link_net, }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c59a78a267c3..1464e2738211 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -302,31 +302,38 @@ drop: return true; } -INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); +int udp_v4_early_demux(struct sk_buff *); +int tcp_v4_early_demux(struct sk_buff *); static int ip_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph = ip_hdr(skb); - int (*edemux)(struct sk_buff *skb); struct rtable *rt; int err; - if (net->ipv4.sysctl_ip_early_demux && + if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { - const struct net_protocol *ipprot; - int protocol = iph->protocol; - - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux, - udp_v4_early_demux, skb); - if (unlikely(err)) - goto drop_error; - /* must reload iph, skb->head might have changed */ - iph = ip_hdr(skb); + switch (iph->protocol) { + case IPPROTO_TCP: + if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { + tcp_v4_early_demux(skb); + + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } + break; + case IPPROTO_UDP: + if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { + err = udp_v4_early_demux(skb); + if (unlikely(err)) + goto drop_error; + + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } + break; } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 418e93987800..d57d484a929f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -222,7 +222,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); - if (res < 0 || res == LWTUNNEL_XMIT_DONE) + if (res != LWTUNNEL_XMIT_CONTINUE) return res; } @@ -993,7 +993,7 @@ static int __ip_append_data(struct sock *sk, mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; paged = !!cork->gso_size; - if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + if (cork->tx_flags & SKBTX_ANY_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = sk->sk_tskey++; @@ -1255,6 +1255,12 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, if (unlikely(!rt)) return -EFAULT; + cork->fragsize = ip_sk_use_pmtu(sk) ? + dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); + + if (!inetdev_valid_mtu(cork->fragsize)) + return -ENETUNREACH; + /* * setup for corking. */ @@ -1271,12 +1277,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->addr = ipc->addr; } - cork->fragsize = ip_sk_use_pmtu(sk) ? - dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); - - if (!inetdev_valid_mtu(cork->fragsize)) - return -ENETUNREACH; - cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; @@ -1559,9 +1559,19 @@ struct sk_buff *__ip_make_skb(struct sock *sk, cork->dst = NULL; skb_dst_set(skb, &rt->dst); - if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(net, ((struct icmphdr *) - skb_transport_header(skb))->type); + if (iph->protocol == IPPROTO_ICMP) { + u8 icmp_type; + + /* For such sockets, transhdrlen is zero when do ip_append_data(), + * so icmphdr does not in skb linear region and can not get icmp_type + * by icmp_hdr(skb)->type. + */ + if (sk->sk_type == SOCK_RAW && !inet_sk(sk)->hdrincl) + icmp_type = fl4->fl4_icmp_type; + else + icmp_type = icmp_hdr(skb)->type; + icmp_out_count(net, icmp_type); + } ip_cork_release(cork); out: diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aa3fd61818c4..0c528f642eb8 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -316,7 +316,14 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, ipc->tos = val; ipc->priority = rt_tos2priority(ipc->tos); break; - + case IP_PROTOCOL: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 1 || val > 255) + return -EINVAL; + ipc->protocol = val; + break; default: return -EINVAL; } @@ -1524,6 +1531,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_MINTTL: val = inet->min_ttl; break; + case IP_PROTOCOL: + val = inet_sk(sk)->inet_num; + break; default: release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 38d3095ef979..906c37c7f80d 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -365,7 +365,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, { struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); - int err; + int nh, err; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { @@ -391,8 +391,21 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tunnel->i_seqno = ntohl(tpi->seq) + 1; } + /* Save offset of outer header relative to skb->head, + * because we are going to reset the network header to the inner header + * and might change skb->head. + */ + nh = skb_network_header(skb) - skb->head; + skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0); + if (!pskb_inet_may_pull(skb)) { + DEV_STATS_INC(tunnel->dev, rx_length_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); + goto drop; + } + iph = (struct iphdr *)(skb->head + nh); + err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) @@ -547,6 +560,20 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } +static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom) +{ + /* we must cap headroom to some upperlimit, else pskb_expand_head + * will overflow header offsets in skb_headers_offset_update(). + */ + static const unsigned int max_allowed = 512; + + if (headroom > max_allowed) + headroom = max_allowed; + + if (headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, headroom); +} + void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto, int tunnel_hlen) { @@ -620,13 +647,13 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; - if (headroom > dev->needed_headroom) - dev->needed_headroom = headroom; - - if (skb_cow_head(skb, dev->needed_headroom)) { + if (skb_cow_head(skb, headroom)) { ip_rt_put(rt); goto tx_dropped; } + + ip_tunnel_adj_headroom(dev, headroom); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); return; @@ -804,16 +831,16 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; - if (skb_cow_head(skb, dev->needed_headroom)) { + if (skb_cow_head(skb, max_headroom)) { ip_rt_put(rt); dev->stats.tx_dropped++; kfree_skb(skb); return; } + ip_tunnel_adj_headroom(dev, max_headroom); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); return; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index bd41354ed8c1..275f2ecf0ba6 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -314,12 +314,12 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) switch (skb->protocol) { case htons(ETH_P_IP): - xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + xfrm_decode_session(skb, &fl, AF_INET); break; case htons(ETH_P_IPV6): - xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + xfrm_decode_session(skb, &fl, AF_INET6); break; default: goto tx_err; diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 3205d5f7c8c9..4966ac2aaf87 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/netlink.h> +#include <linux/nospec.h> #include <linux/rtnetlink.h> #include <linux/types.h> #include <net/ip.h> @@ -28,6 +29,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, return -EINVAL; } + type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 36a28d46149c..1b669e9a3ca7 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -92,11 +92,11 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, const struct net_device *indev) { - __be32 uninitialized_var(daddr), uninitialized_var(saddr); - __be16 uninitialized_var(dport), uninitialized_var(sport); + __be32 daddr, saddr; + __be16 dport, sport; const struct iphdr *iph = ip_hdr(skb); struct sk_buff *data_skb = NULL; - u8 uninitialized_var(protocol); + u8 protocol; #if IS_ENABLED(CONFIG_NF_CONNTRACK) enum ip_conntrack_info ctinfo; struct nf_conn const *ct; diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index b2bae0b0e42a..61cb2341f50f 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -38,7 +38,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, hp->source, lport ? lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule_put(inet_twsk(sk)); + nf_tproxy_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index abf89b972094..330349b5d6a4 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -13,8 +13,8 @@ #include <net/netfilter/ipv4/nf_dup_ipv4.h> struct nft_dup_ipv4 { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_dev:8; + u8 sreg_addr; + u8 sreg_dev; }; static void nft_dup_ipv4_eval(const struct nft_expr *expr, @@ -40,16 +40,16 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr)); + err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + sizeof(struct in_addr)); if (err < 0) return err; - if (tb[NFTA_DUP_SREG_DEV] != NULL) { - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); - } - return 0; + if (tb[NFTA_DUP_SREG_DEV]) + err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + &priv->sreg_dev, sizeof(int)); + + return err; } static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index ce294113dbcd..85eac5aa5204 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -83,6 +83,9 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; + if (priv->flags & NFTA_FIB_F_IIF) + fl4.flowi4_oif = l3mdev_master_ifindex_rcu(oif); + if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { nft_fib_store_result(dest, priv, nft_in(pkt)); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 4d69b3de980a..0137854a7faa 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1222,7 +1222,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, - fib_nh->fib_nh_scope); + !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ddc24e57dc55..1044f4985922 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -558,6 +558,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } ipcm_init_sk(&ipc, inet); + /* Keep backward compat */ + if (hdrincl) + ipc.protocol = IPPROTO_RAW; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -625,7 +628,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, - hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7004e379c325..5b008d838e2b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -799,7 +799,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow goto reject_redirect; } - n = __ipv4_neigh_lookup(rt->dst.dev, new_gw); + n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); if (!n) n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { @@ -945,13 +945,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->n_redirects; -#ifdef CONFIG_IP_ROUTE_VERBOSE - if (log_martians && + if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians && peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); -#endif } out_put_peer: inet_putpeer(peer); @@ -1221,6 +1219,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) static void ipv4_send_dest_unreach(struct sk_buff *skb) { + struct net_device *dev; struct ip_options opt; int res; @@ -1238,7 +1237,8 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb) opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; + res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); rcu_read_unlock(); if (res) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 3f6c9514c7a9..3cc22f776a31 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -43,7 +43,6 @@ static siphash_key_t syncookie_secret[2] __read_mostly; * requested/supported by the syn/synack exchange. */ #define TSBITS 6 -#define TSMASK (((__u32)1 << TSBITS) - 1) static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) @@ -62,29 +61,24 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ -u64 cookie_init_timestamp(struct request_sock *req) +u64 cookie_init_timestamp(struct request_sock *req, u64 now) { - struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_time_stamp_raw(); + const struct inet_request_sock *ireq = inet_rsk(req); + u64 ts, ts_now = tcp_ns_to_ts(now); u32 options = 0; - ireq = inet_rsk(req); - options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; if (ireq->sack_ok) options |= TS_OPT_SACK; if (ireq->ecn_ok) options |= TS_OPT_ECN; - ts = ts_now & ~TSMASK; + ts = (ts_now >> TSBITS) << TSBITS; ts |= options; - if (ts > ts_now) { - ts >>= TSBITS; - ts--; - ts <<= TSBITS; - ts |= options; - } - return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ); + if (ts > ts_now) + ts -= (1UL << TSBITS); + + return ts * (NSEC_PER_SEC / TCP_TS_HZ); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c83a5d05aeaa..4d4dba1d42ae 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -363,61 +363,6 @@ bad_key: return ret; } -static void proc_configure_early_demux(int enabled, int protocol) -{ - struct net_protocol *ipprot; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_protocol *ip6prot; -#endif - - rcu_read_lock(); - - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot) - ipprot->early_demux = enabled ? ipprot->early_demux_handler : - NULL; - -#if IS_ENABLED(CONFIG_IPV6) - ip6prot = rcu_dereference(inet6_protos[protocol]); - if (ip6prot) - ip6prot->early_demux = enabled ? ip6prot->early_demux_handler : - NULL; -#endif - rcu_read_unlock(); -} - -static int proc_tcp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret = 0; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (write && !ret) { - int enabled = init_net.ipv4.sysctl_tcp_early_demux; - - proc_configure_early_demux(enabled, IPPROTO_TCP); - } - - return ret; -} - -static int proc_udp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret = 0; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (write && !ret) { - int enabled = init_net.ipv4.sysctl_udp_early_demux; - - proc_configure_early_demux(enabled, IPPROTO_UDP); - } - - return ret; -} - static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, int write, void __user *buffer, @@ -701,14 +646,14 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_udp_early_demux, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_udp_early_demux + .proc_handler = proc_douintvec_minmax, }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_tcp_early_demux + .proc_handler = proc_douintvec_minmax, }, { .procname = "ip_default_ttl", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4b31f6e9ec61..ca7863f72218 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -429,6 +429,7 @@ void tcp_init_sock(struct sock *sk) /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; + tp->rate_app_limited = 1; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. @@ -504,6 +505,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + u8 shutdown; int state; sock_poll_wait(file, sock, wait); @@ -546,9 +548,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected or passive Fast Open socket? */ @@ -564,8 +567,8 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (tcp_stream_is_readable(tp, target, sk)) mask |= EPOLLIN | EPOLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { + if (!(shutdown & SEND_SHUTDOWN)) { + if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -577,7 +580,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * pairs with the input side. */ smp_mb__after_atomic(); - if (sk_stream_is_writeable(sk)) + if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else @@ -697,7 +700,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && - sock_net(sk)->ipv4.sysctl_tcp_autocorking && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && !tcp_rtx_queue_empty(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } @@ -722,6 +725,7 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + smp_mb__after_atomic(); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. @@ -2349,14 +2353,13 @@ bool tcp_check_oom(struct sock *sk, int shift) return too_many_orphans || out_of_socket_memory; } -void tcp_close(struct sock *sk, long timeout) +void __tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; int state; - lock_sock(sk); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); @@ -2519,7 +2522,15 @@ adjudge_to_death: out: bh_unlock_sock(sk); local_bh_enable(); +} + +void tcp_close(struct sock *sk, long timeout) +{ + lock_sock(sk); + __tcp_close(sk, timeout); release_sock(sk); + if (!sk->sk_net_refcnt) + inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); @@ -2580,6 +2591,12 @@ int tcp_disconnect(struct sock *sk, int flags) int old_state = sk->sk_state; u32 seq; + /* Deny disconnect if other threads are blocked in sk_wait_event() + * or inet_wait_for_connect(). + */ + if (sk->sk_wait_pending) + return -EBUSY; + if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); @@ -2616,7 +2633,7 @@ int tcp_disconnect(struct sock *sk, int flags) if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); - sk->sk_shutdown = 0; + WRITE_ONCE(sk->sk_shutdown, 0); sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -2635,6 +2652,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd = TCP_INIT_CWND; tp->snd_cwnd_cnt = 0; + tp->is_cwnd_limited = 0; + tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; @@ -2652,8 +2671,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); - dst_release(sk->sk_rx_dst); - sk->sk_rx_dst = NULL; + dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; @@ -2674,6 +2692,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->last_oow_ack_time = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; + tp->rate_app_limited = 1; tp->rack.mstamp = 0; tp->rack.advanced = 0; tp->rack.reo_wnd_steps = 1; @@ -3057,18 +3076,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_LINGER2: if (val < 0) - tp->linger2 = -1; - else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) - tp->linger2 = 0; + WRITE_ONCE(tp->linger2, -1); + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) + WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); else - tp->linger2 = val * HZ; + WRITE_ONCE(tp->linger2, val * HZ); break; case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ - icsk->icsk_accept_queue.rskq_defer_accept = - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, - TCP_RTO_MAX / HZ); + WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ)); break; case TCP_WINDOW_CLAMP: @@ -3156,7 +3175,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: - tp->notsent_lowat = val; + WRITE_ONCE(tp->notsent_lowat, val); sk->sk_write_space(sk); break; case TCP_INQ: @@ -3168,7 +3187,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_TX_DELAY: if (val) tcp_enable_tx_delay(); - tp->tcp_tx_delay = val; + WRITE_ONCE(tp->tcp_tx_delay, val); break; default: err = -ENOPROTOOPT; @@ -3434,15 +3453,16 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (get_user(len, optlen)) return -EFAULT; - len = min_t(unsigned int, len, sizeof(int)); - if (len < 0) return -EINVAL; + len = min_t(unsigned int, len, sizeof(int)); + switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; - if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + if (tp->rx_opt.user_mss && + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; @@ -3466,13 +3486,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; break; case TCP_LINGER2: - val = tp->linger2; + val = READ_ONCE(tp->linger2); if (val >= 0) val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: - val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, - TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); + val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); + val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; @@ -3612,7 +3633,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - val = icsk->icsk_accept_queue.fastopenq.max_qlen; + val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); break; case TCP_FASTOPEN_CONNECT: @@ -3624,14 +3645,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TX_DELAY: - val = tp->tcp_tx_delay; + val = READ_ONCE(tp->tcp_tx_delay); break; case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; case TCP_NOTSENT_LOWAT: - val = tp->notsent_lowat; + val = READ_ONCE(tp->notsent_lowat); break; case TCP_INQ: val = tp->recvmsg_inq; @@ -3768,12 +3789,16 @@ static void __tcp_alloc_md5sig_pool(void) * to memory. See smp_rmb() in tcp_get_md5sig_pool() */ smp_wmb(); - tcp_md5sig_pool_populated = true; + /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool() + * and tcp_get_md5sig_pool(). + */ + WRITE_ONCE(tcp_md5sig_pool_populated, true); } bool tcp_alloc_md5sig_pool(void) { - if (unlikely(!tcp_md5sig_pool_populated)) { + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ + if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { mutex_lock(&tcp_md5sig_mutex); if (!tcp_md5sig_pool_populated) { @@ -3784,7 +3809,8 @@ bool tcp_alloc_md5sig_pool(void) mutex_unlock(&tcp_md5sig_mutex); } - return tcp_md5sig_pool_populated; + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ + return READ_ONCE(tcp_md5sig_pool_populated); } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -3800,7 +3826,8 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { local_bh_disable(); - if (tcp_md5sig_pool_populated) { + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ + if (READ_ONCE(tcp_md5sig_pool_populated)) { /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ smp_rmb(); return this_cpu_ptr(&tcp_md5sig_pool); @@ -3882,7 +3909,7 @@ void tcp_done(struct sock *sk) if (req) reqsk_fastopen_remove(sk, req, false); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index bcc13368c836..ca49d68a0e04 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -203,8 +203,11 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, tmp->sg.end = i; if (apply) { apply_bytes -= size; - if (!apply_bytes) + if (!apply_bytes) { + if (sge->length) + sk_msg_iter_var_prev(i); break; + } } } while (i != msg->sg.end); @@ -311,8 +314,8 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; - u32 tosend, delta = 0; - u32 eval = __SK_NONE; + u32 tosend, origsize, sent, delta = 0; + u32 eval; int ret; more_data: @@ -343,6 +346,7 @@ more_data: tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; + eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: @@ -366,10 +370,12 @@ more_data: cork = true; psock->cork = NULL; } - sk_msg_return(sk, msg, msg->sg.size); + sk_msg_return(sk, msg, tosend); release_sock(sk); + origsize = msg->sg.size; ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) sock_put(sk_redir); @@ -408,7 +414,7 @@ more_data: msg->sg.data[msg->sg.start].page_link && msg->sg.data[msg->sg.start].length) { if (eval == __SK_REDIRECT) - sk_mem_charge(sk, msg->sg.size); + sk_mem_charge(sk, tosend - sent); goto more_data; } } diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 709d23801823..56dede4b59d9 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -375,6 +375,7 @@ static void tcp_cdg_init(struct sock *sk) struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); + ca->gradients = NULL; /* We silently fall back to window = 1 if allocation fails. */ if (window > 1) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), @@ -388,6 +389,7 @@ static void tcp_cdg_release(struct sock *sk) struct cdg *ca = inet_csk_ca(sk); kfree(ca->gradients); + ca->gradients = NULL; } static struct tcp_congestion_ops tcp_cdg __read_mostly = { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 21705b2ddaff..35088cd30840 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -312,6 +312,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, static bool tcp_fastopen_queue_check(struct sock *sk) { struct fastopen_queue *fastopenq; + int max_qlen; /* Make sure the listener has enabled fastopen, and we don't * exceed the max # of pending TFO requests allowed before trying @@ -324,10 +325,11 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * temporarily vs a server not supporting Fast Open at all. */ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; - if (fastopenq->max_qlen == 0) + max_qlen = READ_ONCE(fastopenq->max_qlen); + if (max_qlen == 0) return false; - if (fastopenq->qlen >= fastopenq->max_qlen) { + if (fastopenq->qlen >= max_qlen) { struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c151c4dd4ae6..61243531a7f4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -178,6 +178,19 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) if (unlikely(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE)) tcp_gro_dev_warn(sk, skb, len); + /* If the skb has a len of exactly 1*MSS and has the PSH bit + * set then it is likely the end of an application write. So + * more data may not be arriving soon, and yet the data sender + * may be waiting for an ACK if cwnd-bound or using TX zero + * copy. So we set ICSK_ACK_PUSHED here so that + * tcp_cleanup_rbuf() will send an ACK immediately if the app + * reads all of the data and is not ping-pong. If len > MSS + * then this logic does not matter (and does not hurt) because + * tcp_cleanup_rbuf() will always ACK immediately if the app + * reads data and there is more than an MSS of unACKed data. + */ + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. @@ -222,7 +235,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -230,7 +243,6 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } -EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. @@ -439,7 +451,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) */ void tcp_init_buffer_space(struct sock *sk) { - int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; + int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); struct tcp_sock *tp = tcp_sk(sk); int maxwin; @@ -2030,7 +2042,7 @@ void tcp_enter_loss(struct sock *sk) * loss recovery is underway except recurring timeout(s) on * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing */ - tp->frto = net->ipv4.sysctl_tcp_frto && + tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } @@ -2045,15 +2057,17 @@ void tcp_enter_loss(struct sock *sk) * restore sanity to the SACK scoreboard. If the apparent reneging * persists until this RTO then we'll clear the SACK scoreboard. */ -static bool tcp_check_sack_reneging(struct sock *sk, int flag) +static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) { - if (flag & FLAG_SACK_RENEGING) { + if (*ack_flag & FLAG_SACK_RENEGING && + *ack_flag & FLAG_SND_UNA_ADVANCED) { struct tcp_sock *tp = tcp_sk(sk); unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); + *ack_flag &= ~FLAG_SET_XMIT_TIMER; return true; } return false; @@ -2384,6 +2398,21 @@ static inline bool tcp_may_undo(const struct tcp_sock *tp) return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } +static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + return true; + } + return false; +} + /* People celebrate: "We love our President!" */ static bool tcp_try_undo_recovery(struct sock *sk) { @@ -2406,14 +2435,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) } else if (tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_persist--; } - if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { - /* Hold old state until something *above* high_seq - * is ACKed. For Reno it is MUST to prevent false - * fast retransmits (RFC2582). SACK TCP is safe. */ - if (!tcp_any_retrans_done(sk)) - tp->retrans_stamp = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) return true; - } tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; return false; @@ -2449,6 +2472,8 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) + return true; if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; @@ -2816,7 +2841,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tcp_check_sack_reneging(sk, flag)) + if (tcp_check_sack_reneging(sk, ack_flag)) return; /* C. Check consistency of the current state. */ @@ -2914,7 +2939,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { - u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; + u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ; struct tcp_sock *tp = tcp_sk(sk); if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { @@ -3433,16 +3458,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, u32 *last_oow_ack_time) { - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); + /* Paired with the WRITE_ONCE() in this function. */ + u32 val = READ_ONCE(*last_oow_ack_time); + + if (val) { + s32 elapsed = (s32)(tcp_jiffies32 - val); - if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { + if (0 <= elapsed && + elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } } - *last_oow_ack_time = tcp_jiffies32; + /* Paired with the prior READ_ONCE() and with itself, + * as we might be lockless. + */ + WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32); return false; /* not rate-limited: go ahead, send dupack now! */ } @@ -3483,11 +3515,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; - if (now != challenge_timestamp) { - u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; + if (now != READ_ONCE(challenge_timestamp)) { + u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); u32 half = (ack_limit + 1) >> 1; - challenge_timestamp = now; + WRITE_ONCE(challenge_timestamp, now); WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); } count = READ_ONCE(challenge_count); @@ -3625,8 +3657,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * then we can probably ignore it. */ if (before(ack, prior_snd_una)) { + u32 max_window; + + /* do not accept ACK for bytes we never sent. */ + max_window = min_t(u64, tp->max_window, tp->bytes_acked); /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ - if (before(ack, prior_snd_una - tp->max_window)) { + if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) tcp_send_challenge_ack(sk, skb); return -1; @@ -4180,7 +4216,7 @@ void tcp_fin(struct sock *sk) inet_csk_schedule_ack(sk); - sk->sk_shutdown |= RCV_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { @@ -4260,7 +4296,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4306,7 +4342,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; tcp_rcv_spurious_retrans(sk, skb); @@ -5302,7 +5338,7 @@ send_now: } if (!tcp_is_sack(tp) || - tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) + tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) goto send_now; if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { @@ -5325,7 +5361,8 @@ send_now: if (tp->srtt_us && tp->srtt_us < rtt) rtt = tp->srtt_us; - delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, + delay = min_t(unsigned long, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), @@ -5603,7 +5640,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) trace_tcp_probe(sk, skb); tcp_mstamp_refresh(tp); - if (unlikely(!sk->sk_rx_dst)) + if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. @@ -6143,22 +6180,23 @@ reset_and_undo: static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); struct request_sock *req; /* If we are still handling the SYNACK RTO, see if timestamp ECR allows * undo. If peer SACKs triggered fast recovery, we can't undo here. */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) - tcp_try_undo_loss(sk, false); + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out) + tcp_try_undo_recovery(sk); /* Reset rtx states to prevent spurious retransmits_timed_out() */ - tcp_sk(sk)->retrans_stamp = 0; + tp->retrans_stamp = 0; inet_csk(sk)->icsk_retransmits = 0; /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. */ - req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); reqsk_fastopen_remove(sk, req, false); @@ -6316,7 +6354,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; tcp_set_state(sk, TCP_FIN_WAIT2); - sk->sk_shutdown |= SEND_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN); sk_dst_confirm(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b95e1a3487c8..a54505c29a5c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -323,6 +323,8 @@ failure: * if necessary. */ tcp_set_state(sk, TCP_CLOSE); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; @@ -1570,15 +1572,18 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) struct sock *rsk; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - struct dst_entry *dst = sk->sk_rx_dst; + struct dst_entry *dst; + + dst = rcu_dereference_protected(sk->sk_rx_dst, + lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || !dst->ops->check(dst, 0)) { + RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); - sk->sk_rx_dst = NULL; } } tcp_rcv_established(sk, skb); @@ -1653,7 +1658,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { - struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); + struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); @@ -2059,7 +2064,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { - sk->sk_rx_dst = dst; + rcu_assign_pointer(sk->sk_rx_dst, dst); inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9a7d8a599857..627e3be0f754 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -40,7 +40,7 @@ struct tcp_fastopen_metrics { struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; - possible_net_t tcpm_net; + struct net *tcpm_net; struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; @@ -51,34 +51,38 @@ struct tcp_metrics_block { struct rcu_head rcu_head; }; -static inline struct net *tm_net(struct tcp_metrics_block *tm) +static inline struct net *tm_net(const struct tcp_metrics_block *tm) { - return read_pnet(&tm->tcpm_net); + /* Paired with the WRITE_ONCE() in tcpm_new() */ + return READ_ONCE(tm->tcpm_net); } static bool tcp_metric_locked(struct tcp_metrics_block *tm, enum tcp_metric_index idx) { - return tm->tcpm_lock & (1 << idx); + /* Paired with WRITE_ONCE() in tcpm_suck_dst() */ + return READ_ONCE(tm->tcpm_lock) & (1 << idx); } -static u32 tcp_metric_get(struct tcp_metrics_block *tm, +static u32 tcp_metric_get(const struct tcp_metrics_block *tm, enum tcp_metric_index idx) { - return tm->tcpm_vals[idx]; + /* Paired with WRITE_ONCE() in tcp_metric_set() */ + return READ_ONCE(tm->tcpm_vals[idx]); } static void tcp_metric_set(struct tcp_metrics_block *tm, enum tcp_metric_index idx, u32 val) { - tm->tcpm_vals[idx] = val; + /* Paired with READ_ONCE() in tcp_metric_get() */ + WRITE_ONCE(tm->tcpm_vals[idx], val); } static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - return inetpeer_addr_cmp(a, b) == 0; + return (a->family == b->family) && !inetpeer_addr_cmp(a, b); } struct tcpm_hash_bucket { @@ -89,6 +93,7 @@ static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly; static unsigned int tcp_metrics_hash_log __read_mostly; static DEFINE_SPINLOCK(tcp_metrics_lock); +static DEFINE_SEQLOCK(fastopen_seqlock); static void tcpm_suck_dst(struct tcp_metrics_block *tm, const struct dst_entry *dst, @@ -97,7 +102,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, u32 msval; u32 val; - tm->tcpm_stamp = jiffies; + WRITE_ONCE(tm->tcpm_stamp, jiffies); val = 0; if (dst_metric_locked(dst, RTAX_RTT)) @@ -110,30 +115,42 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, val |= 1 << TCP_METRIC_CWND; if (dst_metric_locked(dst, RTAX_REORDERING)) val |= 1 << TCP_METRIC_REORDERING; - tm->tcpm_lock = val; + /* Paired with READ_ONCE() in tcp_metric_locked() */ + WRITE_ONCE(tm->tcpm_lock, val); msval = dst_metric_raw(dst, RTAX_RTT); - tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; + tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC); msval = dst_metric_raw(dst, RTAX_RTTVAR); - tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; - tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); - tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); - tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); + tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC); + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, + dst_metric_raw(dst, RTAX_SSTHRESH)); + tcp_metric_set(tm, TCP_METRIC_CWND, + dst_metric_raw(dst, RTAX_CWND)); + tcp_metric_set(tm, TCP_METRIC_REORDERING, + dst_metric_raw(dst, RTAX_REORDERING)); if (fastopen_clear) { + write_seqlock(&fastopen_seqlock); tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; tm->tcpm_fastopen.try_exp = 0; tm->tcpm_fastopen.cookie.exp = false; tm->tcpm_fastopen.cookie.len = 0; + write_sequnlock(&fastopen_seqlock); } } #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) -static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +static void tcpm_check_stamp(struct tcp_metrics_block *tm, + const struct dst_entry *dst) { - if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) + unsigned long limit; + + if (!tm) + return; + limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT; + if (unlikely(time_after(jiffies, limit))) tcpm_suck_dst(tm, dst, false); } @@ -174,20 +191,23 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, oldest = deref_locked(tcp_metrics_hash[hash].chain); for (tm = deref_locked(oldest->tcpm_next); tm; tm = deref_locked(tm->tcpm_next)) { - if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) + if (time_before(READ_ONCE(tm->tcpm_stamp), + READ_ONCE(oldest->tcpm_stamp))) oldest = tm; } tm = oldest; } else { - tm = kmalloc(sizeof(*tm), GFP_ATOMIC); + tm = kzalloc(sizeof(*tm), GFP_ATOMIC); if (!tm) goto out_unlock; } - write_pnet(&tm->tcpm_net, net); + /* Paired with the READ_ONCE() in tm_net() */ + WRITE_ONCE(tm->tcpm_net, net); + tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; - tcpm_suck_dst(tm, dst, true); + tcpm_suck_dst(tm, dst, reclaim); if (likely(!reclaim)) { tm->tcpm_next = tcp_metrics_hash[hash].chain; @@ -329,7 +349,7 @@ void tcp_update_metrics(struct sock *sk) int m; sk_dst_confirm(sk); - if (net->ipv4.sysctl_tcp_nometrics_save || !dst) + if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst) return; rcu_read_lock(); @@ -431,7 +451,7 @@ void tcp_update_metrics(struct sock *sk) tp->reordering); } } - tm->tcpm_stamp = jiffies; + WRITE_ONCE(tm->tcpm_stamp, jiffies); out_unlock: rcu_read_unlock(); } @@ -446,11 +466,15 @@ void tcp_init_metrics(struct sock *sk) u32 val, crtt = 0; /* cached RTT scaled by 8 */ sk_dst_confirm(sk); + /* ssthresh may have been reduced unnecessarily during. + * 3WHS. Restore it back to its initial default. + */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; if (!dst) goto reset; rcu_read_lock(); - tm = tcp_get_metrics(sk, dst, true); + tm = tcp_get_metrics(sk, dst, false); if (!tm) { rcu_read_unlock(); goto reset; @@ -464,11 +488,6 @@ void tcp_init_metrics(struct sock *sk) tp->snd_ssthresh = val; if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; - } else { - /* ssthresh may have been reduced unnecessarily during. - * 3WHS. Restore it back to its initial default. - */ - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) @@ -534,8 +553,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) return ret; } -static DEFINE_SEQLOCK(fastopen_seqlock); - void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { @@ -642,7 +659,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, } if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, - jiffies - tm->tcpm_stamp, + jiffies - READ_ONCE(tm->tcpm_stamp), TCP_METRICS_ATTR_PAD) < 0) goto nla_put_failure; @@ -653,7 +670,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, if (!nest) goto nla_put_failure; for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { - u32 val = tm->tcpm_vals[i]; + u32 val = tcp_metric_get(tm, i); if (!val) continue; @@ -885,7 +902,7 @@ static void tcp_metrics_flush_all(struct net *net) match = net ? net_eq(tm_net(tm), net) : !refcount_read(&tm_net(tm)->count); if (match) { - *pp = tm->tcpm_next; + rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); } else { pp = &tm->tcpm_next; @@ -926,7 +943,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr)) && net_eq(tm_net(tm), net)) { - *pp = tm->tcpm_next; + rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); found = true; } else { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 324f43fadb37..e5dc08579cf5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -563,6 +563,9 @@ EXPORT_SYMBOL(tcp_create_openreq_child); * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results + * + * Note: If @fastopen is true, this can be called from process context. + * Otherwise, this is from BH context. */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, @@ -715,7 +718,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) - __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } @@ -734,7 +737,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { - __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 97f29ece3800..6d7f441c7dd7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -179,8 +179,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, } /* Account for an ACK we sent. */ -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, - u32 rcv_nxt) +static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); @@ -194,7 +193,7 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, if (unlikely(rcv_nxt != tp->rcv_nxt)) return; /* Special ACK sent by DCTCP to reflect ECN */ - tcp_dec_quickack_mode(sk, pkts); + tcp_dec_quickack_mode(sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -1104,7 +1103,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, skb_set_hash_from_sk(skb, sk); refcount_add(skb->truesize, &sk->sk_wmem_alloc); - skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); + skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm)); /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; @@ -1152,7 +1151,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, icsk->icsk_af_ops->send_check(sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); + tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); @@ -1653,15 +1652,20 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); - /* Track the maximum number of outstanding packets in each - * window, and remember whether we were cwnd-limited then. + /* Track the strongest available signal of the degree to which the cwnd + * is fully utilized. If cwnd-limited then remember that fact for the + * current window. If not cwnd-limited then track the maximum number of + * outstanding packets in the current window. (If cwnd-limited then we + * chose to not update tp->max_packets_out to avoid an extra else + * clause with no functional impact.) */ - if (!before(tp->snd_una, tp->max_packets_seq) || - tp->packets_out > tp->max_packets_out || - is_cwnd_limited) { - tp->max_packets_out = tp->packets_out; - tp->max_packets_seq = tp->snd_nxt; + if (!before(tp->snd_una, tp->cwnd_usage_seq) || + is_cwnd_limited || + (!tp->is_cwnd_limited && + tp->packets_out > tp->max_packets_out)) { tp->is_cwnd_limited = is_cwnd_limited; + tp->max_packets_out = tp->packets_out; + tp->cwnd_usage_seq = tp->snd_nxt; } if (tcp_is_cwnd_limited(sk)) { @@ -1761,7 +1765,7 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) min_tso = ca_ops->min_tso_segs ? ca_ops->min_tso_segs(sk) : - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); @@ -2255,6 +2259,18 @@ static bool tcp_pacing_check(struct sock *sk) return true; } +static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk) +{ + const struct rb_node *node = sk->tcp_rtx_queue.rb_node; + + /* No skb in the rtx queue. */ + if (!node) + return true; + + /* Only one skb in rtx queue. */ + return !node->rb_left && !node->rb_right; +} + /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * (These limits are doubled for retransmits) @@ -2276,7 +2292,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift)); if (sk->sk_pacing_status == SK_PACING_NONE) limit = min_t(unsigned long, limit, - sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && @@ -2292,12 +2308,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit += extra_bytes; } if (refcount_read(&sk->sk_wmem_alloc) > limit) { - /* Always send skb if rtx queue is empty. + /* Always send skb if rtx queue is empty or has one skb. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ - if (tcp_rtx_queue_empty(sk)) + if (tcp_rtx_queue_empty_or_single_skb(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); @@ -2500,7 +2516,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 timeout, rto_delta_us; + u32 timeout, timeout_us, rto_delta_us; int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS @@ -2524,11 +2540,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) * sample is available then probe after TCP_TIMEOUT_INIT. */ if (tp->srtt_us) { - timeout = usecs_to_jiffies(tp->srtt_us >> 2); + timeout_us = tp->srtt_us >> 2; if (tp->packets_out == 1) - timeout += TCP_RTO_MIN; + timeout_us += tcp_rto_min_us(sk); else - timeout += TCP_TIMEOUT_MIN; + timeout_us += TCP_TIMEOUT_MIN_US; + timeout = usecs_to_jiffies(timeout_us); } else { timeout = TCP_TIMEOUT_INIT; } @@ -2911,7 +2928,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; int diff, len, err; - + int avail_wnd; /* Inconclusive MTU probe */ if (icsk->icsk_mtup.probe_size) @@ -2928,7 +2945,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (skb_still_in_host_queue(sk, skb)) return -EBUSY; +start: if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; + TCP_SKB_CB(skb)->seq++; + goto start; + } if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { WARN_ON_ONCE(1); return -EINVAL; @@ -2941,17 +2964,25 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) return -EHOSTUNREACH; /* Routing failure or similar. */ cur_mss = tcp_current_mss(sk); + avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case - * our retransmit serves as a zero window probe. + * our retransmit of one segment serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && - TCP_SKB_CB(skb)->seq != tp->snd_una) - return -EAGAIN; + if (avail_wnd <= 0) { + if (TCP_SKB_CB(skb)->seq != tp->snd_una) + return -EAGAIN; + avail_wnd = cur_mss; + } len = cur_mss * segs; + if (len > avail_wnd) { + len = rounddown(avail_wnd, cur_mss); + if (!len) + len = avail_wnd; + } if (skb->len > len) { if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, cur_mss, GFP_ATOMIC)) @@ -2965,8 +2996,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) diff -= tcp_skb_pcount(skb); if (diff) tcp_adjust_pcount(sk, skb, diff); - if (skb->len < cur_mss) - tcp_retrans_try_collapse(sk, skb, cur_mss); + avail_wnd = min_t(int, avail_wnd, cur_mss); + if (skb->len < avail_wnd) + tcp_retrans_try_collapse(sk, skb, avail_wnd); } /* RFC3168, section 6.1.1.1. ECN fallback */ @@ -3134,11 +3166,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ void sk_forced_mem_schedule(struct sock *sk, int size) { - int amt; + int delta, amt; - if (size <= sk->sk_forward_alloc) + delta = size - sk->sk_forward_alloc; + if (delta <= 0) return; - amt = sk_mem_pages(size); + amt = sk_mem_pages(delta); sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; sk_memory_allocated_add(sk, amt); @@ -3322,7 +3355,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp_ns = cookie_init_timestamp(req); + skb->skb_mstamp_ns = cookie_init_timestamp(req, now); else #endif { @@ -3359,7 +3392,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); - __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 22ec8dcc1428..db3469c95c49 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -123,7 +123,7 @@ bool tcp_rack_mark_lost(struct sock *sk) tp->rack.advanced = 0; tcp_rack_detect_loss(sk, &timeout); if (timeout) { - timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN; + timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US); inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a0107eb02ae4..551c4a78f68d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -573,7 +573,9 @@ out_reset_timer: tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; - icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); + icsk->icsk_rto = clamp(__tcp_set_rto(tp), + tcp_rto_min(sk), + TCP_RTO_MAX); } else { /* Use normal (exponential) backoff */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 83fd4fa40d5e..b17b63654812 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1054,16 +1054,17 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = udp_cmsg_send(sk, msg, &ipc.gso_size); - if (err > 0) + if (err > 0) { err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); + connected = 0; + } if (unlikely(err < 0)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; - connected = 0; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; @@ -1528,7 +1529,7 @@ drop: } EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); -void udp_destruct_sock(struct sock *sk) +void udp_destruct_common(struct sock *sk) { /* reclaim completely the forward allocated memory */ struct udp_sock *up = udp_sk(sk); @@ -1541,10 +1542,14 @@ void udp_destruct_sock(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); +} +EXPORT_SYMBOL_GPL(udp_destruct_common); +static void udp_destruct_sock(struct sock *sk) +{ + udp_destruct_common(sk); inet_sock_destruct(sk); } -EXPORT_SYMBOL_GPL(udp_destruct_sock); int udp_init_sock(struct sock *sk) { @@ -1552,7 +1557,6 @@ int udp_init_sock(struct sock *sk) sk->sk_destruct = udp_destruct_sock; return 0; } -EXPORT_SYMBOL_GPL(udp_init_sock); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { @@ -2137,7 +2141,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old; if (dst_hold_safe(dst)) { - old = xchg(&sk->sk_rx_dst, dst); + old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst); dst_release(old); return old != dst; } @@ -2326,7 +2330,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct dst_entry *dst = skb_dst(skb); int ret; - if (unlikely(sk->sk_rx_dst != dst)) + if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp_sk_rx_dst_set(sk, dst); ret = udp_unicast_rcv_skb(sk, skb, uh); @@ -2484,7 +2488,7 @@ int udp_v4_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_efree; - dst = READ_ONCE(sk->sk_rx_dst); + dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); @@ -2676,11 +2680,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, if (get_user(len, optlen)) return -EFAULT; - len = min_t(unsigned int, len, sizeof(int)); - if (len < 0) return -EINVAL; + len = min_t(unsigned int, len, sizeof(int)); + switch (optname) { case UDP_CORK: val = READ_ONCE(up->corkflag); diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 150e6f0fdbf5..bbe4eca42d36 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -196,6 +196,7 @@ EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); + synchronize_rcu(); kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 5936d66d1ce2..3a74a3675ca1 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -17,6 +17,14 @@ struct udp_table udplite_table __read_mostly; EXPORT_SYMBOL(udplite_table); +/* Designate sk as UDP-Lite socket */ +static int udplite_sk_init(struct sock *sk) +{ + udp_init_sock(sk); + udp_sk(sk)->pcflag = UDPLITE_BIT; + return 0; +} + static int udplite_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); @@ -54,6 +62,8 @@ struct proto udplite_prot = { .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a0123760fb2c..2720e5d931e8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -313,9 +313,8 @@ static void addrconf_del_dad_work(struct inet6_ifaddr *ifp) static void addrconf_mod_rs_timer(struct inet6_dev *idev, unsigned long when) { - if (!timer_pending(&idev->rs_timer)) + if (!mod_timer(&idev->rs_timer, jiffies + when)) in6_dev_hold(idev); - mod_timer(&idev->rs_timer, jiffies + when); } static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp, @@ -697,6 +696,22 @@ errout: return err; } +/* Combine dev_addr_genid and dev_base_seq to detect changes. + */ +static u32 inet6_base_seq(const struct net *net) +{ + u32 res = atomic_read(&net->ipv6.dev_addr_genid) + + net->dev_base_seq; + + /* Must not return 0 (see nl_dump_check_consistent()). + * Chose a value far away from 0. + */ + if (!res) + res = 0x80000000; + return res; +} + + static int inet6_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { @@ -730,8 +745,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, idx = 0; head = &net->dev_index_head[h]; rcu_read_lock(); - cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ - net->dev_base_seq; + cb->seq = inet6_base_seq(net); hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; @@ -1368,7 +1382,7 @@ retry: * idev->desync_factor if it's larger */ cnf_temp_preferred_lft = READ_ONCE(idev->cnf.temp_prefered_lft); - max_desync_factor = min_t(__u32, + max_desync_factor = min_t(long, idev->cnf.max_desync_factor, cnf_temp_preferred_lft - regen_advance); @@ -2012,9 +2026,10 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add if (ipv6_addr_equal(&ifp->addr, addr)) { if (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { - result = ifp; - in6_ifa_hold(ifp); - break; + if (in6_ifa_hold_safe(ifp)) { + result = ifp; + break; + } } } } @@ -2542,12 +2557,18 @@ static void manage_tempaddrs(struct inet6_dev *idev, ipv6_ifa_notify(0, ift); } - if ((create || list_empty(&idev->tempaddr_list)) && - idev->cnf.use_tempaddr > 0) { + /* Also create a temporary address if it's enabled but no temporary + * address currently exists. + * However, we get called with valid_lft == 0, prefered_lft == 0, create == false + * as part of cleanup (ie. deleting the mngtmpaddr). + * We don't want that to result in creating a new temporary ip address. + */ + if (list_empty(&idev->tempaddr_list) && (valid_lft || prefered_lft)) + create = true; + + if (create && idev->cnf.use_tempaddr > 0) { /* When a new public address is created as described * in [ADDRCONF], also create a new temporary address. - * Also create a temporary address if it's enabled but - * no temporary address currently exists. */ read_unlock_bh(&idev->lock); ipv6_create_tempaddr(ifp, NULL, false); @@ -5227,7 +5248,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, } rcu_read_lock(); - cb->seq = atomic_read(&tgt_net->ipv6.dev_addr_genid) ^ tgt_net->dev_base_seq; + cb->seq = inet6_base_seq(tgt_net); for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &tgt_net->dev_index_head[h]; @@ -5360,9 +5381,10 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, } addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); - if (!addr) - return -EINVAL; - + if (!addr) { + err = -EINVAL; + goto errout; + } ifm = nlmsg_data(nlh); if (ifm->ifa_index) dev = dev_get_by_index(tgt_net, ifm->ifa_index); @@ -5972,11 +5994,7 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, pmsg->prefix_len = pinfo->prefix_len; pmsg->prefix_type = pinfo->type; pmsg->prefix_pad3 = 0; - pmsg->prefix_flags = 0; - if (pinfo->onlink) - pmsg->prefix_flags |= IF_PREFIX_ONLINK; - if (pinfo->autoconf) - pmsg->prefix_flags |= IF_PREFIX_AUTOCONF; + pmsg->prefix_flags = pinfo->flags; if (nla_put(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix)) goto nla_put_failure; diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index ea00ce3d4117..8494ee9679b4 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -204,19 +204,26 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { EXPORT_SYMBOL_GPL(ipv6_stub); /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ -const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; +const struct in6_addr in6addr_loopback __aligned(BITS_PER_LONG/8) + = IN6ADDR_LOOPBACK_INIT; EXPORT_SYMBOL(in6addr_loopback); -const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +const struct in6_addr in6addr_any __aligned(BITS_PER_LONG/8) + = IN6ADDR_ANY_INIT; EXPORT_SYMBOL(in6addr_any); -const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; +const struct in6_addr in6addr_linklocal_allnodes __aligned(BITS_PER_LONG/8) + = IN6ADDR_LINKLOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_linklocal_allnodes); -const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_linklocal_allrouters __aligned(BITS_PER_LONG/8) + = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_linklocal_allrouters); -const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; +const struct in6_addr in6addr_interfacelocal_allnodes __aligned(BITS_PER_LONG/8) + = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allnodes); -const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_interfacelocal_allrouters __aligned(BITS_PER_LONG/8) + = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allrouters); -const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_sitelocal_allrouters __aligned(BITS_PER_LONG/8) + = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_sitelocal_allrouters); static void snmp6_free_dev(struct inet6_dev *idev) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 8a22486cf270..17ac45aa7194 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -437,6 +437,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, { struct ifaddrlblmsg *ifal = nlmsg_data(nlh); ifal->ifal_family = AF_INET6; + ifal->__ifal_reserved = 0; ifal->ifal_prefixlen = prefixlen; ifal->ifal_flags = 0; ifal->ifal_index = ifindex; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 56f396ecc26b..a5fc9444c728 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -104,6 +104,13 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } +void inet6_sock_destruct(struct sock *sk) +{ + inet6_cleanup_sock(sk); + inet_sock_destruct(sk); +} +EXPORT_SYMBOL_GPL(inet6_sock_destruct); + static int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) { @@ -196,7 +203,7 @@ lookup_protocol: inet->hdrincl = 1; } - sk->sk_destruct = inet_sock_destruct; + sk->sk_destruct = inet6_sock_destruct; sk->sk_family = PF_INET6; sk->sk_protocol = protocol; @@ -498,6 +505,12 @@ void inet6_destroy_sock(struct sock *sk) } EXPORT_SYMBOL_GPL(inet6_destroy_sock); +void inet6_cleanup_sock(struct sock *sk) +{ + inet6_destroy_sock(sk); +} +EXPORT_SYMBOL_GPL(inet6_cleanup_sock); + /* * This does both peername and sockname. */ diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 390bedde21a5..06261603e083 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -50,7 +50,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; - fl6->flowlabel = np->flow_label; + fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; if (!fl6->flowi6_oif) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b64791d3b0f8..a1cdb43e7216 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -506,7 +506,9 @@ static inline int esp_remove_trailer(struct sk_buff *skb) skb->csum = csum_block_sub(skb->csum, csumdiff, skb->len - trimlen); } - pskb_trim(skb, skb->len - trimlen); + ret = pskb_trim(skb, skb->len - trimlen); + if (unlikely(ret)) + return ret; ret = nexthdr[1]; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 1c532638b2ad..e19c1844276f 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -314,6 +314,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index da46c4284676..49e31e4ae7b7 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -143,6 +143,8 @@ int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) optlen = 1; break; default: + if (len < 2) + goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 172726939652..cdc8a49d7fc3 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -437,6 +437,11 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(16); /* src */ } +static void fib6_rule_flush_cache(struct fib_rules_ops *ops) +{ + rt_genid_bump_ipv6(ops->fro_net); +} + static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .family = AF_INET6, .rule_size = sizeof(struct fib6_rule), @@ -449,6 +454,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .compare = fib6_rule_compare, .fill = fib6_rule_fill, .nlmsg_payload = fib6_rule_nlmsg_payload, + .flush_cache = fib6_rule_flush_cache, .nlgroup = RTNLGRP_IPV6_RULE, .policy = fib6_rule_policy, .owner = THIS_MODULE, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 3db10cae7b17..169467b5c98a 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -410,7 +410,10 @@ static struct net_device *icmp6_dev(const struct sk_buff *skb) if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); - if (rt6) + /* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.), + * and ip6_null_entry could be set to skb if no route is found. + */ + if (rt6 && rt6->rt6i_idev) dev = rt6->rt6i_idev->dev; } diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 5fc1f4e0c0cf..10f1367eb4ca 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -477,6 +477,7 @@ int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) rcu_read_lock(); + ret = -ESRCH; ila = ila_lookup_by_params(&xp, ilan); if (ila) { ret = ila_dump_info(ila, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ef55489651f8..3afc32fe9b07 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -591,19 +591,19 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (!w) { /* New dump: * - * 1. hook callback destructor. - */ - cb->args[3] = (long)cb->done; - cb->done = fib6_dump_done; - - /* - * 2. allocate and initialize walker. + * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; + + /* 2. hook callback destructor. + */ + cb->args[3] = (long)cb->done; + cb->done = fib6_dump_done; + } arg.skb = skb; @@ -1307,7 +1307,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; - struct fib6_node *fn, *pn = NULL; + struct fib6_node *fn; +#ifdef CONFIG_IPV6_SUBTREES + struct fib6_node *pn = NULL; +#endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; @@ -1331,9 +1334,9 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, goto out; } +#ifdef CONFIG_IPV6_SUBTREES pn = fn; -#ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; @@ -1433,13 +1436,9 @@ out: if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); -#if RT6_DEBUG >= 2 - if (!pn_leaf) { - WARN_ON(!pn_leaf); + if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; - } -#endif fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index d64b83e85642..e6e1190a6ed9 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -535,7 +535,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) { - int uninitialized_var(err); + int err; struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_flowlabel_req freq; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4a6396d574a0..de707e057cd9 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -531,6 +531,9 @@ static int ip6erspan_rcv(struct sk_buff *skb, struct ip6_tnl *tunnel; u8 ver; + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + ipv6h = ipv6_hdr(skb); ershdr = (struct erspan_base_hdr *)skb->data; ver = ershdr->ver; @@ -711,6 +714,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, { struct ip6_tnl *tunnel = netdev_priv(dev); __be16 protocol; + __be16 flags; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -720,16 +724,12 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) - return -ENOMEM; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; if (tunnel->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; - __be16 flags; int tun_hlen; tun_info = skb_tunnel_info(skb); @@ -751,19 +751,25 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tun_hlen = gre_calc_hlen(flags); + if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) + return -ENOMEM; + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); } else { - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; + if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) + return -ENOMEM; + + flags = tunnel->parms.o_flags; - gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, - htonl(tunnel->o_seqno)); + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) + : 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, @@ -938,11 +944,12 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; if (skb->len > dev->mtu + dev->hard_header_len) { - pskb_trim(skb, dev->mtu + dev->hard_header_len); + if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) + goto tx_err; truncate = true; } - nhoff = skb_network_header(skb) - skb_mac_header(skb); + nhoff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP) && (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; @@ -951,7 +958,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, int thoff; if (skb_transport_header_was_set(skb)) - thoff = skb_transport_header(skb) - skb_mac_header(skb); + thoff = skb_transport_offset(skb); else thoff = nhoff + sizeof(struct ipv6hdr); if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) @@ -1000,12 +1007,14 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, ntohl(tun_id), ntohl(md->u.index), truncate, false); + proto = htons(ETH_P_ERSPAN); } else if (md->version == 2) { erspan_build_header_v2(skb, ntohl(tun_id), md->u.md2.dir, get_hwid(&md->u.md2), truncate, false); + proto = htons(ETH_P_ERSPAN2); } else { goto tx_err; } @@ -1028,25 +1037,26 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, break; } - if (t->parms.erspan_ver == 1) + if (t->parms.erspan_ver == 1) { erspan_build_header(skb, ntohl(t->parms.o_key), t->parms.index, truncate, false); - else if (t->parms.erspan_ver == 2) + proto = htons(ETH_P_ERSPAN); + } else if (t->parms.erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(t->parms.o_key), t->parms.dir, t->parms.hwid, truncate, false); - else + proto = htons(ETH_P_ERSPAN2); + } else { goto tx_err; + } fl6.daddr = t->parms.raddr; } /* Push GRE header. */ - proto = (t->parms.erspan_ver == 1) ? htons(ETH_P_ERSPAN) - : htons(ETH_P_ERSPAN2); - gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(t->o_seqno++)); + gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) @@ -1137,14 +1147,16 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu, dev->needed_headroom = dst_len; if (set_mtu) { - dev->mtu = rt->dst.dev->mtu - t_hlen; + int mtu = rt->dst.dev->mtu - t_hlen; + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; + mtu -= 8; if (dev->type == ARPHRD_ETHER) - dev->mtu -= ETH_HLEN; + mtu -= ETH_HLEN; - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + WRITE_ONCE(dev->mtu, mtu); } } ip6_rt_put(rt); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index e6c4966aa956..ebf90bce063a 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -44,21 +44,25 @@ #include <net/inet_ecn.h> #include <net/dst_metadata.h> -INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *)); +void udp_v6_early_demux(struct sk_buff *); +void tcp_v6_early_demux(struct sk_buff *); static void ip6_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb) { - void (*edemux)(struct sk_buff *skb); - - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { - const struct inet6_protocol *ipprot; - - ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); - if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) - INDIRECT_CALL_2(edemux, tcp_v6_early_demux, - udp_v6_early_demux, skb); + if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && + !skb_dst(skb) && !skb->sk) { + switch (ipv6_hdr(skb)->nexthdr) { + case IPPROTO_TCP: + if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) + tcp_v6_early_demux(skb); + break; + case IPPROTO_UDP: + if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) + udp_v6_early_demux(skb); + break; + } } + if (!skb_valid_dst(skb)) ip6_route_input(skb); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5585e3a94f3c..c67d634dccd4 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -130,7 +130,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); - if (res < 0 || res == LWTUNNEL_XMIT_DONE) + if (res != LWTUNNEL_XMIT_CONTINUE) return res; } @@ -177,7 +177,13 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, int err; skb_mark_not_on_list(segs); - err = ip6_fragment(net, sk, segs, ip6_finish_output2); + /* Last GSO segment can be smaller than gso_size (and MTU). + * Adding a fragment header would produce an "atomic fragment", + * which is considered harmful (RFC-8021). Avoid that. + */ + err = segs->len > mtu ? + ip6_fragment(net, sk, segs, ip6_finish_output2) : + ip6_finish_output2(net, sk, segs); if (err && ret == 0) ret = err; } @@ -919,6 +925,9 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (err < 0) goto fail; + /* We prevent @rt from being freed. */ + rcu_read_lock(); + for (;;) { /* Prepare header of the next frame, * before previous one went down. */ @@ -942,6 +951,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGOKS); + rcu_read_unlock(); return 0; } @@ -949,6 +959,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); + rcu_read_unlock(); return err; slow_path_clean: @@ -1414,7 +1425,7 @@ static int __ip6_append_data(struct sock *sk, mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; - if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + if (cork->tx_flags & SKBTX_ANY_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = sk->sk_tskey++; @@ -1850,8 +1861,13 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + u8 icmp6_type; - ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type); + if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl) + icmp6_type = fl6->fl6_icmp_type; + else + icmp6_type = icmp6_hdr(skb)->icmp6_type; + ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 878a08c40fff..5319093d9aa6 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -399,7 +399,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); - u8 next, nexthdr = ipv6h->nexthdr; + u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; @@ -410,25 +410,25 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { - struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; - if (frag_hdr->frag_off) - break; optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } - /* cache hdr->nexthdr, since pskb_may_pull() might - * invalidate hdr - */ - next = hdr->nexthdr; - if (nexthdr == NEXTHDR_DEST) { - u16 i = 2; - /* Remember : hdr is no longer valid at this point. */ - if (!pskb_may_pull(skb, off + optlen)) + if (!pskb_may_pull(skb, off + optlen)) + break; + + hdr = (struct ipv6_opt_hdr *)(skb->data + off); + if (nexthdr == NEXTHDR_FRAGMENT) { + struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; + + if (frag_hdr->frag_off) break; + } + if (nexthdr == NEXTHDR_DEST) { + u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; @@ -449,7 +449,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) i++; } } - nexthdr = next; + nexthdr = hdr->nexthdr; off += optlen; } return 0; @@ -1201,8 +1201,8 @@ route_lookup: */ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; + if (max_headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) @@ -1430,6 +1430,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; + int mtu; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); @@ -1472,12 +1473,13 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) dev->hard_header_len = rt->dst.dev->hard_header_len + t_hlen; - dev->mtu = rt->dst.dev->mtu - t_hlen; + mtu = rt->dst.dev->mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; + mtu -= 8; - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + WRITE_ONCE(dev->mtu, mtu); } ip6_rt_put(rt); } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 8b44d3b53844..e4cd6909e9bb 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -558,12 +558,12 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; - xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + xfrm_decode_session(skb, &fl, AF_INET6); break; case htons(ETH_P_IP): - xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + xfrm_decode_session(skb, &fl, AF_INET); break; default: goto tx_err; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 6248e00c2bf7..6642bc7b9870 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1065,7 +1065,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ - skb_push(skb, -skb_network_offset(pkt)); + __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 5352c7e68c42..86c550d63543 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -164,15 +164,18 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, rtnl_lock(); lock_sock(sk); + /* Another thread has converted the socket into IPv4 with + * IPV6_ADDRFORM concurrently. + */ + if (unlikely(sk->sk_family != AF_INET6)) + goto unlock; + switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { - struct ipv6_txoptions *opt; - struct sk_buff *pktopt; - if (sk->sk_type == SOCK_RAW) break; @@ -203,7 +206,6 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; } - fl6_free_socklist(sk); __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); @@ -238,14 +240,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } - opt = xchg((__force struct ipv6_txoptions **)&np->opt, - NULL); - if (opt) { - atomic_sub(opt->tot_len, &sk->sk_omem_alloc); - txopt_put(opt); - } - pktopt = xchg(&np->pktoptions, NULL); - kfree_skb(pktopt); + + /* Disable all options not to allocate memory anymore, + * but there is still a race. See the lockless path + * in udpv6_sendmsg() and ipv6_local_rxpmtu(). + */ + np->rxopt.all = 0; + + inet6_cleanup_sock(sk); /* * ... and add it to the refcnt debug socks count @@ -924,6 +926,7 @@ pref_skip_coa: break; } +unlock: release_sock(sk); if (needs_rtnl) rtnl_unlock(); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 118e19cabb72..74977ec77c57 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -196,7 +196,8 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { - return opt->nd_opt_type == ND_OPT_RDNSS || + return opt->nd_opt_type == ND_OPT_PREFIX_INFO || + opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || ndisc_ops_is_useropt(dev, opt->nd_opt_type); diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 69c021704abd..aa5bb8789ba0 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -97,7 +97,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, const struct net_device *indev) { - __be16 uninitialized_var(dport), uninitialized_var(sport); + __be16 dport, sport; const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; struct sk_buff *data_skb = NULL; diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 34d51cd426b0..00761924f276 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -63,7 +63,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, lport ? lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule_put(inet_twsk(sk)); + nf_tproxy_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 2af32200507d..c4aa8d27e040 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -13,8 +13,8 @@ #include <net/netfilter/ipv6/nf_dup_ipv6.h> struct nft_dup_ipv6 { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_dev:8; + u8 sreg_addr; + u8 sreg_dev; }; static void nft_dup_ipv6_eval(const struct nft_expr *expr, @@ -38,16 +38,16 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr)); + err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + sizeof(struct in6_addr)); if (err < 0) return err; - if (tb[NFTA_DUP_SREG_DEV] != NULL) { - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); - } - return 0; + if (tb[NFTA_DUP_SREG_DEV]) + err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + &priv->sreg_dev, sizeof(int)); + + return err; } static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 7ece86afd079..03dbd16f9ad5 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -37,6 +37,9 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev); + } else if ((priv->flags & NFTA_FIB_F_IIF) && + (netif_is_l3_master(dev) || netif_is_l3_slave(dev))) { + fl6->flowi6_oif = dev->ifindex; } if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST) @@ -179,7 +182,8 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; - if (oif && oif != rt->rt6i_idev->dev) + if (oif && oif != rt->rt6i_idev->dev && + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) goto put_rt_err; nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 98ac32b49d8c..91f352427dca 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -96,7 +96,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) addr_type = ipv6_addr_type(daddr); if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || (addr_type & IPV6_ADDR_MAPPED) || - (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if)) + (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if && + l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if)) return -EINVAL; /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 828dd95840b4..731485b18de3 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -539,6 +539,7 @@ csum_copy_err: static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct raw6_sock *rp) { + struct ipv6_txoptions *opt; struct sk_buff *skb; int err = 0; int offset; @@ -556,6 +557,9 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, offset = rp->offset; total_len = inet_sk(sk)->cork.base.length; + opt = inet6_sk(sk)->cork.opt; + total_len -= opt ? opt->opt_flen : 0; + if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); @@ -824,7 +828,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!proto) proto = inet->inet_num; - else if (proto != inet->inet_num) + else if (proto != inet->inet_num && + inet->inet_num != IPPROTO_RAW) return -EINVAL; if (proto > 255) @@ -1252,8 +1257,6 @@ static void raw6_destroy(struct sock *sk) lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); - - inet6_destroy_sock(sk); } static int rawv6_init_sk(struct sock *sk) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 00732ee6bbd8..8eac2c890449 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -88,7 +88,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static int ip6_dst_gc(struct dst_ops *ops); +static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -3207,29 +3207,30 @@ out: return dst; } -static int ip6_dst_gc(struct dst_ops *ops) +static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); - if (time_after(rt_last_gc + rt_min_interval, jiffies) && - entries <= rt_max_size) + if (entries > ops->gc_thresh) + entries = dst_entries_get_slow(ops); + + if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; - return entries > rt_max_size; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, @@ -5217,19 +5218,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { err = __ip6_ins_rt(nh->fib6_info, info, extack); - fib6_info_release(nh->fib6_info); - if (!err) { - /* save reference to last route successfully inserted */ - rt_last = nh->fib6_info; - - /* save reference to first route for notification */ - if (!rt_notif) - rt_notif = nh->fib6_info; - } - - /* nh->fib6_info is used or freed at this point, reset to NULL*/ - nh->fib6_info = NULL; if (err) { if (replace && nhn) NL_SET_ERR_MSG_MOD(extack, @@ -5237,6 +5226,12 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = nh; goto add_errout; } + /* save reference to last route successfully inserted */ + rt_last = nh->fib6_info; + + /* save reference to first route for notification */ + if (!rt_notif) + rt_notif = nh->fib6_info; /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, @@ -5283,8 +5278,7 @@ add_errout: cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { - if (nh->fib6_info) - fib6_info_release(nh->fib6_info); + fib6_info_release(nh->fib6_info); list_del(&nh->next); kfree(nh); } @@ -5408,16 +5402,17 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i) nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); } else { + struct fib6_info *sibling, *next_sibling; struct fib6_nh *nh = f6i->fib6_nh; nexthop_len = 0; if (f6i->fib6_nsiblings) { - nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ - + NLA_ALIGN(sizeof(struct rtnexthop)) - + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(nh->fib_nh_lws); + rt6_nh_nlmsg_size(nh, &nexthop_len); - nexthop_len *= f6i->fib6_nsiblings; + list_for_each_entry_safe(sibling, next_sibling, + &f6i->fib6_siblings, fib6_siblings) { + rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); + } } nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } @@ -6316,7 +6311,7 @@ static int __net_init ip6_route_net_init(struct net *net) #endif net->ipv6.sysctl.flush_delay = 0; - net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; @@ -6325,7 +6320,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: @@ -6359,10 +6354,16 @@ static void __net_exit ip6_route_net_exit(struct net *net) static int __net_init ip6_route_net_init_late(struct net *net) { #ifdef CONFIG_PROC_FS - proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, - sizeof(struct ipv6_route_iter)); - proc_create_net_single("rt6_stats", 0444, net->proc_net, - rt6_stats_seq_show, NULL); + if (!proc_create_net("ipv6_route", 0, net->proc_net, + &ipv6_route_seq_ops, + sizeof(struct ipv6_route_iter))) + return -ENOMEM; + + if (!proc_create_net_single("rt6_stats", 0444, net->proc_net, + rt6_stats_seq_show, NULL)) { + remove_proc_entry("ipv6_route", net->proc_net); + return -ENOMEM; + } #endif return 0; } diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 75421a472d25..7094f8691ac6 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -125,6 +125,11 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) goto out_unlock; } + if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) { + err = -EINVAL; + goto out_unlock; + } + if (hinfo) { err = seg6_hmac_info_del(net, hmackeyid); if (err) @@ -436,22 +441,24 @@ int __init seg6_init(void) { int err = -ENOMEM; - err = genl_register_family(&seg6_genl_family); + err = register_pernet_subsys(&ip6_segments_ops); if (err) goto out; - err = register_pernet_subsys(&ip6_segments_ops); + err = genl_register_family(&seg6_genl_family); if (err) - goto out_unregister_genl; + goto out_unregister_pernet; #ifdef CONFIG_IPV6_SEG6_LWTUNNEL err = seg6_iptunnel_init(); if (err) - goto out_unregister_pernet; + goto out_unregister_genl; err = seg6_local_init(); - if (err) - goto out_unregister_pernet; + if (err) { + seg6_iptunnel_exit(); + goto out_unregister_genl; + } #endif #ifdef CONFIG_IPV6_SEG6_HMAC @@ -472,11 +479,11 @@ out_unregister_iptun: #endif #endif #ifdef CONFIG_IPV6_SEG6_LWTUNNEL -out_unregister_pernet: - unregister_pernet_subsys(&ip6_segments_ops); -#endif out_unregister_genl: genl_unregister_family(&seg6_genl_family); +#endif +out_unregister_pernet: + unregister_pernet_subsys(&ip6_segments_ops); goto out; } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 117d374695fe..8d704ea94693 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1054,12 +1054,13 @@ tx_err: static void ipip6_tunnel_bind_dev(struct net_device *dev) { + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); struct net_device *tdev = NULL; - struct ip_tunnel *tunnel; + int hlen = LL_MAX_HEADER; const struct iphdr *iph; struct flowi4 fl4; - tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; if (iph->daddr) { @@ -1082,12 +1083,15 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev && !netif_is_l3_master(tdev)) { - int t_hlen = tunnel->hlen + sizeof(struct iphdr); + int mtu; - dev->mtu = tdev->mtu - t_hlen; - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; + mtu = tdev->mtu - t_hlen; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + WRITE_ONCE(dev->mtu, mtu); + hlen = tdev->hard_header_len + tdev->needed_headroom; } + dev->needed_headroom = t_hlen + hlen; } static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7e5550546594..8c3beffbaf06 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -180,14 +180,15 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq->af_specific = &tcp_request_sock_ipv6_ops; treq->tfo_listener = false; - if (security_inet_conn_request(sk, skb, req)) - goto out_free; - req->mss = mss; ireq->ir_rmt_port = th->source; ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + + if (security_inet_conn_request(sk, skb, req)) + goto out_free; + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 063898cae3e5..7cb622d300aa 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -106,7 +106,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) if (dst && dst_hold_safe(dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst; - sk->sk_rx_dst = dst; + rcu_assign_pointer(sk->sk_rx_dst, dst); inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); } @@ -264,6 +264,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; + fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; @@ -334,6 +335,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: tcp_set_state(sk, TCP_CLOSE); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; @@ -1316,14 +1319,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_mask(sk, GFP_ATOMIC)); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) { + if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); - skb_set_owner_r(newnp->pktoptions, newsk); - } } } else { if (!req_unhash && found_dup_sk) { @@ -1391,18 +1391,21 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - struct dst_entry *dst = sk->sk_rx_dst; + struct dst_entry *dst; + + dst = rcu_dereference_protected(sk->sk_rx_dst, + lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); - sk->sk_rx_dst = NULL; } } @@ -1470,7 +1473,6 @@ ipv6_pktoptions: if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { @@ -1726,7 +1728,7 @@ do_time_wait: goto discard_it; } -INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) +void tcp_v6_early_demux(struct sk_buff *skb) { const struct ipv6hdr *hdr; const struct tcphdr *th; @@ -1753,7 +1755,7 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { - struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); + struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie); @@ -1846,12 +1848,6 @@ static int tcp_v6_init_sock(struct sock *sk) return 0; } -static void tcp_v6_destroy_sock(struct sock *sk) -{ - tcp_v4_destroy_sock(sk); - inet6_destroy_sock(sk); -} - #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, @@ -2044,7 +2040,7 @@ struct proto tcpv6_prot = { .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, - .destroy = tcp_v6_destroy_sock, + .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, @@ -2081,12 +2077,7 @@ struct proto tcpv6_prot = { .diag_destroy = tcp_abort, }; -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct inet6_protocol tcpv6_protocol = { - .early_demux = tcp_v6_early_demux, - .early_demux_handler = tcp_v6_early_demux, +static const struct inet6_protocol tcpv6_protocol = { .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 040869f45682..e6fdb842e89d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -54,6 +54,19 @@ #include <trace/events/skb.h> #include "udp_impl.h" +static void udpv6_destruct_sock(struct sock *sk) +{ + udp_destruct_common(sk); + inet6_sock_destruct(sk); +} + +int udpv6_init_sock(struct sock *sk) +{ + skb_queue_head_init(&udp_sk(sk)->reader_queue); + sk->sk_destruct = udpv6_destruct_sock; + return 0; +} + static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -74,7 +87,7 @@ static u32 udp6_ehashfn(const struct net *net, fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, - udp_ipv6_hash_secret + net_hash_mix(net)); + udp6_ehash_secret + net_hash_mix(net)); } int udp_v6_get_port(struct sock *sk, unsigned short snum) @@ -889,7 +902,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct dst_entry *dst = skb_dst(skb); int ret; - if (unlikely(sk->sk_rx_dst != dst)) + if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp6_sk_rx_dst_set(sk, dst); if (!uh->check && !udp_sk(sk)->no_check6_rx) { @@ -973,7 +986,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, return NULL; } -INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) +void udp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct udphdr *uh; @@ -1001,7 +1014,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_efree; - dst = READ_ONCE(sk->sk_rx_dst); + dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); @@ -1283,9 +1296,11 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) msg->msg_name = &sin; msg->msg_namelen = sizeof(sin); do_udp_sendmsg: - if (__ipv6_only_sock(sk)) - return -ENETUNREACH; - return udp_sendmsg(sk, msg, len); + err = __ipv6_only_sock(sk) ? + -ENETUNREACH : udp_sendmsg(sk, msg, len); + msg->msg_name = sin6; + msg->msg_namelen = addr_len; + return err; } } @@ -1372,9 +1387,11 @@ do_udp_sendmsg: ipc6.opt = opt; err = udp_cmsg_send(sk, msg, &ipc6.gso_size); - if (err > 0) + if (err > 0) { err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); + connected = false; + } if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1386,7 +1403,6 @@ do_udp_sendmsg: } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; - connected = false; } if (!opt) { opt = txopt_get(np); @@ -1558,8 +1574,6 @@ void udpv6_destroy_sock(struct sock *sk) udp_encap_disable(); } } - - inet6_destroy_sock(sk); } /* @@ -1603,12 +1617,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, } #endif -/* thinking of making this const? Don't. - * early_demux can change based on sysctl. - */ -static struct inet6_protocol udpv6_protocol = { - .early_demux = udp_v6_early_demux, - .early_demux_handler = udp_v6_early_demux, +static const struct inet6_protocol udpv6_protocol = { .handler = udpv6_rcv, .err_handler = udpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, @@ -1668,7 +1677,7 @@ struct proto udpv6_prot = { .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, - .init = udp_init_sock, + .init = udpv6_init_sock, .destroy = udpv6_destroy_sock, .setsockopt = udpv6_setsockopt, .getsockopt = udpv6_getsockopt, diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 20e324b6f358..16516bd69250 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -12,6 +12,7 @@ int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int); int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, __be32, struct udp_table *); +int udpv6_init_sock(struct sock *sk); int udp_v6_get_port(struct sock *sk, unsigned short snum); void udp_v6_rehash(struct sock *sk); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index bf7a7acd39b1..6f6215f4d81d 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -12,6 +12,13 @@ #include <linux/proc_fs.h> #include "udp_impl.h" +static int udplitev6_sk_init(struct sock *sk) +{ + udpv6_init_sock(sk); + udp_sk(sk)->pcflag = UDPLITE_BIT; + return 0; +} + static int udplitev6_rcv(struct sk_buff *skb) { return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); @@ -38,7 +45,7 @@ struct proto udplitev6_prot = { .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, - .init = udplite_sk_init, + .init = udplitev6_sk_init, .destroy = udpv6_destroy_sock, .setsockopt = udpv6_setsockopt, .getsockopt = udpv6_getsockopt, @@ -50,6 +57,8 @@ struct proto udplitev6_prot = { .get_port = udp_v6_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index af7a4b8b1e9c..4c3aa97f23fa 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -120,11 +120,11 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - if (likely(xdst->u.rt6.rt6i_idev)) - in6_dev_put(xdst->u.rt6.rt6i_idev); dst_destroy_metrics_generic(dst); if (xdst->u.rt6.rt6i_uncached_list) rt6_uncached_list_del(&xdst->u.rt6); + if (likely(xdst->u.rt6.rt6i_idev)) + in6_dev_put(xdst->u.rt6.rt6i_idev); xfrm_dst_destroy(xdst); } @@ -289,9 +289,13 @@ int __init xfrm6_init(void) if (ret) goto out_state; - register_pernet_subsys(&xfrm6_net_ops); + ret = register_pernet_subsys(&xfrm6_net_ops); + if (ret) + goto out_protocol; out: return ret; +out_protocol: + xfrm6_protocol_fini(); out_state: xfrm6_state_fini(); out_policy: diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index bbc1924d64e5..652285191da1 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -2455,7 +2455,7 @@ static int __init afiucv_init(void) { int err; - if (MACHINE_IS_VM) { + if (MACHINE_IS_VM && IS_ENABLED(CONFIG_IUCV)) { cpcmd("QUERY USERID", iucv_userid, sizeof(iucv_userid), &err); if (unlikely(err)) { WARN_ON(err); @@ -2463,11 +2463,7 @@ static int __init afiucv_init(void) goto out; } - pr_iucv = try_then_request_module(symbol_get(iucv_if), "iucv"); - if (!pr_iucv) { - printk(KERN_WARNING "iucv_if lookup failed\n"); - memset(&iucv_userid, 0, sizeof(iucv_userid)); - } + pr_iucv = &iucv_if; } else { memset(&iucv_userid, 0, sizeof(iucv_userid)); pr_iucv = NULL; @@ -2501,17 +2497,13 @@ out_sock: out_proto: proto_unregister(&iucv_proto); out: - if (pr_iucv) - symbol_put(iucv_if); return err; } static void __exit afiucv_exit(void) { - if (pr_iucv) { + if (pr_iucv) afiucv_iucv_exit(); - symbol_put(iucv_if); - } unregister_netdevice_notifier(&afiucv_netdev_notifier); dev_remove_pack(&iucv_packet_type); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 9a2d023842fe..8b5b8cc93ff8 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -106,7 +106,7 @@ struct iucv_irq_data { u16 ippathid; u8 ipflags1; u8 iptype; - u32 res2[8]; + u32 res2[9]; }; struct iucv_irq_list { @@ -128,7 +128,7 @@ static LIST_HEAD(iucv_task_queue); * The tasklet for fast delivery of iucv interrupts. */ static void iucv_tasklet_fn(unsigned long); -static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0); +static DECLARE_TASKLET_OLD(iucv_tasklet, iucv_tasklet_fn); /* * Queue of interrupt buffers for delivery via a work queue @@ -179,7 +179,7 @@ static char iucv_error_pathid[16] = "INVALID PATHID"; static LIST_HEAD(iucv_handler_list); /* - * iucv_path_table: an array of iucv_path structures. + * iucv_path_table: array of pointers to iucv_path structures. */ static struct iucv_path **iucv_path_table; static unsigned long iucv_max_pathid; @@ -590,7 +590,7 @@ static int iucv_enable(void) get_online_cpus(); rc = -ENOMEM; - alloc_size = iucv_max_pathid * sizeof(struct iucv_path); + alloc_size = iucv_max_pathid * sizeof(*iucv_path_table); iucv_path_table = kzalloc(alloc_size, GFP_KERNEL); if (!iucv_path_table) goto out; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index ea9e73428ed9..920b0ebf1cb8 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -161,7 +161,8 @@ static void kcm_rcv_ready(struct kcm_sock *kcm) /* Buffer limit is okay now, add to ready list */ list_add_tail(&kcm->wait_rx_list, &kcm->mux->kcm_rx_waiters); - kcm->rx_wait = true; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, true); } static void kcm_rfree(struct sk_buff *skb) @@ -177,7 +178,7 @@ static void kcm_rfree(struct sk_buff *skb) /* For reading rx_wait and rx_psock without holding lock */ smp_mb__after_atomic(); - if (!kcm->rx_wait && !kcm->rx_psock && + if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) && sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); @@ -220,7 +221,7 @@ static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) struct sk_buff *skb; struct kcm_sock *kcm; - while ((skb = __skb_dequeue(head))) { + while ((skb = skb_dequeue(head))) { /* Reset destructor to avoid calling kcm_rcv_ready */ skb->destructor = sock_rfree; skb_orphan(skb); @@ -236,7 +237,8 @@ try_again: if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); /* Commit rx_wait to read in kcm_free */ smp_wmb(); @@ -279,10 +281,12 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); psock->rx_kcm = kcm; - kcm->rx_psock = psock; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_psock, psock); spin_unlock_bh(&mux->rx_lock); @@ -309,7 +313,8 @@ static void unreserve_rx_kcm(struct kcm_psock *psock, spin_lock_bh(&mux->rx_lock); psock->rx_kcm = NULL; - kcm->rx_psock = NULL; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_psock, NULL); /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with * kcm_rfree @@ -1061,15 +1066,18 @@ partial_message: out_error: kcm_push(kcm); - if (copied && sock->type == SOCK_SEQPACKET) { + if (sock->type == SOCK_SEQPACKET) { /* Wrote some bytes before encountering an * error, return partial success. */ - goto partial_message; - } - - if (head != kcm->seq_skb) + if (copied) + goto partial_message; + if (head != kcm->seq_skb) + kfree_skb(head); + } else { kfree_skb(head); + kcm->seq_skb = NULL; + } err = sk_stream_error(sk, msg->msg_flags, err); @@ -1081,53 +1089,18 @@ out_error: return err; } -static struct sk_buff *kcm_wait_data(struct sock *sk, int flags, - long timeo, int *err) -{ - struct sk_buff *skb; - - while (!(skb = skb_peek(&sk->sk_receive_queue))) { - if (sk->sk_err) { - *err = sock_error(sk); - return NULL; - } - - if (sock_flag(sk, SOCK_DONE)) - return NULL; - - if ((flags & MSG_DONTWAIT) || !timeo) { - *err = -EAGAIN; - return NULL; - } - - sk_wait_data(sk, &timeo, NULL); - - /* Handle signals */ - if (signal_pending(current)) { - *err = sock_intr_errno(timeo); - return NULL; - } - } - - return skb; -} - static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { + int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); int err = 0; - long timeo; struct strp_msg *stm; int copied = 0; struct sk_buff *skb; - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - lock_sock(sk); - - skb = kcm_wait_data(sk, flags, timeo, &err); + skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; @@ -1158,14 +1131,11 @@ msg_finished: /* Finished with message */ msg->msg_flags |= MSG_EOR; KCM_STATS_INCR(kcm->stats.rx_msgs); - skb_unlink(skb, &sk->sk_receive_queue); - kfree_skb(skb); } } out: - release_sock(sk); - + skb_free_datagram(sk, skb); return copied ? : err; } @@ -1173,9 +1143,9 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); - long timeo; struct strp_msg *stm; int err = 0; ssize_t copied; @@ -1183,11 +1153,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, /* Only support splice for SOCKSEQPACKET */ - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - lock_sock(sk); - - skb = kcm_wait_data(sk, flags, timeo, &err); + skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto err_out; @@ -1215,13 +1181,11 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, * finish reading the message. */ - release_sock(sk); - + skb_free_datagram(sk, skb); return copied; err_out: - release_sock(sk); - + skb_free_datagram(sk, skb); return err; } @@ -1241,7 +1205,8 @@ static void kcm_recv_disable(struct kcm_sock *kcm) if (!kcm->rx_psock) { if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); } requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); @@ -1312,10 +1277,11 @@ static int kcm_getsockopt(struct socket *sock, int level, int optname, if (get_user(len, optlen)) return -EFAULT; - len = min_t(unsigned int, len, sizeof(int)); if (len < 0) return -EINVAL; + len = min_t(unsigned int, len, sizeof(int)); + switch (optname) { case KCM_RECV_DISABLE: val = kcm->rx_disabled; @@ -1413,12 +1379,6 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->sk = csk; psock->bpf_prog = prog; - err = strp_init(&psock->strp, csk, &cb); - if (err) { - kmem_cache_free(kcm_psockp, psock); - goto out; - } - write_lock_bh(&csk->sk_callback_lock); /* Check if sk_user_data is aready by KCM or someone else. @@ -1426,13 +1386,18 @@ static int kcm_attach(struct socket *sock, struct socket *csock, */ if (csk->sk_user_data) { write_unlock_bh(&csk->sk_callback_lock); - strp_stop(&psock->strp); - strp_done(&psock->strp); kmem_cache_free(kcm_psockp, psock); err = -EALREADY; goto out; } + err = strp_init(&psock->strp, csk, &cb); + if (err) { + write_unlock_bh(&csk->sk_callback_lock); + kmem_cache_free(kcm_psockp, psock); + goto out; + } + psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; @@ -1795,7 +1760,8 @@ static void kcm_done(struct kcm_sock *kcm) if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); } /* Move any pending receive messages to other kcm sockets */ requeue_rx_msgs(mux, &sk->sk_receive_queue); @@ -1840,10 +1806,10 @@ static int kcm_release(struct socket *sock) kcm = kcm_sk(sk); mux = kcm->mux; + lock_sock(sk); sock_orphan(sk); kfree_skb(kcm->seq_skb); - lock_sock(sk); /* Purge queue under lock to avoid race condition with tx_work trying * to act when queue is nonempty. If tx_work runs after this point * it will just return. @@ -2022,6 +1988,8 @@ static __net_exit void kcm_exit_net(struct net *net) * that all multiplexors and psocks have been destroyed. */ WARN_ON(!list_empty(&knet->mux_list)); + + mutex_destroy(&knet->mutex); } static struct pernet_operations kcm_net_ops = { diff --git a/net/key/af_key.c b/net/key/af_key.c index 32fe99cd01fc..ce844919b2eb 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1701,9 +1701,12 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad pfk->registered |= (1<<hdr->sadb_msg_satype); } + mutex_lock(&pfkey_mutex); xfrm_probe_algs(); supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); + mutex_unlock(&pfkey_mutex); + if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<<hdr->sadb_msg_satype); @@ -1849,9 +1852,9 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; - if ((xfilter->sadb_x_filter_splen >= + if ((xfilter->sadb_x_filter_splen > (sizeof(xfrm_address_t) << 3)) || - (xfilter->sadb_x_filter_dplen >= + (xfilter->sadb_x_filter_dplen > (sizeof(xfrm_address_t) << 3))) { mutex_unlock(&pfk->dump_lock); return -EINVAL; @@ -1941,7 +1944,8 @@ static u32 gen_reqid(struct net *net) } static int -parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) +parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_policy *pol, + struct sadb_x_ipsecrequest *rq) { struct net *net = xp_net(xp); struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr; @@ -1959,9 +1963,12 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; t->mode = mode; - if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) + if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) { + if ((mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_BEET) && + pol->sadb_x_policy_dir == IPSEC_DIR_OUTBOUND) + return -EINVAL; t->optional = 1; - else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { + } else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { t->reqid = rq->sadb_x_ipsecrequest_reqid; if (t->reqid > IPSEC_MANUAL_REQID_MAX) t->reqid = 0; @@ -2003,7 +2010,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) rq->sadb_x_ipsecrequest_len < sizeof(*rq)) return -EINVAL; - if ((err = parse_ipsecrequest(xp, rq)) < 0) + if ((err = parse_ipsecrequest(xp, pol, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len); @@ -2906,7 +2913,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t) break; if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); @@ -2924,7 +2931,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!ealg->pfkey_supported) continue; - if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { @@ -2935,16 +2942,17 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } return sz + sizeof(struct sadb_prop); } -static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; + int sz = 0; int i; p = skb_put(skb, sizeof(struct sadb_prop)); @@ -2972,13 +2980,17 @@ static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; + sz += sizeof(*c); } } + + return sz + sizeof(*p); } -static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; + int sz = 0; int i, k; p = skb_put(skb, sizeof(struct sadb_prop)); @@ -3020,8 +3032,11 @@ static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; + sz += sizeof(*c); } } + + return sz + sizeof(*p); } static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) @@ -3151,6 +3166,7 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; + int alg_size = 0; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) @@ -3162,16 +3178,16 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct sizeof(struct sadb_x_policy); if (x->id.proto == IPPROTO_AH) - size += count_ah_combs(t); + alg_size = count_ah_combs(t); else if (x->id.proto == IPPROTO_ESP) - size += count_esp_combs(t); + alg_size = count_esp_combs(t); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } - skb = alloc_skb(size + 16, GFP_ATOMIC); + skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; @@ -3225,10 +3241,13 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct pol->sadb_x_policy_priority = xp->priority; /* Set sadb_comb's. */ + alg_size = 0; if (x->id.proto == IPPROTO_AH) - dump_ah_combs(skb, t); + alg_size = dump_ah_combs(skb, t); else if (x->id.proto == IPPROTO_ESP) - dump_esp_combs(skb, t); + alg_size = dump_esp_combs(skb, t); + + hdr->sadb_msg_len += alg_size / 8; /* security context */ if (xfrm_ctx) { diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 421b2c89ce12..d001e254bada 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1171,8 +1171,10 @@ static void l2tp_tunnel_destruct(struct sock *sk) } /* Remove hooks into tunnel socket */ + write_lock_bh(&sk->sk_callback_lock); sk->sk_destruct = tunnel->old_sk_destruct; sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); /* Call the original destructor */ if (sk->sk_destruct) @@ -1491,20 +1493,27 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, sock = sockfd_lookup(tunnel->fd, &ret); if (!sock) goto err; - - ret = l2tp_validate_socket(sock->sk, net, tunnel->encap); - if (ret < 0) - goto err_sock; } + sk = sock->sk; + write_lock_bh(&sk->sk_callback_lock); + ret = l2tp_validate_socket(sk, net, tunnel->encap); + if (ret < 0) + goto err_inval_sock; + rcu_assign_sk_user_data(sk, tunnel); + write_unlock_bh(&sk->sk_callback_lock); + tunnel->l2tp_net = net; pn = l2tp_pernet(net); + sock_hold(sk); + tunnel->sock = sk; + spin_lock_bh(&pn->l2tp_tunnel_list_lock); list_for_each_entry(tunnel_walk, &pn->l2tp_tunnel_list, list) { if (tunnel_walk->tunnel_id == tunnel->tunnel_id) { spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - + sock_put(sk); ret = -EEXIST; goto err_sock; } @@ -1512,10 +1521,6 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - sk = sock->sk; - sock_hold(sk); - tunnel->sock = sk; - if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { struct udp_tunnel_sock_cfg udp_cfg = { .sk_user_data = tunnel, @@ -1525,8 +1530,6 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, }; setup_udp_tunnel_sock(net, sock, &udp_cfg); - } else { - sk->sk_user_data = tunnel; } tunnel->old_sk_destruct = sk->sk_destruct; @@ -1541,6 +1544,11 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, return 0; err_sock: + write_lock_bh(&sk->sk_callback_lock); + rcu_assign_sk_user_data(sk, NULL); +err_inval_sock: + write_unlock_bh(&sk->sk_callback_lock); + if (tunnel->fd < 0) sock_release(sock); else diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 23e9ce985ed6..3a24614f37be 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -268,8 +268,6 @@ static void l2tp_ip6_destroy_sock(struct sock *sk) if (tunnel) l2tp_tunnel_delete(tunnel); - - inet6_destroy_sock(sk); } static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -523,7 +521,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ if (len > INT_MAX - transhdrlen) return -EMSGSIZE; - ulen = len + transhdrlen; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) @@ -647,6 +644,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: lock_sock(sk); + ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0); err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index c54cb59593ef..7d3c782e5ab1 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1375,11 +1375,11 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, if (get_user(len, optlen)) return -EFAULT; - len = min_t(unsigned int, len, sizeof(int)); - if (len < 0) return -EINVAL; + len = min_t(unsigned int, len, sizeof(int)); + err = -ENOTCONN; if (sk->sk_user_data == NULL) goto end; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 3b1ea89a340e..d57bfce94d60 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -227,6 +227,8 @@ static int llc_ui_release(struct socket *sock) if (llc->dev) dev_put(llc->dev); sock_put(sk); + sock_orphan(sk); + sock->sk = NULL; llc_sk_free(sk); out: return 0; @@ -925,14 +927,15 @@ copy_uaddr: */ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { + DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; + int rc = -EINVAL, copied = 0, hdrlen, hh_len; struct sk_buff *skb = NULL; + struct net_device *dev; size_t size = 0; - int rc = -EINVAL, copied = 0, hdrlen; dprintk("%s: sending from %02X to %02X\n", __func__, llc->laddr.lsap, llc->daddr.lsap); @@ -952,22 +955,29 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (rc) goto out; } - hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); + dev = llc->dev; + hh_len = LL_RESERVED_SPACE(dev); + hdrlen = llc_ui_header_len(sk, addr); size = hdrlen + len; - if (size > llc->dev->mtu) - size = llc->dev->mtu; + size = min_t(size_t, size, READ_ONCE(dev->mtu)); copied = size - hdrlen; rc = -EINVAL; if (copied < 0) goto out; release_sock(sk); - skb = sock_alloc_send_skb(sk, size, noblock, &rc); + skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc); lock_sock(sk); if (!skb) goto out; - skb->dev = llc->dev; + if (sock_flag(sk, SOCK_ZAPPED) || + llc->dev != dev || + hdrlen != llc_ui_header_len(sk, addr) || + hh_len != LL_RESERVED_SPACE(dev) || + size > READ_ONCE(dev->mtu)) + goto out; + skb->dev = dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); - skb_reserve(skb, hdrlen); + skb_reserve(skb, hh_len + hdrlen); rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); if (rc) goto out; diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 64d4bef04e73..4900a27b5176 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -135,22 +135,15 @@ static struct packet_type llc_packet_type __read_mostly = { .func = llc_rcv, }; -static struct packet_type llc_tr_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_TR_802_2), - .func = llc_rcv, -}; - static int __init llc_init(void) { dev_add_pack(&llc_packet_type); - dev_add_pack(&llc_tr_packet_type); return 0; } static void __exit llc_exit(void) { dev_remove_pack(&llc_packet_type); - dev_remove_pack(&llc_tr_packet_type); } module_init(llc_init); diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 82cb93f66b9b..f4fb309185ce 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -127,8 +127,14 @@ static inline int llc_fixup_skb(struct sk_buff *skb) skb->transport_header += llc_len; skb_pull(skb, llc_len); if (skb->protocol == htons(ETH_P_802_2)) { - __be16 pdulen = eth_hdr(skb)->h_proto; - s32 data_size = ntohs(pdulen) - llc_len; + __be16 pdulen; + s32 data_size; + + if (skb->mac_len < ETH_HLEN) + return 0; + + pdulen = eth_hdr(skb)->h_proto; + data_size = ntohs(pdulen) - llc_len; if (data_size < 0 || !pskb_may_pull(skb, data_size)) @@ -162,9 +168,6 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, void (*sta_handler)(struct sk_buff *skb); void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); - if (!net_eq(dev_net(dev), &init_net)) - goto drop; - /* * When the interface is in promisc. mode, drop all the crap that it * receives, do not try to analyse it. diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index 9fa3342c7a82..df26557a0244 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -153,6 +153,9 @@ int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb) int rc = 1; u32 data_size; + if (skb->mac_len < ETH_HLEN) + return 1; + llc_pdu_decode_sa(skb, mac_da); llc_pdu_decode_da(skb, mac_sa); llc_pdu_decode_ssap(skb, &dsap); diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index c29170e767a8..64e2c67e16ba 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -77,6 +77,9 @@ static int llc_station_ac_send_test_r(struct sk_buff *skb) u32 data_size; struct sk_buff *nskb; + if (skb->mac_len < ETH_HLEN) + goto out; + /* The test request command is type U (llc_len = 3) */ data_size = ntohs(eth_hdr(skb)->h_proto) - 3; nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 16f37fd0ac0e..be8c4338e617 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1670,15 +1670,14 @@ static int ieee80211_change_station(struct wiphy *wiphy, } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && - sta->sdata->u.vlan.sta) { - ieee80211_clear_fast_rx(sta); + sta->sdata->u.vlan.sta) RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL); - } if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) ieee80211_vif_dec_num_mcast(sta->sdata); sta->sdata = vlansdata; + ieee80211_check_fast_rx(sta); ieee80211_check_fast_xmit(sta); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { @@ -2581,6 +2580,10 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy, else *dbm = sdata->vif.bss_conf.txpower; + /* INT_MIN indicates no power level was set yet */ + if (*dbm == INT_MIN) + return -EINVAL; + return 0; } @@ -3280,9 +3283,6 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_MESH_POINT: { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - if (params->chandef.width != sdata->vif.bss_conf.chandef.width) - return -EINVAL; - /* changes into another band are not supported */ if (sdata->vif.bss_conf.chandef.chan->band != params->chandef.chan->band) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 0e26c83b6b41..d5b8568591d4 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -542,6 +542,10 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) sdata_assert_lock(sdata); + /* When not connected/joined, sending CSA doesn't make sense. */ + if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) + return -ENOLINK; + /* update cfg80211 bss information with the new channel */ if (!is_zero_ether_addr(ifibss->bssid)) { cbss = cfg80211_get_bss(sdata->local->hw.wiphy, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 7747a6f46d29..3b7151501b3e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -632,6 +632,26 @@ struct mesh_csa_settings { struct cfg80211_csa_settings settings; }; +/** + * struct mesh_table + * + * @known_gates: list of known mesh gates and their mpaths by the station. The + * gate's mpath may or may not be resolved and active. + * @gates_lock: protects updates to known_gates + * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr + * @walk_head: linked list containing all mesh_path objects + * @walk_lock: lock protecting walk_head + * @entries: number of entries in the table + */ +struct mesh_table { + struct hlist_head known_gates; + spinlock_t gates_lock; + struct rhashtable rhead; + struct hlist_head walk_head; + spinlock_t walk_lock; + atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */ +}; + struct ieee80211_if_mesh { struct timer_list housekeeping_timer; struct timer_list mesh_path_timer; @@ -706,8 +726,8 @@ struct ieee80211_if_mesh { /* offset from skb->data while building IE */ int meshconf_offset; - struct mesh_table *mesh_paths; - struct mesh_table *mpp_paths; /* Store paths for MPP&MAP */ + struct mesh_table mesh_paths; + struct mesh_table mpp_paths; /* Store paths for MPP&MAP */ int mesh_paths_generation; int mpp_paths_generation; }; @@ -1460,7 +1480,6 @@ struct ieee802_11_elems { const u8 *supp_rates; const u8 *ds_params; const struct ieee80211_tim_ie *tim; - const u8 *challenge; const u8 *rsn; const u8 *erp_info; const u8 *ext_supp_rates; @@ -1507,7 +1526,6 @@ struct ieee802_11_elems { u8 ssid_len; u8 supp_rates_len; u8 tim_len; - u8 challenge_len; u8 rsn_len; u8 ext_supp_rates_len; u8 wmm_info_len; @@ -1521,6 +1539,8 @@ struct ieee802_11_elems { u8 country_elem_len; u8 bssid_index_len; + void *nontx_profile; + /* whether a parse error occurred while retrieving these elements */ bool parse_error; }; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index f215218a88c9..fa2ac02063cf 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1315,8 +1315,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_led_exit(local); destroy_workqueue(local->workqueue); fail_workqueue: - if (local->wiphy_ciphers_allocated) + if (local->wiphy_ciphers_allocated) { kfree(local->hw.wiphy->cipher_suites); + local->wiphy_ciphers_allocated = false; + } kfree(local->int_scan_req); return result; } @@ -1386,8 +1388,10 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) mutex_destroy(&local->iflist_mtx); mutex_destroy(&local->mtx); - if (local->wiphy_ciphers_allocated) + if (local->wiphy_ciphers_allocated) { kfree(local->hw.wiphy->cipher_suites); + local->wiphy_ciphers_allocated = false; + } idr_for_each(&local->ack_status_frames, ieee80211_free_ack_frame, NULL); diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 953f720754e8..3a610ade2c04 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -127,26 +127,6 @@ struct mesh_path { u32 path_change_count; }; -/** - * struct mesh_table - * - * @known_gates: list of known mesh gates and their mpaths by the station. The - * gate's mpath may or may not be resolved and active. - * @gates_lock: protects updates to known_gates - * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr - * @walk_head: linked list containging all mesh_path objects - * @walk_lock: lock protecting walk_head - * @entries: number of entries in the table - */ -struct mesh_table { - struct hlist_head known_gates; - spinlock_t gates_lock; - struct rhashtable rhead; - struct hlist_head walk_head; - spinlock_t walk_lock; - atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */ -}; - /* Recent multicast cache */ /* RMC_BUCKETS must be a power of 2, maximum 256 */ #define RMC_BUCKETS 256 @@ -306,7 +286,7 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta); void mesh_path_flush_pending(struct mesh_path *mpath); void mesh_path_tx_pending(struct mesh_path *mpath); -int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata); +void mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata); void mesh_pathtbl_unregister(struct ieee80211_sub_if_data *sdata); int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr); void mesh_path_timer(struct timer_list *t); diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index d7ae7415d54d..7e27e5201c54 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -47,32 +47,24 @@ static void mesh_path_rht_free(void *ptr, void *tblptr) mesh_path_free_rcu(tbl, mpath); } -static struct mesh_table *mesh_table_alloc(void) +static void mesh_table_init(struct mesh_table *tbl) { - struct mesh_table *newtbl; + INIT_HLIST_HEAD(&tbl->known_gates); + INIT_HLIST_HEAD(&tbl->walk_head); + atomic_set(&tbl->entries, 0); + spin_lock_init(&tbl->gates_lock); + spin_lock_init(&tbl->walk_lock); - newtbl = kmalloc(sizeof(struct mesh_table), GFP_ATOMIC); - if (!newtbl) - return NULL; - - INIT_HLIST_HEAD(&newtbl->known_gates); - INIT_HLIST_HEAD(&newtbl->walk_head); - atomic_set(&newtbl->entries, 0); - spin_lock_init(&newtbl->gates_lock); - spin_lock_init(&newtbl->walk_lock); - if (rhashtable_init(&newtbl->rhead, &mesh_rht_params)) { - kfree(newtbl); - return NULL; - } - - return newtbl; + /* rhashtable_init() may fail only in case of wrong + * mesh_rht_params + */ + WARN_ON(rhashtable_init(&tbl->rhead, &mesh_rht_params)); } static void mesh_table_free(struct mesh_table *tbl) { rhashtable_free_and_destroy(&tbl->rhead, mesh_path_rht_free, tbl); - kfree(tbl); } /** @@ -240,13 +232,13 @@ static struct mesh_path *mpath_lookup(struct mesh_table *tbl, const u8 *dst, struct mesh_path * mesh_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst) { - return mpath_lookup(sdata->u.mesh.mesh_paths, dst, sdata); + return mpath_lookup(&sdata->u.mesh.mesh_paths, dst, sdata); } struct mesh_path * mpp_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst) { - return mpath_lookup(sdata->u.mesh.mpp_paths, dst, sdata); + return mpath_lookup(&sdata->u.mesh.mpp_paths, dst, sdata); } static struct mesh_path * @@ -283,7 +275,7 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx) struct mesh_path * mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { - return __mesh_path_lookup_by_idx(sdata->u.mesh.mesh_paths, idx); + return __mesh_path_lookup_by_idx(&sdata->u.mesh.mesh_paths, idx); } /** @@ -298,7 +290,7 @@ mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx) struct mesh_path * mpp_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { - return __mesh_path_lookup_by_idx(sdata->u.mesh.mpp_paths, idx); + return __mesh_path_lookup_by_idx(&sdata->u.mesh.mpp_paths, idx); } /** @@ -311,7 +303,7 @@ int mesh_path_add_gate(struct mesh_path *mpath) int err; rcu_read_lock(); - tbl = mpath->sdata->u.mesh.mesh_paths; + tbl = &mpath->sdata->u.mesh.mesh_paths; spin_lock_bh(&mpath->state_lock); if (mpath->is_gate) { @@ -420,7 +412,7 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, if (!new_mpath) return ERR_PTR(-ENOMEM); - tbl = sdata->u.mesh.mesh_paths; + tbl = &sdata->u.mesh.mesh_paths; spin_lock_bh(&tbl->walk_lock); mpath = rhashtable_lookup_get_insert_fast(&tbl->rhead, &new_mpath->rhash, @@ -462,7 +454,7 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata, return -ENOMEM; memcpy(new_mpath->mpp, mpp, ETH_ALEN); - tbl = sdata->u.mesh.mpp_paths; + tbl = &sdata->u.mesh.mpp_paths; spin_lock_bh(&tbl->walk_lock); ret = rhashtable_lookup_insert_fast(&tbl->rhead, @@ -491,7 +483,7 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata, void mesh_plink_broken(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; - struct mesh_table *tbl = sdata->u.mesh.mesh_paths; + struct mesh_table *tbl = &sdata->u.mesh.mesh_paths; static const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; struct mesh_path *mpath; @@ -550,7 +542,7 @@ static void __mesh_path_del(struct mesh_table *tbl, struct mesh_path *mpath) void mesh_path_flush_by_nexthop(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; - struct mesh_table *tbl = sdata->u.mesh.mesh_paths; + struct mesh_table *tbl = &sdata->u.mesh.mesh_paths; struct mesh_path *mpath; struct hlist_node *n; @@ -565,7 +557,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, const u8 *proxy) { - struct mesh_table *tbl = sdata->u.mesh.mpp_paths; + struct mesh_table *tbl = &sdata->u.mesh.mpp_paths; struct mesh_path *mpath; struct hlist_node *n; @@ -599,8 +591,8 @@ static void table_flush_by_iface(struct mesh_table *tbl) */ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) { - table_flush_by_iface(sdata->u.mesh.mesh_paths); - table_flush_by_iface(sdata->u.mesh.mpp_paths); + table_flush_by_iface(&sdata->u.mesh.mesh_paths); + table_flush_by_iface(&sdata->u.mesh.mpp_paths); } /** @@ -646,7 +638,7 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) /* flush relevant mpp entries first */ mpp_flush_by_proxy(sdata, addr); - err = table_path_del(sdata->u.mesh.mesh_paths, sdata, addr); + err = table_path_del(&sdata->u.mesh.mesh_paths, sdata, addr); sdata->u.mesh.mesh_paths_generation++; return err; } @@ -684,7 +676,7 @@ int mesh_path_send_to_gates(struct mesh_path *mpath) struct mesh_path *gate; bool copy = false; - tbl = sdata->u.mesh.mesh_paths; + tbl = &sdata->u.mesh.mesh_paths; rcu_read_lock(); hlist_for_each_entry_rcu(gate, &tbl->known_gates, gate_list) { @@ -720,7 +712,7 @@ int mesh_path_send_to_gates(struct mesh_path *mpath) void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - kfree_skb(skb); + ieee80211_free_txskb(&sdata->local->hw, skb); sdata->u.mesh.mshstats.dropped_frames_no_route++; } @@ -764,29 +756,10 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop) mesh_path_tx_pending(mpath); } -int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata) +void mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata) { - struct mesh_table *tbl_path, *tbl_mpp; - int ret; - - tbl_path = mesh_table_alloc(); - if (!tbl_path) - return -ENOMEM; - - tbl_mpp = mesh_table_alloc(); - if (!tbl_mpp) { - ret = -ENOMEM; - goto free_path; - } - - sdata->u.mesh.mesh_paths = tbl_path; - sdata->u.mesh.mpp_paths = tbl_mpp; - - return 0; - -free_path: - mesh_table_free(tbl_path); - return ret; + mesh_table_init(&sdata->u.mesh.mesh_paths); + mesh_table_init(&sdata->u.mesh.mpp_paths); } static @@ -808,12 +781,12 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata, void mesh_path_expire(struct ieee80211_sub_if_data *sdata) { - mesh_path_tbl_expire(sdata, sdata->u.mesh.mesh_paths); - mesh_path_tbl_expire(sdata, sdata->u.mesh.mpp_paths); + mesh_path_tbl_expire(sdata, &sdata->u.mesh.mesh_paths); + mesh_path_tbl_expire(sdata, &sdata->u.mesh.mpp_paths); } void mesh_pathtbl_unregister(struct ieee80211_sub_if_data *sdata) { - mesh_table_free(sdata->u.mesh.mesh_paths); - mesh_table_free(sdata->u.mesh.mpp_paths); + mesh_table_free(&sdata->u.mesh.mesh_paths); + mesh_table_free(&sdata->u.mesh.mpp_paths); } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 737c5f4dbf52..def34c843f29 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1044,8 +1044,8 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, case WLAN_SP_MESH_PEERING_OPEN: if (!matches_local) event = OPN_RJCT; - if (!mesh_plink_free_count(sdata) || - (sta->mesh->plid && sta->mesh->plid != plid)) + else if (!mesh_plink_free_count(sdata) || + (sta->mesh->plid && sta->mesh->plid != plid)) event = OPN_IGNR; else event = OPN_ACPT; @@ -1053,9 +1053,9 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, case WLAN_SP_MESH_PEERING_CONFIRM: if (!matches_local) event = CNF_RJCT; - if (!mesh_plink_free_count(sdata) || - sta->mesh->llid != llid || - (sta->mesh->plid && sta->mesh->plid != plid)) + else if (!mesh_plink_free_count(sdata) || + sta->mesh->llid != llid || + (sta->mesh->plid && sta->mesh->plid != plid)) event = CNF_IGNR; else event = CNF_ACPT; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5415e566e09d..b48a09043663 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2829,14 +2829,14 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data; + const struct element *challenge; u8 *pos; - struct ieee802_11_elems elems; u32 tx_flags = 0; pos = mgmt->u.auth.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, - mgmt->bssid, auth_data->bss->bssid); - if (!elems.challenge) + challenge = cfg80211_find_elem(WLAN_EID_CHALLENGE, pos, + len - (pos - (u8 *)mgmt)); + if (!challenge) return; auth_data->expected_transaction = 4; drv_mgd_prepare_tx(sdata->local, sdata, 0); @@ -2844,7 +2844,8 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0, - elems.challenge - 2, elems.challenge_len + 2, + (void *)challenge, + challenge->datalen + sizeof(*challenge), auth_data->bss->bssid, auth_data->bss->bssid, auth_data->key, auth_data->key_len, auth_data->key_idx, tx_flags); @@ -3223,7 +3224,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, pos = mgmt->u.assoc_resp.variable; ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, - mgmt->bssid, assoc_data->bss->bssid); + mgmt->bssid, NULL); if (!elems.supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); @@ -3298,6 +3299,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); } + kfree(bss_elems.nontx_profile); } /* @@ -3575,7 +3577,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, pos = mgmt->u.assoc_resp.variable; ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, - mgmt->bssid, assoc_data->bss->bssid); + mgmt->bssid, NULL); if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && elems.timeout_int && @@ -3882,6 +3884,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ifmgd->assoc_data->timeout = jiffies; ifmgd->assoc_data->timeout_started = true; run_again(sdata, ifmgd->assoc_data->timeout); + kfree(elems.nontx_profile); return; } @@ -4049,7 +4052,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, deauth_buf, sizeof(deauth_buf), true, WLAN_REASON_DEAUTH_LEAVING); - return; + goto free; } if (sta && elems.opmode_notif) @@ -4064,6 +4067,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, elems.cisco_dtpc_elem); ieee80211_bss_info_change_notify(sdata, changed); +free: + kfree(elems.nontx_profile); } void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 344b2c22e75b..ee65f1f50a0a 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -216,6 +216,8 @@ ieee80211_bss_info_update(struct ieee80211_local *local, rx_status, beacon); } + kfree(elems.nontx_profile); + return bss; } @@ -431,10 +433,6 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); - if (scan_req != local->int_scan_req) { - local->scan_info.aborted = aborted; - cfg80211_scan_done(scan_req, &local->scan_info); - } RCU_INIT_POINTER(local->scan_req, NULL); scan_sdata = rcu_dereference_protected(local->scan_sdata, @@ -444,6 +442,13 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) local->scanning = 0; local->scan_chandef.chan = NULL; + synchronize_rcu(); + + if (scan_req != local->int_scan_req) { + local->scan_info.aborted = aborted; + cfg80211_scan_done(scan_req, &local->scan_info); + } + /* Set power back to normal operating levels. */ ieee80211_hw_config(local, 0); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 7b2e8c890381..e330036e02ea 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -683,6 +683,8 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_accept_plinks_update(sdata); + ieee80211_check_fast_xmit(sta); + return 0; out_remove: sta_info_hash_del(local, sta); @@ -1023,7 +1025,8 @@ static int __must_check __sta_info_destroy_part1(struct sta_info *sta) list_del_rcu(&sta->list); sta->removed = true; - drv_sta_pre_rcu_remove(local, sta->sdata, sta); + if (sta->uploaded) + drv_sta_pre_rcu_remove(local, sta->sdata, sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && rcu_access_pointer(sdata->u.vlan.sta) == sta) @@ -2121,7 +2124,7 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { - u16 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); + u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index d82d22b6a2a9..5fd9a6f752a1 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -651,7 +651,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) } if (unlikely(tx->key && tx->key->flags & KEY_FLAG_TAINTED && - !ieee80211_is_deauth(hdr->frame_control))) + !ieee80211_is_deauth(hdr->frame_control)) && + tx->skb->protocol != tx->sdata->control_port_protocol) return TX_DROP; if (!skip_hw && tx->key && @@ -2918,7 +2919,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) sdata->vif.type == NL80211_IFTYPE_STATION) goto out; - if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED) || !sta->uploaded) goto out; if (test_sta_flag(sta, WLAN_STA_PS_STA) || diff --git a/net/mac80211/util.c b/net/mac80211/util.c index c1c117fdf318..6223af1c3457 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1006,10 +1006,6 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, } else elem_parse_failed = true; break; - case WLAN_EID_CHALLENGE: - elems->challenge = pos; - elems->challenge_len = elen; - break; case WLAN_EID_VENDOR_SPECIFIC: if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 && pos[2] == 0xf2) { @@ -1289,6 +1285,8 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, start, len) { if (elem->datalen < 2) continue; + if (elem->data[0] < 1 || elem->data[0] > 8) + continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 new_bssid[ETH_ALEN]; @@ -1365,6 +1363,11 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, nontransmitted_profile, nontransmitted_profile_len); + if (!nontransmitted_profile_len) { + nontransmitted_profile_len = 0; + kfree(nontransmitted_profile); + nontransmitted_profile = NULL; + } } crc = _ieee802_11_parse_elems_crc(start, len, action, elems, filter, @@ -1394,7 +1397,7 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, offsetofend(struct ieee80211_bssid_index, dtim_count)) elems->dtim_count = elems->bssid_index->dtim_count; - kfree(nontransmitted_profile); + elems->nontx_profile = nontransmitted_profile; return crc; } diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index ace44ff96635..d6e58506136b 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -141,12 +141,14 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { + const struct ethhdr *eth = (void *)skb->data; struct mac80211_qos_map *qos_map; bool qos; /* all mesh/ocb stations are required to support WME */ - if (sta && (sdata->vif.type == NL80211_IFTYPE_MESH_POINT || - sdata->vif.type == NL80211_IFTYPE_OCB)) + if ((sdata->vif.type == NL80211_IFTYPE_MESH_POINT && + !is_multicast_ether_addr(eth->h_dest)) || + (sdata->vif.type == NL80211_IFTYPE_OCB && sta)) qos = true; else if (sta) qos = sta->sta.wme; diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 1cf5ac09edcb..a08240fe68a7 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -661,6 +661,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, sdata->dev = ndev; sdata->wpan_dev.wpan_phy = local->hw.phy; sdata->local = local; + INIT_LIST_HEAD(&sdata->wpan_dev.list); /* setup type-dependent data */ ret = ieee802154_setup_sdata(sdata, type); diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index b8ce84618a55..726b47a4611b 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -44,7 +44,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, switch (mac_cb(skb)->dest.mode) { case IEEE802154_ADDR_NONE: - if (mac_cb(skb)->dest.mode != IEEE802154_ADDR_NONE) + if (hdr->source.mode != IEEE802154_ADDR_NONE) /* FIXME: check if we are PAN coordinator */ skb->pkt_type = PACKET_OTHERHOST; else @@ -132,7 +132,7 @@ static int ieee802154_parse_frame_start(struct sk_buff *skb, struct ieee802154_hdr *hdr) { int hlen; - struct ieee802154_mac_cb *cb = mac_cb_init(skb); + struct ieee802154_mac_cb *cb = mac_cb(skb); skb_reset_mac_header(skb); @@ -294,8 +294,9 @@ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi) { struct ieee802154_local *local = hw_to_local(hw); + struct ieee802154_mac_cb *cb = mac_cb_init(skb); - mac_cb(skb)->lqi = lqi; + cb->lqi = lqi; skb->pkt_type = IEEE802154_RX_MSG; skb_queue_tail(&local->skb_queue, skb); tasklet_schedule(&local->tasklet); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index d5e3656fc67c..3a55a392e021 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1428,6 +1428,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, free: kfree(table); out: + mdev->sysctl = NULL; return -ENOBUFS; } @@ -1437,6 +1438,9 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev, struct net *net = dev_net(dev); struct ctl_table *table; + if (!mdev->sysctl) + return; + table = mdev->sysctl->ctl_table_arg; unregister_net_sysctl_table(mdev->sysctl); kfree(table); diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index ad3fd7f1da75..1dde6dc841b8 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -81,9 +81,12 @@ enum { struct ncsi_channel_version { - u32 version; /* Supported BCD encoded NCSI version */ - u32 alpha2; /* Supported BCD encoded NCSI version */ - u8 fw_name[12]; /* Firware name string */ + u8 major; /* NCSI version major */ + u8 minor; /* NCSI version minor */ + u8 update; /* NCSI version update */ + char alpha1; /* NCSI version alpha1 */ + char alpha2; /* NCSI version alpha2 */ + u8 fw_name[12]; /* Firmware name string */ u32 fw_version; /* Firmware version */ u16 pci_ids[4]; /* PCI identification */ u32 mf_id; /* Manufacture ID */ diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index b635c194f0a8..62fb1031763d 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -165,6 +165,7 @@ static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp, nc->state = NCSI_CHANNEL_INACTIVE; list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); + nc->modes[NCSI_MODE_TX_ENABLE].enable = 0; return ncsi_process_next_channel(ndp); } diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 9bd12f7517ed..6710f6b8764b 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -770,9 +770,6 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) return -1; } - /* Set the flag for GMA command which should only be called once */ - nca->ndp->gma_flag = 1; - /* Get Mac address from NCSI device */ return nch->handler(nca); } diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index 27700887c321..feb0b422d193 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -71,8 +71,8 @@ static int ncsi_write_channel_info(struct sk_buff *skb, if (nc == nc->package->preferred_channel) nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED); - nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version); - nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MINOR, nc->version.alpha2); + nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.major); + nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MINOR, nc->version.minor); nla_put_string(skb, NCSI_CHANNEL_ATTR_VERSION_STR, nc->version.fw_name); vid_nest = nla_nest_start_noflag(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 80938b338fee..3fbea7e74fb1 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -191,9 +191,12 @@ struct ncsi_rsp_gls_pkt { /* Get Version ID */ struct ncsi_rsp_gvi_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ - __be32 ncsi_version; /* NCSI version */ + unsigned char major; /* NCSI version major */ + unsigned char minor; /* NCSI version minor */ + unsigned char update; /* NCSI version update */ + unsigned char alpha1; /* NCSI version alpha1 */ unsigned char reserved[3]; /* Reserved */ - unsigned char alpha2; /* NCSI version */ + unsigned char alpha2; /* NCSI version alpha2 */ unsigned char fw_name[12]; /* f/w name string */ __be32 fw_version; /* f/w version */ __be16 pci_ids[4]; /* PCI IDs */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 7c893c379920..876622e9a5b2 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -19,6 +19,19 @@ #include "ncsi-pkt.h" #include "ncsi-netlink.h" +/* Nibbles within [0xA, 0xF] add zero "0" to the returned value. + * Optional fields (encoded as 0xFF) will default to zero. + */ +static u8 decode_bcd_u8(u8 x) +{ + int lo = x & 0xF; + int hi = x >> 4; + + lo = lo < 0xA ? lo : 0; + hi = hi < 0xA ? hi : 0; + return lo + hi * 10; +} + static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, unsigned short payload) { @@ -627,6 +640,9 @@ static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr) saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN); + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); @@ -671,6 +687,9 @@ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) return -ENXIO; + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); @@ -749,9 +768,18 @@ static int ncsi_rsp_handler_gvi(struct ncsi_request *nr) if (!nc) return -ENODEV; - /* Update to channel's version info */ + /* Update channel's version info + * + * Major, minor, and update fields are supposed to be + * unsigned integers encoded as packed BCD. + * + * Alpha1 and alpha2 are ISO/IEC 8859-1 characters. + */ ncv = &nc->version; - ncv->version = ntohl(rsp->ncsi_version); + ncv->major = decode_bcd_u8(rsp->major); + ncv->minor = decode_bcd_u8(rsp->minor); + ncv->update = decode_bcd_u8(rsp->update); + ncv->alpha1 = rsp->alpha1; ncv->alpha2 = rsp->alpha2; memcpy(ncv->fw_name, rsp->fw_name, 12); ncv->fw_version = ntohl(rsp->fw_version); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ef72819d9d31..d569915da003 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -118,7 +118,6 @@ config NF_CONNTRACK_ZONES config NF_CONNTRACK_PROCFS bool "Supply CT list in procfs (OBSOLETE)" - default y depends on PROC_FS ---help--- This option enables for the list of known conntrack entries diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 451b2df998ea..7c38b8fdc7e0 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -290,12 +290,6 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum)) return NULL; return net->nf.hooks_ipv6 + hooknum; -#if IS_ENABLED(CONFIG_DECNET) - case NFPROTO_DECNET: - if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum)) - return NULL; - return net->nf.hooks_decnet + hooknum; -#endif default: WARN_ON_ONCE(1); return NULL; @@ -577,9 +571,11 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); - BUG_ON(ct_hook == NULL); - ct_hook->destroy(nfct); + if (ct_hook) + ct_hook->destroy(nfct); rcu_read_unlock(); + + WARN_ON(!ct_hook); } EXPORT_SYMBOL(nf_conntrack_destroy); @@ -625,10 +621,6 @@ static int __net_init netfilter_net_init(struct net *net) #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); #endif -#if IS_ENABLED(CONFIG_DECNET) - __netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet)); -#endif - #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index bfd4b42ba305..288675ce22d0 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -28,6 +28,7 @@ #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype MTYPE #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) @@ -57,9 +58,6 @@ mtype_destroy(struct ip_set *set) { struct mtype *map = set->data; - if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); - if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); ip_set_free(map->members); @@ -288,6 +286,15 @@ mtype_gc(struct timer_list *t) add_timer(&map->gc); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + del_timer_sync(&map->gc); +} + static const struct ip_set_type_variant mtype = { .kadt = mtype_kadt, .uadt = mtype_uadt, @@ -301,6 +308,7 @@ static const struct ip_set_type_variant mtype = { .head = mtype_head, .list = mtype_list, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, }; #endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 6e3cf4d19ce8..194d775ce4cc 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -296,8 +296,8 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask); - hosts = 2 << (32 - netmask - 1); - elements = 2 << (netmask - mask_bits - 1); + hosts = 2U << (32 - netmask - 1); + elements = 2UL << (netmask - mask_bits - 1); } if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 16ae770f049d..544106475d4f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -61,6 +61,8 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); ip_set_dereference((inst)->ip_set_list)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] +#define ip_set_dereference_nfnl(p) \ + rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is @@ -530,6 +532,14 @@ __ip_set_put(struct ip_set *set) /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ +static void +__ip_set_get_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + set->ref_netlink++; + write_unlock_bh(&ip_set_ref_lock); +} + static inline void __ip_set_put_netlink(struct ip_set *set) { @@ -548,15 +558,10 @@ __ip_set_put_netlink(struct ip_set *set) static inline struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { - struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); - rcu_read_lock(); - /* ip_set_list itself needs to be protected */ - set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock(); - - return set; + /* ip_set_list and the set pointer need to be protected */ + return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } static inline void @@ -811,20 +816,9 @@ static struct nlmsghdr * start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, enum ipset_cmd cmd) { - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - - nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), - sizeof(*nfmsg), flags); - if (!nlh) - return NULL; - - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = NFPROTO_IPV4; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - return nlh; + return nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags, + NFPROTO_IPV4, NFNETLINK_V0, 0); } /* Create a set */ @@ -1012,6 +1006,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, return ret; cleanup: + set->variant->cancel_gc(set); set->variant->destroy(set); put_out: module_put(set->type->me); @@ -1040,6 +1035,14 @@ ip_set_destroy_set(struct ip_set *set) kfree(set); } +static void +ip_set_destroy_set_rcu(struct rcu_head *head) +{ + struct ip_set *set = container_of(head, struct ip_set, rcu); + + ip_set_destroy_set(set); +} + static int ip_set_destroy(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[], @@ -1053,8 +1056,6 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* Must wait for flush to be really finished in list:set */ - rcu_barrier(); /* Commands are serialized and references are * protected by the ip_set_ref_lock. @@ -1066,8 +1067,10 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, * counter, so if it's already zero, we can proceed * without holding the lock. */ - read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { + /* Must wait for flush to be really finished in list:set */ + rcu_barrier(); + read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && (s->ref || s->ref_netlink)) { @@ -1081,12 +1084,17 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, s = ip_set(inst, i); if (s) { ip_set(inst, i) = NULL; + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); ip_set_destroy_set(s); } } /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { + u16 features = 0; + + read_lock_bh(&ip_set_ref_lock); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { @@ -1096,10 +1104,16 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, ret = -IPSET_ERR_BUSY; goto out; } + features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); - - ip_set_destroy_set(s); + if (features & IPSET_TYPE_NAME) { + /* Must wait for flush to be really finished */ + rcu_barrier(); + } + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); + call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; out: @@ -1539,6 +1553,14 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { + if (retried) { + __ip_set_get_netlink(set); + nfnl_unlock(NFNL_SUBSYS_IPSET); + cond_resched(); + nfnl_lock(NFNL_SUBSYS_IPSET); + __ip_set_put_netlink(set); + } + ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); @@ -2215,6 +2237,7 @@ ip_set_net_exit(struct net *net) set = ip_set(inst, i); if (set) { ip_set(inst, i) = NULL; + set->variant->cancel_gc(set); ip_set_destroy_set(set); } } @@ -2262,8 +2285,11 @@ ip_set_fini(void) { nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - unregister_pernet_subsys(&ip_set_net_ops); + + /* Wait for call_rcu() in destroy */ + rcu_barrier(); + pr_debug("these are the famous last words\n"); } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 1b44dfa7ba85..4346cae25a4a 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -235,6 +235,7 @@ htable_size(u8 hbits) #undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init +#undef mtype_cancel_gc #undef mtype_variant #undef mtype_data_match @@ -279,6 +280,7 @@ htable_size(u8 hbits) #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) @@ -444,7 +446,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference(hbucket(t, i)); + n = (__force struct hbucket *)hbucket(t, i); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) @@ -464,10 +466,7 @@ mtype_destroy(struct ip_set *set) struct htype *h = set->data; struct list_head *l, *lt; - if (SET_WITH_TIMEOUT(set)) - cancel_delayed_work_sync(&h->gc.dwork); - - mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); + mtype_ahash_destroy(set, (__force struct htable *)h->table, true); list_for_each_safe(l, lt, &h->ad) { list_del(l); kfree(l); @@ -613,6 +612,15 @@ mtype_gc_init(struct htable_gc *gc) queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct htype *h = set->data; + + if (SET_WITH_TIMEOUT(set)) + cancel_delayed_work_sync(&h->gc.dwork); +} + static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags); @@ -1433,6 +1441,7 @@ static const struct ip_set_type_variant mtype_variant = { .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, .region_lock = true, }; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index a82b70e8b9a6..aed319815358 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -35,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:net,port,net"); #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS #define IPSET_NET_COUNT 2 +#define IP_SET_HASH_WITH_NET0 /* IPv4 variant */ diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 63908123f7ba..64cc3e2131f3 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set) struct list_set *map = set->data; struct set_elem *e, *n; - if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); - list_for_each_entry_safe(e, n, &map->members, list) { list_del(&e->list); ip_set_put_byindex(map->net, e->id); @@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) a->extensions == b->extensions; } +static void +list_set_cancel_gc(struct ip_set *set) +{ + struct list_set *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + del_timer_sync(&map->gc); +} + static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, @@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = { .head = list_set_head, .list = list_set_list, .same_set = list_set_same_set, + .cancel_gc = list_set_cancel_gc, }; static void diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index f9b16f2b2219..fdacbc3c15be 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -599,13 +599,19 @@ static const struct seq_operations ip_vs_app_seq_ops = { int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { INIT_LIST_HEAD(&ipvs->app_list); - proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops, - sizeof(struct seq_net_private)); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, + &ip_vs_app_seq_ops, + sizeof(struct seq_net_private))) + return -ENOMEM; +#endif return 0; } void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { unregister_ip_vs_app(ipvs, NULL /* all */); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_app", ipvs->net->proc_net); +#endif } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a189079a6ea5..d66548d2e5de 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1225,8 +1225,8 @@ static inline int todrop_entry(struct ip_vs_conn *cp) * The drop rate array needs tuning for real environments. * Called from timer bh only => no locking */ - static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static char todrop_counter[9] = {0}; + static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static signed char todrop_counter[9] = {0}; int i; /* if the conn entry hasn't lasted for 60 seconds, don't drop it. @@ -1373,20 +1373,36 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { atomic_set(&ipvs->conn_count, 0); - proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, - &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state)); - proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net, - &ip_vs_conn_sync_seq_ops, - sizeof(struct ip_vs_iter_state)); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, + &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn; + + if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net, + &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn_sync; +#endif + return 0; + +#ifdef CONFIG_PROC_FS +err_conn_sync: + remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); +err_conn: + return -ENOMEM; +#endif } void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) { /* flush all the connection entries first */ ip_vs_conn_flush(ipvs); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net); +#endif } int __init ip_vs_conn_init(void) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 07242503d74d..2bc82dabfe3b 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1759,6 +1759,7 @@ static int proc_do_sync_threshold(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val[2]; int rc; @@ -1768,6 +1769,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, .mode = table->mode, }; + mutex_lock(&ipvs->sync_mutex); memcpy(val, valp, sizeof(val)); rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { @@ -1777,6 +1779,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, else memcpy(valp, val, sizeof(val)); } + mutex_unlock(&ipvs->sync_mutex); return rc; } @@ -4034,6 +4037,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx].extra2 = ipvs; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index a0921adc31a9..1e689c714127 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -126,7 +126,8 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, if (sctph->source != cp->vport || payload_csum || skb->ip_summed == CHECKSUM_PARTIAL) { sctph->source = cp->vport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else { skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -174,7 +175,8 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, (skb->ip_summed == CHECKSUM_PARTIAL && !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_UNNECESSARY; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 0c1bc654245c..fb1dc205e3b5 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1444,7 +1444,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) sin.sin_addr.s_addr = addr; sin.sin_port = 0; - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); + return kernel_bind(sock, (struct sockaddr *)&sin, sizeof(sin)); } static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, @@ -1510,8 +1510,8 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id, } get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - salen, 0); + result = kernel_connect(sock, (struct sockaddr *)&mcast_addr, + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; @@ -1551,7 +1551,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id, get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); sock->sk->sk_bound_dev_if = dev->ifindex; - result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); + result = kernel_bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index cefc39878b1a..5c81772158d8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -271,7 +271,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } @@ -286,7 +286,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, { if (ip_hdr(skb)->ttl <= 1) { /* Tell the sender its packet died... */ - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); return false; } @@ -1231,6 +1231,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; @@ -1379,6 +1380,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d9b6f2001d00..fb1f12bb0ff2 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1943,6 +1943,9 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, return 0; helper = rcu_dereference(help->helper); + if (!helper) + return 0; + if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) return 0; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 9eca90414bb7..b22801f97bce 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -382,7 +382,7 @@ static int help(struct sk_buff *skb, int ret; u32 seq; int dir = CTINFO2DIR(ctinfo); - unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff); + unsigned int matchlen, matchoff; struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct); struct nf_conntrack_expect *exp; union nf_inet_addr *daddr; diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 573cb4481481..814857ae3b81 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -533,6 +533,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Get fields bitmap */ if (nf_h323_error_boundary(bs, 0, f->sz)) return H323_ERROR_BOUND; + if (f->sz > 32) + return H323_ERROR_RANGE; bmp = get_bitmap(bs, f->sz); if (base) *(unsigned int *)base = bmp; @@ -589,6 +591,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, bmp2_len = get_bits(bs, 7) + 1; if (nf_h323_error_boundary(bs, 0, bmp2_len)) return H323_ERROR_BOUND; + if (bmp2_len > 32) + return H323_ERROR_RANGE; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; if (base) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 118f415928ae..32cc91f5ba99 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -404,6 +404,9 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); + if (!nf_ct_helper_hash) + return -ENOENT; + if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; @@ -587,4 +590,5 @@ void nf_conntrack_helper_fini(void) { nf_ct_extend_unregister(&helper_extend); kvfree(nf_ct_helper_hash); + nf_ct_helper_hash = NULL; } diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index e40988a2f22f..65b5b05fe38d 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -148,15 +148,37 @@ static int help(struct sk_buff *skb, unsigned int protoff, data = ib_ptr; data_limit = ib_ptr + skb->len - dataoff; - /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 - * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ - while (data < data_limit - (19 + MINMATCHLEN)) { - if (memcmp(data, "\1DCC ", 5)) { + /* Skip any whitespace */ + while (data < data_limit - 10) { + if (*data == ' ' || *data == '\r' || *data == '\n') + data++; + else + break; + } + + /* strlen("PRIVMSG x ")=10 */ + if (data < data_limit - 10) { + if (strncasecmp("PRIVMSG ", data, 8)) + goto out; + data += 8; + } + + /* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26 + * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26 + */ + while (data < data_limit - (21 + MINMATCHLEN)) { + /* Find first " :", the start of message */ + if (memcmp(data, " :", 2)) { data++; continue; } + data += 2; + + /* then check that place only for the DCC command */ + if (memcmp(data, "\1DCC ", 5)) + goto out; data += 5; - /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ + /* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */ iph = ip_hdr(skb); pr_debug("DCC found in master %pI4:%u %pI4:%u\n", @@ -172,7 +194,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, pr_debug("DCC %s detected\n", dccprotos[i]); /* we have at least - * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid + * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid * data left (== 14/13 bytes) */ if (parse_dcc(data, data_limit, &dcc_ip, &dcc_port, &addr_beg_p, &addr_end_p)) { @@ -185,8 +207,9 @@ static int help(struct sk_buff *skb, unsigned int protoff, /* dcc_ip can be the internal OR external (NAT'ed) IP */ tuple = &ct->tuplehash[dir].tuple; - if (tuple->src.u3.ip != dcc_ip && - tuple->dst.u3.ip != dcc_ip) { + if ((tuple->src.u3.ip != dcc_ip && + ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) || + dcc_port == 0) { net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", &tuple->src.u3.ip, &dcc_ip, dcc_port); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index bc6f0c8874f8..45d02185f4b9 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -515,20 +515,15 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct nlattr *nest_parms; unsigned int flags = portid ? NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct), + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = nf_ct_l3num(ct); - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); @@ -685,7 +680,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) const struct nf_conntrack_zone *zone; struct net *net; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct nlattr *nest_parms; struct nf_conn *ct = item->ct; struct sk_buff *skb; @@ -715,15 +709,11 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type); - nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct), + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = nf_ct_l3num(ct); - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); @@ -1229,9 +1219,6 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) { - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) - return 0; - return ctnetlink_filter_match(ct, data); } @@ -1294,11 +1281,6 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, ct = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) { - nf_ct_put(ct); - return -EBUSY; - } - if (cda[CTA_ID]) { __be32 id = nla_get_be32(cda[CTA_ID]); @@ -2086,12 +2068,15 @@ ctnetlink_create_conntrack(struct net *net, err = nf_conntrack_hash_check_insert(ct); if (err < 0) - goto err2; + goto err3; rcu_read_unlock(); return ct; +err3: + if (ct->master) + nf_ct_put(ct->master); err2: rcu_read_unlock(); err1: @@ -2205,20 +2190,15 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, __u16 cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS_CPU); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, htons(cpu)); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || @@ -2289,20 +2269,15 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct net *net) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; unsigned int nr_conntracks = atomic_read(&net->ct.count); event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; @@ -2709,7 +2684,9 @@ nla_put_failure: return -1; } +#if IS_ENABLED(CONFIG_NF_NAT) static const union nf_inet_addr any_addr; +#endif static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) { @@ -2806,19 +2783,14 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct nf_conntrack_expect *exp) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, + exp->tuple.src.l3num, NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = exp->tuple.src.l3num; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; @@ -2838,7 +2810,6 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) struct nf_conntrack_expect *exp = item->exp; struct net *net = nf_ct_exp_net(exp); struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct sk_buff *skb; unsigned int type, group; int flags = 0; @@ -2861,15 +2832,11 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type); - nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, + exp->tuple.src.l3num, NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = exp->tuple.src.l3num; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; @@ -3209,10 +3176,12 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x, return 0; } +#if IS_ENABLED(CONFIG_NF_NAT) static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, }; +#endif static int ctnetlink_parse_expect_nat(const struct nlattr *attr, @@ -3437,20 +3406,15 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_EXP_GET_STATS_CPU); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, htons(cpu)); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) || nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) || nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete))) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index b3f4a334f9d7..67b8dedef293 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -430,9 +430,19 @@ static bool dccp_error(const struct dccp_hdr *dh, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { + static const unsigned long require_seq48 = 1 << DCCP_PKT_REQUEST | + 1 << DCCP_PKT_RESPONSE | + 1 << DCCP_PKT_CLOSEREQ | + 1 << DCCP_PKT_CLOSE | + 1 << DCCP_PKT_RESET | + 1 << DCCP_PKT_SYNC | + 1 << DCCP_PKT_SYNCACK; unsigned int dccp_len = skb->len - dataoff; unsigned int cscov; const char *msg; + u8 type; + + BUILD_BUG_ON(DCCP_PKT_INVALID >= BITS_PER_LONG); if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || dh->dccph_doff * 4 > dccp_len) { @@ -457,10 +467,17 @@ static bool dccp_error(const struct dccp_hdr *dh, goto out_invalid; } - if (dh->dccph_type >= DCCP_PKT_INVALID) { + type = dh->dccph_type; + if (type >= DCCP_PKT_INVALID) { msg = "nf_ct_dccp: reserved packet type "; goto out_invalid; } + + if (test_bit(type, &require_seq48) && !dh->dccph_x) { + msg = "nf_ct_dccp: type lacks 48bit sequence numbers"; + goto out_invalid; + } + return false; out_invalid: nf_l4proto_log_invalid(skb, state->net, state->pf, @@ -468,24 +485,53 @@ out_invalid: return true; } +struct nf_conntrack_dccp_buf { + struct dccp_hdr dh; /* generic header part */ + struct dccp_hdr_ext ext; /* optional depending dh->dccph_x */ + union { /* depends on header type */ + struct dccp_hdr_ack_bits ack; + struct dccp_hdr_request req; + struct dccp_hdr_response response; + struct dccp_hdr_reset rst; + } u; +}; + +static struct dccp_hdr * +dccp_header_pointer(const struct sk_buff *skb, int offset, const struct dccp_hdr *dh, + struct nf_conntrack_dccp_buf *buf) +{ + unsigned int hdrlen = __dccp_hdr_len(dh); + + if (hdrlen > sizeof(*buf)) + return NULL; + + return skb_header_pointer(skb, offset, hdrlen, buf); +} + int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct dccp_hdr _dh, *dh; + struct nf_conntrack_dccp_buf _dh; u_int8_t type, old_state, new_state; enum ct_dccp_roles role; unsigned int *timeouts; + struct dccp_hdr *dh; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh); if (!dh) return NF_DROP; if (dccp_error(dh, skb, dataoff, state)) return -NF_ACCEPT; + /* pull again, including possible 48 bit sequences and subtype header */ + dh = dccp_header_pointer(skb, dataoff, dh, &_dh); + if (!dh) + return NF_DROP; + type = dh->dccph_type; if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh)) return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 6f9144e1f1c1..ee45dbf1b035 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -128,6 +128,56 @@ static void icmpv6_error_log(const struct sk_buff *skb, IPPROTO_ICMPV6, "%s", msg); } +static noinline_for_stack int +nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) +{ + u8 hl = ipv6_hdr(skb)->hop_limit; + union nf_inet_addr outer_daddr; + union { + struct nd_opt_hdr nd_opt; + struct rd_msg rd_msg; + } tmp; + const struct nd_opt_hdr *nd_opt; + const struct rd_msg *rd_msg; + + rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); + if (!rd_msg) { + icmpv6_error_log(skb, state, "short redirect"); + return -NF_ACCEPT; + } + + if (rd_msg->icmph.icmp6_code != 0) + return NF_ACCEPT; + + if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); + return -NF_ACCEPT; + } + + dataoff += sizeof(*rd_msg); + + /* warning: rd_msg no longer usable after this call */ + nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); + if (!nd_opt || nd_opt->nd_opt_len == 0) { + icmpv6_error_log(skb, state, "redirect without options"); + return -NF_ACCEPT; + } + + /* We could call ndisc_parse_options(), but it would need + * skb_linearize() and a bit more work. + */ + if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) + return NF_ACCEPT; + + memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, + sizeof(outer_daddr.ip6)); + dataoff += 8; + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMPV6, &outer_daddr); +} + int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -158,6 +208,9 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, return NF_ACCEPT; } + if (icmp6h->icmp6_type == NDISC_REDIRECT) + return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); + /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 7626f3e1c70a..6b2a215b2786 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -27,22 +27,16 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more - closely. They're more complex. --RR - - And so for me for SCTP :D -Kiran */ - static const char *const sctp_conntrack_names[] = { - "NONE", - "CLOSED", - "COOKIE_WAIT", - "COOKIE_ECHOED", - "ESTABLISHED", - "SHUTDOWN_SENT", - "SHUTDOWN_RECD", - "SHUTDOWN_ACK_SENT", - "HEARTBEAT_SENT", - "HEARTBEAT_ACKED", + [SCTP_CONNTRACK_NONE] = "NONE", + [SCTP_CONNTRACK_CLOSED] = "CLOSED", + [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", + [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", + [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", + [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", + [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", + [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; #define SECS * HZ @@ -54,12 +48,11 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = 10 SECS, [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, - [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS, - [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, - [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, + [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, + [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, + [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, - [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 @@ -73,7 +66,6 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT -#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED #define sIV SCTP_CONNTRACK_MAX /* @@ -96,9 +88,6 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. -HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to - that of the HEARTBEAT chunk. Secondary connection is - established. */ /* TODO @@ -115,33 +104,33 @@ cookie echoed to closed. static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, -/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ +/* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, -/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, +/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; @@ -310,7 +299,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; - } else { + } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", @@ -412,24 +401,34 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { - /* Sec 8.5.1 (A) */ + /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { - /* Sec 8.5.1 (B) */ - if (sh->vtag != ct->proto.sctp.vtag[dir] && - sh->vtag != ct->proto.sctp.vtag[!dir]) + /* (B) vtag MUST match own vtag if T flag is unset OR + * MUST match peer's vtag if T flag is set + */ + if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[dir]) || + ((sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { - /* Sec 8.5.1 (C) */ - if (sh->vtag != ct->proto.sctp.vtag[dir] && - sh->vtag != ct->proto.sctp.vtag[!dir] && - sch->flags & SCTP_CHUNK_FLAG_T) + /* (C) vtag MUST match own vtag if T flag is unset OR + * MUST match peer's vtag if T flag is set + */ + if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[dir]) || + ((sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { - /* Sec 8.5.1 (D) */ + /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_COOKIE_ACK) { + ct->proto.sctp.init[dir] = 0; + ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); @@ -478,16 +477,18 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT || - sch->type == SCTP_CID_INIT_ACK) { - struct sctp_inithdr _inithdr, *ih; + if (sch->type == SCTP_CID_INIT) { + struct sctp_inithdr _ih, *ih; - ih = skb_header_pointer(skb, offset + sizeof(_sch), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) goto out_unlock; - pr_debug("Setting vtag %x for dir %d\n", - ih->init_tag, !dir); + + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) + ct->proto.sctp.init[!dir] = 0; + ct->proto.sctp.init[dir] = 1; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so @@ -498,11 +499,33 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; + } else if (sch->type == SCTP_CID_INIT_ACK) { + struct sctp_inithdr _ih, *ih; + __be32 vtag; + + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) + goto out_unlock; + + vtag = ct->proto.sctp.vtag[!dir]; + if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) + goto out_unlock; + /* collision */ + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && + vtag != ih->init_tag) + goto out_unlock; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; - if (old_state != new_state) + if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + if (new_state == SCTP_CONNTRACK_ESTABLISHED && + !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } } spin_unlock_bh(&ct->lock); @@ -516,14 +539,6 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); - if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && - dir == IP_CT_DIR_REPLY && - new_state == SCTP_CONNTRACK_ESTABLISHED) { - pr_debug("Setting assured bit\n"); - set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_ASSURED, ct); - } - return NF_ACCEPT; out_unlock: diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b8cc3339a249..aed967e2f30f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1158,6 +1158,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } + + if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) { + /* do not renew timeout on SYN retransmit. + * + * Else port reuse by client or NAT middlebox can keep + * entry alive indefinitely (including nat info). + */ + return NF_ACCEPT; + } + /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection * pickup with loose=1. Avoid large ESTABLISHED timeout. */ diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index b83dc9bf0a5d..751df19fe0f8 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -477,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, return ret; if (ret == 0) break; - dataoff += *matchoff; + dataoff = *matchoff; } *in_header = 0; } @@ -489,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, break; if (ret == 0) return ret; - dataoff += *matchoff; + dataoff = *matchoff; } if (in_header) @@ -611,7 +611,7 @@ int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, start += strlen(name); *val = simple_strtoul(start, &end, 0); if (start == end) - return 0; + return -1; if (matchoff && matchlen) { *matchoff = start - dptr; *matchlen = end - start; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index a3faeacaa1cb..1e3dbed9d784 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -581,7 +581,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, - NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, @@ -851,12 +850,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { - .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { @@ -985,7 +978,6 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, XASSIGN(SHUTDOWN_RECD, sn); XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); - XASSIGN(HEARTBEAT_ACKED, sn); #undef XASSIGN #endif } @@ -1188,11 +1180,12 @@ static int __init nf_conntrack_standalone_init(void) nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif + nf_conntrack_init_end(); + ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; - nf_conntrack_init_end(); return 0; out_pernet: diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index bb25d4c794c7..e25dbeb220b4 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -203,11 +203,12 @@ void nf_logger_put(int pf, enum nf_log_type type) return; } - BUG_ON(loggers[pf][type] == NULL); - rcu_read_lock(); logger = rcu_dereference(loggers[pf][type]); - module_put(logger->me); + if (!logger) + WARN_ON_ONCE(1); + else + module_put(logger->me); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_logger_put); diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index f91579c821e9..5b37487d9d11 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -10,6 +10,7 @@ #include <linux/if.h> #include <linux/inetdevice.h> +#include <linux/in.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/netdevice.h> @@ -24,81 +25,104 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_redirect.h> +static unsigned int +nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range, + const union nf_inet_addr *newdst) +{ + struct nf_nat_range2 newrange; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + + memset(&newrange, 0, sizeof(newrange)); + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr = *newdst; + newrange.max_addr = *newdst; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} + unsigned int -nf_nat_redirect_ipv4(struct sk_buff *skb, - const struct nf_nat_ipv4_multi_range_compat *mr, +nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - __be32 newdst; - struct nf_nat_range2 newrange; + union nf_inet_addr newdst = {}; WARN_ON(hooknum != NF_INET_PRE_ROUTING && hooknum != NF_INET_LOCAL_OUT); - ct = nf_ct_get(skb, &ctinfo); - WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); - /* Local packets: make them go to loopback */ if (hooknum == NF_INET_LOCAL_OUT) { - newdst = htonl(0x7F000001); + newdst.ip = htonl(INADDR_LOOPBACK); } else { const struct in_device *indev; - newdst = 0; - indev = __in_dev_get_rcu(skb->dev); if (indev) { const struct in_ifaddr *ifa; ifa = rcu_dereference(indev->ifa_list); if (ifa) - newdst = ifa->ifa_local; + newdst.ip = ifa->ifa_local; } - if (!newdst) + if (!newdst.ip) return NF_DROP; } - /* Transfer from original range. */ - memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); - memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); - newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.ip = newdst; - newrange.max_addr.ip = newdst; - newrange.min_proto = mr->range[0].min; - newrange.max_proto = mr->range[0].max; - - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); + return nf_nat_redirect(skb, range, &newdst); } EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; +static bool nf_nat_redirect_ipv6_usable(const struct inet6_ifaddr *ifa, unsigned int scope) +{ + unsigned int ifa_addr_type = ipv6_addr_type(&ifa->addr); + + if (ifa_addr_type & IPV6_ADDR_MAPPED) + return false; + + if ((ifa->flags & IFA_F_TENTATIVE) && (!(ifa->flags & IFA_F_OPTIMISTIC))) + return false; + + if (scope) { + unsigned int ifa_scope = ifa_addr_type & IPV6_ADDR_SCOPE_MASK; + + if (!(scope & ifa_scope)) + return false; + } + + return true; +} + unsigned int nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_nat_range2 newrange; - struct in6_addr newdst; - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; + union nf_inet_addr newdst = {}; - ct = nf_ct_get(skb, &ctinfo); if (hooknum == NF_INET_LOCAL_OUT) { - newdst = loopback_addr; + newdst.in6 = loopback_addr; } else { + unsigned int scope = ipv6_addr_scope(&ipv6_hdr(skb)->daddr); struct inet6_dev *idev; - struct inet6_ifaddr *ifa; bool addr = false; idev = __in6_dev_get(skb->dev); if (idev != NULL) { + const struct inet6_ifaddr *ifa; + read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { - newdst = ifa->addr; + if (!nf_nat_redirect_ipv6_usable(ifa, scope)) + continue; + + newdst.in6 = ifa->addr; addr = true; break; } @@ -109,12 +133,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, return NF_DROP; } - newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.in6 = newdst; - newrange.max_addr.in6 = newdst; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; - - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); + return nf_nat_redirect(skb, range, &newdst); } EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58a7d89719b1..8131d858f38d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -20,15 +20,22 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/sock.h> #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) +#define NFT_SET_MAX_ANONLEN 16 + +unsigned int nf_tables_net_id __read_mostly; +EXPORT_SYMBOL_GPL(nf_tables_net_id); static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); static u64 table_handle; enum { @@ -67,7 +74,9 @@ static const struct rhashtable_params nft_objname_ht_params = { static void nft_validate_state_update(struct net *net, u8 new_validate_state) { - switch (net->nft.validate_state) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + switch (nft_net->validate_state) { case NFT_VALIDATE_SKIP: WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO); break; @@ -78,11 +87,14 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) return; } - net->nft.validate_state = new_validate_state; + nft_net->validate_state = new_validate_state; } static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -113,6 +125,8 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, if (trans == NULL) return NULL; + INIT_LIST_HEAD(&trans->list); + INIT_LIST_HEAD(&trans->binding_list); trans->msg_type = msg_type; trans->ctx = *ctx; @@ -125,34 +139,67 @@ static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); } -static void nft_trans_destroy(struct nft_trans *trans) +static void nft_trans_list_del(struct nft_trans *trans) { list_del(&trans->list); + list_del(&trans->binding_list); +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + nft_trans_list_del(trans); kfree(trans); } -static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set, + bool bind) { + struct nftables_pernet *nft_net; struct net *net = ctx->net; struct nft_trans *trans; if (!nft_set_is_anonymous(set)) return; - list_for_each_entry_reverse(trans, &net->nft.commit_list, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) - nft_trans_set_bound(trans) = true; + nft_trans_set_bound(trans) = bind; break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set(trans) == set) - nft_trans_elem_set_bound(trans) = true; + nft_trans_elem_set_bound(trans) = bind; break; } } } +static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, true); +} + +static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, false); +} + +static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) +{ + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (nft_set_is_anonymous(nft_trans_set(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; + } + + list_add_tail(&trans->list, &nft_net->commit_list); +} + static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) @@ -173,9 +220,10 @@ static int nf_tables_register_hook(struct net *net, return nf_register_net_hook(net, ops); } -static void nf_tables_unregister_hook(struct net *net, - const struct nft_table *table, - struct nft_chain *chain) +static void __nf_tables_unregister_hook(struct net *net, + const struct nft_table *table, + struct nft_chain *chain, + bool release_netdev) { const struct nft_base_chain *basechain; const struct nf_hook_ops *ops; @@ -190,6 +238,16 @@ static void nf_tables_unregister_hook(struct net *net, return basechain->type->ops_unregister(net, ops); nf_unregister_net_hook(net, ops); + if (release_netdev && + table->family == NFPROTO_NETDEV) + nft_base_chain(chain)->ops.dev = NULL; +} + +static void nf_tables_unregister_hook(struct net *net, + const struct nft_table *table, + struct nft_chain *chain) +{ + __nf_tables_unregister_hook(net, table, chain, false); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -203,7 +261,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) if (msg_type == NFT_MSG_NEWTABLE) nft_activate_next(ctx->net, ctx->table); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -230,7 +288,7 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) if (msg_type == NFT_MSG_NEWCHAIN) nft_activate_next(ctx->net, ctx->chain); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } @@ -242,7 +300,7 @@ static int nft_delchain(struct nft_ctx *ctx) if (IS_ERR(trans)) return PTR_ERR(trans); - ctx->table->use--; + nft_use_dec(&ctx->table->use); nft_deactivate_next(ctx->net, ctx->chain); return 0; @@ -283,7 +341,7 @@ nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) /* You cannot delete the same rule twice */ if (nft_is_active_next(ctx->net, rule)) { nft_deactivate_next(ctx->net, rule); - ctx->chain->use--; + nft_use_dec(&ctx->chain->use); return 0; } return -ENOENT; @@ -303,7 +361,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); } nft_trans_rule(trans) = rule; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } @@ -358,11 +416,32 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); + + return 0; +} + +static int nft_mapelem_deactivate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_deactivate(ctx->net, set, elem); return 0; } +static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_deactivate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -371,8 +450,11 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + nft_deactivate_next(ctx->net, set); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } @@ -390,7 +472,7 @@ static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, obj); nft_trans_obj(trans) = obj; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -404,7 +486,7 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) return err; nft_deactivate_next(ctx->net, obj); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } @@ -423,7 +505,7 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, flowtable); nft_trans_flowtable(trans) = flowtable; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -438,7 +520,7 @@ static int nft_delflowtable(struct nft_ctx *ctx, return err; nft_deactivate_next(ctx->net, flowtable); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } @@ -451,13 +533,15 @@ static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, u8 family, u8 genmask) { + struct nftables_pernet *nft_net; struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(table, &net->nft.tables, list, - lockdep_is_held(&net->nft.commit_mutex)) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry_rcu(table, &nft_net->tables, list, + lockdep_is_held(&nft_net->commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) @@ -471,9 +555,11 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, u8 genmask) { + struct nftables_pernet *nft_net; struct nft_table *table; - list_for_each_entry(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && nft_active_genmask(table, genmask)) return table; @@ -525,6 +611,7 @@ struct nft_module_request { static int nft_request_module(struct net *net, const char *fmt, ...) { char module_name[MODULE_NAME_LEN]; + struct nftables_pernet *nft_net; struct nft_module_request *req; va_list args; int ret; @@ -535,7 +622,8 @@ static int nft_request_module(struct net *net, const char *fmt, ...) if (ret >= MODULE_NAME_LEN) return 0; - list_for_each_entry(req, &net->nft.module_list, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry(req, &nft_net->module_list, list) { if (!strcmp(req->module, module_name)) { if (req->done) return 0; @@ -551,7 +639,7 @@ static int nft_request_module(struct net *net, const char *fmt, ...) req->done = false; strlcpy(req->module, module_name, MODULE_NAME_LEN); - list_add_tail(&req->list, &net->nft.module_list); + list_add_tail(&req->list, &nft_net->module_list); return -EAGAIN; } @@ -587,6 +675,13 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } +static __be16 nft_base_seq(const struct net *net) +{ + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + return htons(nft_net->base_seq & 0xffff); +} + static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, @@ -599,20 +694,16 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, int family, const struct nft_table *table) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || - nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || + nla_put_be32(skb, NFTA_TABLE_FLAGS, + htonl(table->flags & NFT_TABLE_F_MASK)) || nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), NFTA_TABLE_PAD)) @@ -657,15 +748,17 @@ static int nf_tables_dump_tables(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nftables_pernet *nft_net; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -769,7 +862,7 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) if (cnt && i++ == cnt) break; - nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); + nf_tables_unregister_hook(net, table, chain); } } @@ -784,7 +877,7 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table) if (!nft_is_base_chain(chain)) continue; - err = nf_register_net_hook(net, &nft_base_chain(chain)->ops); + err = nf_tables_register_hook(net, table, chain); if (err < 0) goto err; @@ -799,14 +892,40 @@ err: static void nf_tables_table_disable(struct net *net, struct nft_table *table) { + table->flags &= ~NFT_TABLE_F_DORMANT; nft_table_disable(net, table, 0); + table->flags |= NFT_TABLE_F_DORMANT; +} + +#define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1) +#define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0) +#define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1) +#define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \ + __NFT_TABLE_F_WAS_AWAKEN) + +static bool nft_table_pending_update(const struct nft_ctx *ctx) +{ + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nft_trans *trans; + + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return true; + + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->ctx.table == ctx->table && + trans->msg_type == NFT_MSG_DELCHAIN && + nft_is_base_chain(trans->ctx.chain)) + return true; + } + + return false; } static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; u32 flags; - int ret = 0; + int ret; if (!ctx->nla[NFTA_TABLE_FLAGS]) return 0; @@ -815,9 +934,13 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if (flags & ~NFT_TABLE_F_DORMANT) return -EINVAL; - if (flags == ctx->table->flags) + if (flags == (ctx->table->flags & NFT_TABLE_F_MASK)) return 0; + /* No dormant off/on/off/on games in single transaction */ + if (nft_table_pending_update(ctx)) + return -EINVAL; + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) @@ -825,22 +948,28 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if ((flags & NFT_TABLE_F_DORMANT) && !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { - nft_trans_table_enable(trans) = false; + ctx->table->flags |= NFT_TABLE_F_DORMANT; + if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) + ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(ctx->net, ctx->table); - if (ret >= 0) { - ctx->table->flags &= ~NFT_TABLE_F_DORMANT; - nft_trans_table_enable(trans) = true; + ctx->table->flags &= ~NFT_TABLE_F_DORMANT; + if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) { + ret = nf_tables_table_enable(ctx->net, ctx->table); + if (ret < 0) + goto err_register_hooks; + + ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT; } } - if (ret < 0) - goto err; nft_trans_table_update(trans) = true; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); + return 0; -err: + +err_register_hooks: + ctx->table->flags |= NFT_TABLE_F_DORMANT; nft_trans_destroy(trans); return ret; } @@ -896,11 +1025,36 @@ static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg, return strcmp(obj->key.name, k->name); } +static bool nft_supported_family(u8 family) +{ + return false +#ifdef CONFIG_NF_TABLES_INET + || family == NFPROTO_INET +#endif +#ifdef CONFIG_NF_TABLES_IPV4 + || family == NFPROTO_IPV4 +#endif +#ifdef CONFIG_NF_TABLES_ARP + || family == NFPROTO_ARP +#endif +#ifdef CONFIG_NF_TABLES_NETDEV + || family == NFPROTO_NETDEV +#endif +#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE) + || family == NFPROTO_BRIDGE +#endif +#ifdef CONFIG_NF_TABLES_IPV6 + || family == NFPROTO_IPV6 +#endif + ; +} + static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -910,7 +1064,10 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int err; - lockdep_assert_held(&net->nft.commit_mutex); + if (!nft_supported_family(family)) + return -EOPNOTSUPP; + + lockdep_assert_held(&nft_net->commit_mutex); attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask); if (IS_ERR(table)) { @@ -960,7 +1117,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (err < 0) goto err_trans; - list_add_tail_rcu(&table->list, &net->nft.tables); + list_add_tail_rcu(&table->list, &nft_net->tables); return 0; err_trans: rhltable_destroy(&table->chains_ht); @@ -995,8 +1152,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, set)) continue; - if (nft_set_is_anonymous(set) && - !list_empty(&set->bindings)) + if (nft_set_is_anonymous(set)) continue; err = nft_delset(ctx, set); @@ -1040,11 +1196,12 @@ out: static int nft_flush(struct nft_ctx *ctx, int family) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct nft_table *table, *nt; const struct nlattr * const *nla = ctx->nla; int err = 0; - list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) { + list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (family != AF_UNSPEC && table->family != family) continue; @@ -1158,7 +1315,9 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) static bool lockdep_commit_lock_is_held(const struct net *net) { #ifdef CONFIG_PROVE_LOCKING - return lockdep_is_held(&net->nft.commit_mutex); + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + return lockdep_is_held(&nft_net->commit_mutex); #else return true; #endif @@ -1262,18 +1421,13 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, const struct nft_chain *chain) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), @@ -1366,11 +1520,13 @@ static int nf_tables_dump_chains(struct sk_buff *skb, unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -1556,12 +1712,13 @@ static int nft_chain_parse_hook(struct net *net, struct nft_chain_hook *hook, u8 family, bool autoload) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; struct net_device *dev; int err; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX, @@ -1662,13 +1819,13 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_rule **rules; int err; - if (table->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; struct nf_hook_ops *ops; + if (table->flags & __NFT_TABLE_F_UPDATE) + return -EINVAL; + err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) return err; @@ -1741,6 +1898,11 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err < 0) goto err1; + if (!nft_use_inc(&table->use)) { + err = -EMFILE; + goto err_use; + } + err = rhltable_insert_key(&table->chains_ht, chain->name, &chain->rhlhead, nft_chain_ht_params); if (err) @@ -1758,11 +1920,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; - table->use++; list_add_tail_rcu(&chain->list, &table->chains); return 0; err2: + nft_use_dec_restore(&table->use); +err_use: nf_tables_unregister_hook(net, table, chain); err1: nf_tables_chain_destroy(ctx); @@ -1846,6 +2009,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct nft_trans *tmp; char *name; @@ -1855,7 +2019,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, goto err; err = -EEXIST; - list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) { + list_for_each_entry(tmp, &nft_net->commit_list, list) { if (tmp->msg_type == NFT_MSG_NEWCHAIN && tmp->ctx.table == table && nft_trans_chain_update(tmp) && @@ -1868,7 +2032,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nft_trans_chain_name(trans) = name; } - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err: @@ -1882,6 +2046,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -1893,7 +2058,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, u64 handle = 0; u32 flags = 0; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) { @@ -2074,7 +2239,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, { const struct nft_expr_type *type, *candidate = NULL; - list_for_each_entry(type, &nf_tables_expressions, list) { + list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; @@ -2106,9 +2271,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); + rcu_read_lock(); type = __nft_expr_type_get(family, nla); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -2352,20 +2521,15 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, const struct nft_rule *prule) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; const struct nft_expr *expr, *next; struct nlattr *list; u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, + nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) @@ -2486,11 +2650,13 @@ static int nf_tables_dump_rules(struct sk_buff *skb, unsigned int idx = 0; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -2707,12 +2873,15 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) err = nft_chain_validate(&ctx, chain); if (err < 0) return err; + + cond_resched(); } return 0; } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla); #define NFT_RULE_MAXEXPRS 128 @@ -2722,6 +2891,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); struct nft_expr_info *info = NULL; @@ -2739,7 +2909,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, int err, rem; u64 handle, pos_handle; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); if (IS_ERR(table)) { @@ -2775,9 +2945,6 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return -EINVAL; handle = nf_tables_alloc_handle(table); - if (chain->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); old_rule = __nft_rule_lookup(chain, pos_handle); @@ -2786,7 +2953,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(old_rule); } } else if (nla[NFTA_RULE_POSITION_ID]) { - old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]); + old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]); return PTR_ERR(old_rule); @@ -2859,16 +3026,21 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, expr = nft_expr_next(expr); } + if (!nft_use_inc(&chain->use)) { + err = -EMFILE; + goto err2; + } + if (nlh->nlmsg_flags & NLM_F_REPLACE) { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } err = nft_delrule(&ctx, old_rule); if (err < 0) { nft_trans_destroy(trans); - goto err2; + goto err_destroy_flow_rule; } list_add_tail_rcu(&rule->list, &old_rule->list); @@ -2876,7 +3048,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (!trans) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } if (nlh->nlmsg_flags & NLM_F_APPEND) { @@ -2892,9 +3064,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } } kvfree(info); - chain->use++; - if (net->nft.validate_state == NFT_VALIDATE_DO) + if (nft_net->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { @@ -2906,8 +3077,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } return 0; + +err_destroy_flow_rule: + nft_use_dec_restore(&chain->use); err2: - nf_tables_rule_release(&ctx, rule); + nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR); + nf_tables_rule_destroy(&ctx, rule); err1: for (i = 0; i < n; i++) { if (info[i].ops) { @@ -2921,15 +3096,18 @@ err1: } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { struct nft_rule *rule = nft_trans_rule(trans); if (trans->msg_type == NFT_MSG_NEWRULE && + trans->ctx.chain == chain && id == nft_trans_rule_id(trans)) return rule; } @@ -2976,7 +3154,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, err = nft_delrule(&ctx, rule); } else if (nla[NFTA_RULE_ID]) { - rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]); + rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]); return PTR_ERR(rule); @@ -3044,12 +3222,13 @@ nft_select_set_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, enum nft_set_policies policy) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; u32 flags = 0; - lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { @@ -3191,16 +3370,19 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, } static struct nft_set *nft_set_lookup_byid(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWSET) { struct nft_set *set = nft_trans_set(trans); if (id == nft_trans_set_id(trans) && + set->table == table && nft_active_genmask(set, genmask)) return set; } @@ -3221,7 +3403,7 @@ struct nft_set *nft_set_lookup_global(const struct net *net, if (!nla_set_id) return set; - set = nft_set_lookup_byid(net, nla_set_id, genmask); + set = nft_set_lookup_byid(net, table, nla_set_id, genmask); } return set; } @@ -3240,6 +3422,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; + if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN) + return -EINVAL; + inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (inuse == NULL) return -ENOMEM; @@ -3247,7 +3432,7 @@ cont: list_for_each_entry(i, &ctx->table->sets, list) { int tmp; - if (!nft_is_active_next(ctx->net, set)) + if (!nft_is_active_next(ctx->net, i)) continue; if (!sscanf(i->name, name, &tmp)) continue; @@ -3303,23 +3488,17 @@ __be64 nf_jiffies64_to_msecs(u64 input) static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *desc; u32 portid = ctx->portid; u32 seq = ctx->seq; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, + NFNETLINK_V0, nft_base_seq(ctx->net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) @@ -3415,14 +3594,16 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); struct nft_ctx *ctx = cb->data, ctx_set; + struct nftables_pernet *nft_net; if (cb->args[1]) return skb->len; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && ctx->family != table->family) continue; @@ -3613,6 +3794,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; + if ((flags & (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT | NFT_SET_EVAL)) == + (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; + if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) == + (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; } dtype = 0; @@ -3654,6 +3841,9 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout); if (err) return err; @@ -3662,6 +3852,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } @@ -3715,10 +3909,15 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (ops->privsize != NULL) size = ops->privsize(nla, &desc); + if (!nft_use_inc(&table->use)) { + err = -EMFILE; + goto err1; + } + set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); if (!set) { err = -ENOMEM; - goto err1; + goto err_alloc; } name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL); @@ -3739,6 +3938,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } INIT_LIST_HEAD(&set->bindings); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -3765,29 +3965,38 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, goto err4; list_add_tail_rcu(&set->list, &table->sets); - table->use++; + return 0; err4: - ops->destroy(set); + ops->destroy(&ctx, set); err3: kfree(set->name); err2: kvfree(set); +err_alloc: + nft_use_dec_restore(&table->use); err1: module_put(to_set_type(ops)->owner); return err; } -static void nft_set_destroy(struct nft_set *set) +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + +static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { if (WARN_ON(set->use > 0)) return; - set->ops->destroy(set); + set->ops->destroy(ctx, set); module_put(to_set_type(set->ops)->owner); - kfree(set->name); - kvfree(set); + nft_set_put(set); } static int nf_tables_delset(struct net *net, struct sock *nlsk, @@ -3833,6 +4042,12 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, return nft_delset(&ctx, set); } +static int nft_validate_register_store(const struct nft_ctx *ctx, + enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type, + unsigned int len); + static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, @@ -3854,9 +4069,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *i; struct nft_set_iter iter; - if (set->use == UINT_MAX) - return -EOVERFLOW; - if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; @@ -3881,10 +4093,12 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, return iter.err; } bind: + if (!nft_use_inc(&set->use)) + return -EMFILE; + binding->chain = ctx->chain; list_add_tail_rcu(&binding->list, &set->bindings); nft_set_trans_bind(ctx, set); - set->use++; return 0; } @@ -3897,23 +4111,81 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); + set->dead = 1; if (event) nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL); } } +static void nft_setelem_data_activate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + +static int nft_mapelem_activate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_activate(ctx->net, set, elem); + + return 0; +} + +static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_activate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); +} + +void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) +{ + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(ctx, set); + + nft_clear(ctx->net, set); + } + + nft_use_inc_restore(&set->use); +} +EXPORT_SYMBOL_GPL(nf_tables_activate_set); + void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase) { switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nft_set_trans_unbind(ctx, set); + if (nft_set_is_anonymous(set)) + nft_deactivate_next(ctx->net, set); + else + list_del_rcu(&binding->list); + + nft_use_dec(&set->use); + break; case NFT_TRANS_PREPARE: - set->use--; + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + + nft_deactivate_next(ctx->net, set); + } + nft_use_dec(&set->use); return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: - set->use--; + if (nft_set_is_anonymous(set) && + set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + + nft_use_dec(&set->use); /* fall through */ default: nf_tables_unbind_set(ctx, set, binding, @@ -3925,7 +4197,7 @@ EXPORT_SYMBOL_GPL(nf_tables_deactivate_set); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set) { if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) - nft_set_destroy(set); + nft_set_destroy(ctx, set); } EXPORT_SYMBOL_GPL(nf_tables_destroy_set); @@ -4095,8 +4367,12 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_set_elem *elem) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); struct nft_set_dump_args *args; + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) + return 0; + args = container_of(iter, struct nft_set_dump_args, iter); return nf_tables_fill_setelem(args->skb, set, elem); } @@ -4110,18 +4386,19 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); + struct nftables_pernet *nft_net; struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; bool set_found = false; - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; u32 portid, seq; int event; rcu_read_lock(); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && dump_ctx->ctx.family != table->family) continue; @@ -4147,16 +4424,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - NLM_F_MULTI); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, + table->family, NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = table->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) @@ -4213,22 +4485,16 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; int err; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, + NFNETLINK_V0, nft_base_seq(ctx->net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) @@ -4268,11 +4534,54 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; } +static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data *key, struct nlattr *attr) +{ + struct nft_data_desc desc; + int err; + + err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr); + if (err < 0) + return err; + + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { + nft_data_release(key, desc.type); + return -EINVAL; + } + + return 0; +} + +static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data_desc *desc, + struct nft_data *data, + struct nlattr *attr) +{ + u32 dtype; + int err; + + err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); + if (err < 0) + return err; + + if (set->dtype == NFT_DATA_VERDICT) + dtype = NFT_DATA_VERDICT; + else + dtype = NFT_DATA_VALUE; + + if (dtype != desc->type || + set->dlen != desc->len) { + nft_data_release(data, desc->type); + return -EINVAL; + } + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; - struct nft_data_desc desc; struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; @@ -4291,17 +4600,11 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) return err; - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { - nft_data_release(&elem.key.val, desc.type); - return err; - } - priv = set->ops->get(ctx->net, set, &elem, flags); if (IS_ERR(priv)) return PTR_ERR(priv); @@ -4448,6 +4751,7 @@ void *nft_set_elem_init(const struct nft_set *set, return elem; } +/* Drop references and destroy. Called from gc, dynset and abort path. */ void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) { @@ -4471,16 +4775,16 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } } if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use--; + nft_use_dec(&(*nft_set_ext_obj(ext))->use); kfree(elem); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); -/* Only called from commit path, nft_set_elem_deactivate() already deals with - * the refcounting from the preparation phase. +/* Destroy element. References have been already dropped in the preparation + * path via nft_setelem_data_deactivate(). */ -static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem) +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); @@ -4488,20 +4792,20 @@ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext)); kfree(elem); } +EXPORT_SYMBOL_GPL(nf_tables_set_elem_destroy); static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); - struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_object *obj = NULL; struct nft_userdata *udata; - struct nft_data data; + struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; u32 flags = 0; @@ -4535,6 +4839,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return -EINVAL; } + if (set->flags & NFT_SET_OBJECT) { + if (!nla[NFTA_SET_ELEM_OBJREF] && + !(flags & NFT_SET_ELEM_INTERVAL_END)) + return -EINVAL; + } else { + if (nla[NFTA_SET_ELEM_OBJREF]) + return -EINVAL; + } + if ((flags & NFT_SET_ELEM_INTERVAL_END) && (nla[NFTA_SET_ELEM_DATA] || nla[NFTA_SET_ELEM_OBJREF] || @@ -4566,15 +4879,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; - err = -EINVAL; - if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) - goto err2; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); if (timeout > 0) { nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); if (timeout != set->timeout) @@ -4582,30 +4892,30 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { - if (!(set->flags & NFT_SET_OBJECT)) { - err = -EINVAL; - goto err2; - } obj = nft_obj_lookup(ctx->net, ctx->table, nla[NFTA_SET_ELEM_OBJREF], set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); + obj = NULL; goto err2; } + + if (!nft_use_inc(&obj->use)) { + err = -EMFILE; + obj = NULL; + goto err2; + } + nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); } if (nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &data, sizeof(data), &d2, - nla[NFTA_SET_ELEM_DATA]); + err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val, + nla[NFTA_SET_ELEM_DATA]); if (err < 0) goto err2; - err = -EINVAL; - if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen) - goto err3; - dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { @@ -4619,19 +4929,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, continue; err = nft_validate_register_store(&bind_ctx, dreg, - &data, - d2.type, d2.len); + &elem.data.val, + desc.type, desc.len); if (err < 0) goto err3; - if (d2.type == NFT_DATA_VERDICT && - (data.verdict.code == NFT_GOTO || - data.verdict.code == NFT_JUMP)) + if (desc.type == NFT_DATA_VERDICT && + (elem.data.val.verdict.code == NFT_GOTO || + elem.data.val.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->net, NFT_VALIDATE_NEED); } - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); } /* The full maximum length of userdata can exceed the maximum @@ -4647,8 +4957,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, - timeout, expiration, GFP_KERNEL); + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, + elem.data.val.data, timeout, expiration, + GFP_KERNEL); if (elem.priv == NULL) goto err3; @@ -4660,16 +4971,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } - if (obj) { + if (obj) *nft_set_ext_obj(ext) = obj; - obj->use++; - } trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) goto err4; - ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; + ext->genmask = nft_genmask_cur(ctx->net); + err = set->ops->insert(ctx->net, set, &elem, &ext2); if (err) { if (err == -EEXIST) { @@ -4701,7 +5011,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } nft_trans_elem(trans) = elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err6: @@ -4709,14 +5019,15 @@ err6: err5: kfree(trans); err4: - if (obj) - obj->use--; kfree(elem.priv); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_release(&data, d2.type); + nft_data_release(&elem.data.val, desc.type); err2: - nft_data_release(&elem.key.val, d1.type); + if (obj) + nft_use_dec_restore(&obj->use); + + nft_data_release(&elem.key.val, NFT_DATA_VALUE); err1: return err; } @@ -4726,6 +5037,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; @@ -4745,7 +5057,8 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { @@ -4754,7 +5067,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return err; } - if (net->nft.validate_state == NFT_VALIDATE_DO) + if (nft_net->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, ctx.table); return 0; @@ -4773,46 +5086,49 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, */ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { + struct nft_chain *chain; + if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use++; + chain = data->verdict.chain; + nft_use_inc_restore(&chain->use); break; } } } -static void nft_set_elem_activate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem) +static void nft_setelem_data_activate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_hold(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use++; + nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } -static void nft_set_elem_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem) +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use--; + nft_use_dec(&(*nft_set_ext_obj(ext))->use); } +EXPORT_SYMBOL_GPL(nft_setelem_data_deactivate); static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_ext_tmpl tmpl; - struct nft_data_desc desc; struct nft_set_elem elem; struct nft_set_ext *ext; struct nft_trans *trans; @@ -4823,11 +5139,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) - goto err1; + return err; - err = -EINVAL; if (nla[NFTA_SET_ELEM_KEY] == NULL) - goto err1; + return -EINVAL; nft_set_ext_prepare(&tmpl); @@ -4837,54 +5152,47 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (flags != 0) nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) - goto err1; - - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) - goto err2; + return err; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, 0, GFP_KERNEL); if (elem.priv == NULL) - goto err2; + goto fail_elem; ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); - if (trans == NULL) { - err = -ENOMEM; - goto err3; - } + if (trans == NULL) + goto fail_trans; priv = set->ops->deactivate(ctx->net, set, &elem); if (priv == NULL) { err = -ENOENT; - goto err4; + goto fail_ops; } kfree(elem.priv); elem.priv = priv; - nft_set_elem_deactivate(ctx->net, set, &elem); + nft_setelem_data_deactivate(ctx->net, set, &elem); nft_trans_elem(trans) = elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; -err4: +fail_ops: kfree(trans); -err3: +fail_trans: kfree(elem.priv); -err2: - nft_data_release(&elem.key.val, desc.type); -err1: +fail_elem: + nft_data_release(&elem.key.val, NFT_DATA_VALUE); return err; } @@ -4907,10 +5215,10 @@ static int nft_flush_set(const struct nft_ctx *ctx, } set->ndeact++; - nft_set_elem_deactivate(ctx->net, set, elem); + nft_setelem_data_deactivate(ctx->net, set, elem); nft_trans_elem_set(trans) = set; nft_trans_elem(trans) = *elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err1: @@ -4937,7 +5245,11 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + + if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT)) return -EBUSY; if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) { @@ -4960,31 +5272,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, return err; } -void nft_set_gc_batch_release(struct rcu_head *rcu) -{ - struct nft_set_gc_batch *gcb; - unsigned int i; - - gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); - for (i = 0; i < gcb->head.cnt; i++) - nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); - kfree(gcb); -} -EXPORT_SYMBOL_GPL(nft_set_gc_batch_release); - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp) -{ - struct nft_set_gc_batch *gcb; - - gcb = kzalloc(sizeof(*gcb), gfp); - if (gcb == NULL) - return gcb; - gcb->head.set = set; - return gcb; -} -EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc); - /* * Stateful objects */ @@ -5207,7 +5494,7 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, nft_trans_obj(trans) = obj; nft_trans_obj_update(trans) = true; nft_trans_obj_newobj(trans) = newobj; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; @@ -5268,9 +5555,14 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + if (!nft_use_inc(&table->use)) + return -EMFILE; + type = nft_obj_type_get(net, objtype); - if (IS_ERR(type)) - return PTR_ERR(type); + if (IS_ERR(type)) { + err = PTR_ERR(type); + goto err_type; + } obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { @@ -5296,7 +5588,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, goto err4; list_add_tail_rcu(&obj->list, &table->objects); - table->use++; + return 0; err4: /* queued in transaction log */ @@ -5310,6 +5602,9 @@ err2: kfree(obj); err1: module_put(type->owner); +err_type: + nft_use_dec_restore(&table->use); + return err; } @@ -5318,19 +5613,14 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, int family, const struct nft_table *table, struct nft_object *obj, bool reset) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) || nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || @@ -5361,6 +5651,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) struct nft_obj_filter *filter = cb->data; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; struct nft_object *obj; bool reset = false; @@ -5368,9 +5659,10 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) reset = true; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -5653,10 +5945,11 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, enum nft_trans_phase phase) { switch (phase) { + case NFT_TRANS_PREPARE_ERROR: case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: - flowtable->use--; + nft_use_dec(&flowtable->use); /* fall through */ default: return; @@ -5773,11 +6066,12 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, return err; } +/* call under rcu_read_lock */ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; - list_for_each_entry(type, &nf_tables_flowtables, list) { + list_for_each_entry_rcu(type, &nf_tables_flowtables, list) { if (family == type->family) return type; } @@ -5789,9 +6083,13 @@ nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; + rcu_read_lock(); type = __nft_flowtable_type_get(family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -5803,8 +6101,9 @@ nft_flowtable_type_get(struct net *net, u8 family) return ERR_PTR(-ENOENT); } -static void nft_unregister_flowtable_net_hooks(struct net *net, - struct nft_flowtable *flowtable) +static void __nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, + bool release_netdev) { int i; @@ -5813,9 +6112,17 @@ static void nft_unregister_flowtable_net_hooks(struct net *net, continue; nf_unregister_net_hook(net, &flowtable->ops[i]); + if (release_netdev) + flowtable->ops[i].dev = NULL; } } +static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable) +{ + __nft_unregister_flowtable_net_hooks(net, flowtable, false); +} + static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -5862,9 +6169,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + if (!nft_use_inc(&table->use)) + return -EMFILE; + flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL); - if (!flowtable) - return -ENOMEM; + if (!flowtable) { + err = -ENOMEM; + goto flowtable_alloc; + } flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); @@ -5896,6 +6208,9 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, continue; list_for_each_entry(ft, &table->flowtables, list) { + if (!nft_is_active_next(net, ft)) + continue; + for (k = 0; k < ft->ops_len; k++) { if (!ft->ops[k].dev) continue; @@ -5918,7 +6233,6 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err6; list_add_tail_rcu(&flowtable->list, &table->flowtables); - table->use++; return 0; err6: @@ -5936,6 +6250,9 @@ err2: kfree(flowtable->name); err1: kfree(flowtable); +flowtable_alloc: + nft_use_dec_restore(&table->use); + return err; } @@ -5993,20 +6310,15 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, struct nft_flowtable *flowtable) { struct nlattr *nest, *nest_devs; - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || @@ -6055,13 +6367,15 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; struct nft_flowtable *flowtable; const struct nft_table *table; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -6231,21 +6545,17 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -6278,6 +6588,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_flowtable *flowtable; + struct nftables_pernet *nft_net; struct nft_table *table; struct net *net; @@ -6285,13 +6596,14 @@ static int nf_tables_flowtable_event(struct notifier_block *this, return 0; net = dev_net(dev); - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { nft_flowtable_event(event, dev, flowtable); } } - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } @@ -6472,19 +6784,22 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { static int nf_tables_validate(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_table *table; - switch (net->nft.validate_state) { + switch (nft_net->validate_state) { case NFT_VALIDATE_SKIP: break; case NFT_VALIDATE_NEED: nft_validate_state_update(net, NFT_VALIDATE_DO); /* fall through */ case NFT_VALIDATE_DO: - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_validate(net, table) < 0) return -EAGAIN; } + + nft_validate_state_update(net, NFT_VALIDATE_SKIP); break; } @@ -6570,13 +6885,10 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_DELRULE: - if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); - nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: - nft_set_destroy(nft_trans_set(trans)); + nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: nf_tables_set_elem_destroy(&trans->ctx, @@ -6612,7 +6924,7 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) synchronize_rcu(); list_for_each_entry_safe(trans, next, &head, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nft_commit_release(trans); } } @@ -6656,9 +6968,10 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha static void nf_tables_commit_chain_prepare_cancel(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { struct nft_chain *chain = trans->ctx.chain; if (trans->msg_type == NFT_MSG_NEWRULE || @@ -6754,12 +7067,204 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, + struct nft_trans_gc *trans) +{ + void **priv = trans->priv; + unsigned int i; + + for (i = 0; i < trans->count; i++) { + struct nft_set_elem elem = { + .priv = priv[i], + }; + + nft_setelem_data_deactivate(ctx->net, trans->set, &elem); + trans->set->ops->remove(trans->net, trans->set, &elem); + } +} + +void nft_trans_gc_destroy(struct nft_trans_gc *trans) +{ + nft_set_put(trans->set); + put_net(trans->net); + kfree(trans); +} +EXPORT_SYMBOL_GPL(nft_trans_gc_destroy); + +static void nft_trans_gc_trans_free(struct rcu_head *rcu) +{ + struct nft_set_elem elem = {}; + struct nft_trans_gc *trans; + struct nft_ctx ctx = {}; + unsigned int i; + + trans = container_of(rcu, struct nft_trans_gc, rcu); + ctx.net = read_pnet(&trans->set->net); + + for (i = 0; i < trans->count; i++) { + elem.priv = trans->priv[i]; + atomic_dec(&trans->set->nelems); + + nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); + } + + nft_trans_gc_destroy(trans); +} + +static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) +{ + struct nftables_pernet *nft_net; + struct nft_ctx ctx = {}; + + nft_net = net_generic(trans->net, nf_tables_net_id); + + mutex_lock(&nft_net->commit_mutex); + + /* Check for race with transaction, otherwise this batch refers to + * stale objects that might not be there anymore. Skip transaction if + * set has been destroyed from control plane transaction in case gc + * worker loses race. + */ + if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { + mutex_unlock(&nft_net->commit_mutex); + return false; + } + + ctx.net = trans->net; + ctx.table = trans->set->table; + + nft_trans_gc_setelem_remove(&ctx, trans); + mutex_unlock(&nft_net->commit_mutex); + + return true; +} + +static void nft_trans_gc_work(struct work_struct *work) +{ + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + + spin_lock(&nf_tables_gc_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); + if (!nft_trans_gc_work_done(trans)) { + nft_trans_gc_destroy(trans); + continue; + } + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp) +{ + struct net *net = read_pnet(&set->net); + struct nft_trans_gc *trans; + + trans = kzalloc(sizeof(*trans), gfp); + if (!trans) + return NULL; + + trans->net = maybe_get_net(net); + if (!trans->net) { + kfree(trans); + return NULL; + } + + refcount_inc(&set->refs); + trans->set = set; + trans->seq = gc_seq; + + return trans; +} +EXPORT_SYMBOL_GPL(nft_trans_gc_alloc); + +void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) +{ + trans->priv[trans->count++] = priv; +} +EXPORT_SYMBOL_GPL(nft_trans_gc_elem_add); + +static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) +{ + spin_lock(&nf_tables_gc_list_lock); + list_add_tail(&trans->list, &nf_tables_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + schedule_work(&trans_gc_work); +} + +static int nft_trans_gc_space(struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) +{ + struct nft_set *set; + + if (nft_trans_gc_space(gc)) + return gc; + + set = gc->set; + nft_trans_gc_queue_work(gc); + + return nft_trans_gc_alloc(set, gc_seq, gfp); +} +EXPORT_SYMBOL_GPL(nft_trans_gc_queue_async); + +void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +{ + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + nft_trans_gc_queue_work(trans); +} +EXPORT_SYMBOL_GPL(nft_trans_gc_queue_async_done); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) +{ + struct nft_set *set; + + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + + set = gc->set; + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + + return nft_trans_gc_alloc(set, 0, gfp); +} +EXPORT_SYMBOL_GPL(nft_trans_gc_queue_sync); + +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +{ + WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); + + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + call_rcu(&trans->rcu, nft_trans_gc_trans_free); +} +EXPORT_SYMBOL_GPL(nft_trans_gc_queue_sync_done); + static void nf_tables_module_autoload_cleanup(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_module_request *req, *next; - WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); - list_for_each_entry_safe(req, next, &net->nft.module_list, list) { + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + list_for_each_entry_safe(req, next, &nft_net->module_list, list) { WARN_ON_ONCE(!req->done); list_del(&req->list); kfree(req); @@ -6768,6 +7273,7 @@ static void nf_tables_module_autoload_cleanup(struct net *net) static void nf_tables_commit_release(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans; /* all side effects have to be made visible. @@ -6777,41 +7283,71 @@ static void nf_tables_commit_release(struct net *net) * Memory reclaim happens asynchronously from work queue * to prevent expensive synchronize_rcu() in commit phase. */ - if (list_empty(&net->nft.commit_list)) { + if (list_empty(&nft_net->commit_list)) { nf_tables_module_autoload_cleanup(net); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return; } - trans = list_last_entry(&net->nft.commit_list, + trans = list_last_entry(&nft_net->commit_list, struct nft_trans, list); get_net(trans->ctx.net); WARN_ON_ONCE(trans->put_net); trans->put_net = true; spin_lock(&nf_tables_destroy_list_lock); - list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); + list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); schedule_work(&trans_destroy_work); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); +} + +static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net) +{ + unsigned int gc_seq; + + /* Bump gc counter, it becomes odd, this is the busy mark. */ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + + return gc_seq; +} + +static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) +{ + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); } static int nf_tables_commit(struct net *net, struct sk_buff *skb) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + unsigned int gc_seq; int err; - if (list_empty(&net->nft.commit_list)) { - mutex_unlock(&net->nft.commit_mutex); + if (list_empty(&nft_net->commit_list)) { + mutex_unlock(&nft_net->commit_mutex); return 0; } + list_for_each_entry(trans, &nft_net->binding_list, binding_list) { + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (nft_set_is_anonymous(nft_trans_set(trans)) && + !nft_trans_set_bound(trans)) { + pr_warn_once("nftables ruleset with unbound set\n"); + return -EINVAL; + } + break; + } + } + /* 0. Validate ruleset, otherwise roll back for error reporting. */ if (nf_tables_validate(net) < 0) return -EAGAIN; @@ -6821,7 +7357,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return err; /* 1. Allocate space for next generation rules_gen_X[] */ - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { int ret; if (trans->msg_type == NFT_MSG_NEWRULE || @@ -6837,7 +7373,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } /* step 2. Make rules_gen_X visible to packet path */ - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(chain, &table->chains, list) nf_tables_commit_chain(net, chain); } @@ -6846,20 +7382,26 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. */ - while (++net->nft.base_seq == 0); + while (++nft_net->base_seq == 0) + ; + + gc_seq = nft_gc_seq_begin(nft_net); /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (!nft_trans_table_enable(trans)) { - nf_tables_table_disable(net, - trans->ctx.table); - trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) { + nft_trans_destroy(trans); + break; } + if (trans->ctx.table->flags & NFT_TABLE_F_DORMANT) + nf_tables_table_disable(net, trans->ctx.table); + + trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE; } else { nft_clear(net, trans->ctx.table); } @@ -6907,6 +7449,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_COMMIT); + + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_NEWSET: nft_clear(net, nft_trans_set(trans)); @@ -6915,13 +7460,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) */ if (nft_set_is_anonymous(nft_trans_set(trans)) && !list_empty(&nft_trans_set(trans)->bindings)) - trans->ctx.table->use--; + nft_use_dec(&trans->ctx.table->use); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_NEWSET, GFP_KERNEL); nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_DELSET, GFP_KERNEL); @@ -6983,6 +7529,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + + nft_gc_seq_end(nft_net, gc_seq); nf_tables_commit_release(net); return 0; @@ -6990,17 +7538,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) static void nf_tables_module_autoload(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_module_request *req, *next; LIST_HEAD(module_list); - list_splice_init(&net->nft.module_list, &module_list); - mutex_unlock(&net->nft.commit_mutex); + list_splice_init(&nft_net->module_list, &module_list); + mutex_unlock(&nft_net->commit_mutex); list_for_each_entry_safe(req, next, &module_list, list) { request_module("%s", req->module); req->done = true; } - mutex_lock(&net->nft.commit_mutex); - list_splice(&module_list, &net->nft.module_list); + mutex_lock(&nft_net->commit_mutex); + list_splice(&module_list, &nft_net->module_list); } static void nf_tables_abort_release(struct nft_trans *trans) @@ -7016,7 +7565,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_NEWSET: - nft_set_destroy(nft_trans_set(trans)); + nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_NEWSETELEM: nft_set_elem_destroy(nft_trans_elem_set(trans), @@ -7034,23 +7583,31 @@ static void nf_tables_abort_release(struct nft_trans *trans) static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; struct nft_trans_elem *te; + int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) - return -EAGAIN; + err = -EAGAIN; - list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, + list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (nft_trans_table_enable(trans)) { - nf_tables_table_disable(net, - trans->ctx.table); + if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) { + nft_trans_destroy(trans); + break; + } + if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_DORMANT) { + nf_tables_table_disable(net, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + } else if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_AWAKEN) { + trans->ctx.table->flags &= ~NFT_TABLE_F_DORMANT; } + trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE; nft_trans_destroy(trans); } else { list_del_rcu(&trans->ctx.table->list); @@ -7066,7 +7623,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); nft_chain_del(trans->ctx.chain); nf_tables_unregister_hook(trans->ctx.net, trans->ctx.table, @@ -7074,34 +7631,37 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELCHAIN: - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, trans->ctx.chain); nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: - trans->ctx.chain->use--; + nft_use_dec_restore(&trans->ctx.chain->use); list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_ABORT); break; case NFT_MSG_DELRULE: - trans->ctx.chain->use++; + nft_use_inc_restore(&trans->ctx.chain->use); nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); break; } + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, nft_trans_set(trans)); + if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(&trans->ctx, nft_trans_set(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -7116,7 +7676,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; - nft_set_elem_activate(net, te->set, &te->elem); + nft_setelem_data_activate(net, te->set, &te->elem); te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; @@ -7127,23 +7687,23 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans)); nft_trans_destroy(trans); } else { - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); nft_obj_del(nft_trans_obj(trans)); } break; case NFT_MSG_DELOBJ: - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWFLOWTABLE: - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, nft_trans_flowtable(trans)); break; case NFT_MSG_DELFLOWTABLE: - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); nft_trans_destroy(trans); break; @@ -7153,43 +7713,50 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) synchronize_rcu(); list_for_each_entry_safe_reverse(trans, next, - &net->nft.commit_list, list) { - list_del(&trans->list); + &nft_net->commit_list, list) { + nft_trans_list_del(trans); nf_tables_abort_release(trans); } - if (action == NFNL_ABORT_AUTOLOAD) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - - return 0; -} - -static void nf_tables_cleanup(struct net *net) -{ - nft_validate_state_update(net, NFT_VALIDATE_SKIP); + return err; } static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { - int ret = __nf_tables_abort(net, action); + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + unsigned int gc_seq; + int ret; + + gc_seq = nft_gc_seq_begin(nft_net); + ret = __nf_tables_abort(net, action); + nft_gc_seq_end(nft_net, gc_seq); - mutex_unlock(&net->nft.commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + + mutex_unlock(&nft_net->commit_mutex); return ret; } static bool nf_tables_valid_genid(struct net *net, u32 genid) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); bool genid_ok; - mutex_lock(&net->nft.commit_mutex); + mutex_lock(&nft_net->commit_mutex); - genid_ok = genid == 0 || net->nft.base_seq == genid; + genid_ok = genid == 0 || nft_net->base_seq == genid; if (!genid_ok) - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); /* else, commit mutex has to be released by commit or abort function */ return genid_ok; @@ -7202,7 +7769,6 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .cb = nf_tables_cb, .commit = nf_tables_commit, .abort = nf_tables_abort, - .cleanup = nf_tables_cleanup, .valid_genid = nf_tables_valid_genid, .owner = THIS_MODULE, }; @@ -7364,28 +7930,24 @@ int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) } EXPORT_SYMBOL_GPL(nft_parse_u32_check); -/** - * nft_parse_register - parse a register value from a netlink attribute - * - * @attr: netlink attribute - * - * Parse and translate a register value from a netlink attribute. - * Registers used to be 128 bit wide, these register numbers will be - * mapped to the corresponding 32 bit register numbers. - */ -unsigned int nft_parse_register(const struct nlattr *attr) +static int nft_parse_register(const struct nlattr *attr, u32 *preg) { unsigned int reg; reg = ntohl(nla_get_be32(attr)); switch (reg) { case NFT_REG_VERDICT...NFT_REG_4: - return reg * NFT_REG_SIZE / NFT_REG32_SIZE; + *preg = reg * NFT_REG_SIZE / NFT_REG32_SIZE; + break; + case NFT_REG32_00...NFT_REG32_15: + *preg = reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; + break; default: - return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; + return -ERANGE; } + + return 0; } -EXPORT_SYMBOL_GPL(nft_parse_register); /** * nft_dump_register - dump a register value to a netlink attribute @@ -7418,7 +7980,7 @@ EXPORT_SYMBOL_GPL(nft_dump_register); * Validate that the input register is one of the general purpose * registers and that the length of the load is within the bounds. */ -int nft_validate_register_load(enum nft_registers reg, unsigned int len) +static int nft_validate_register_load(enum nft_registers reg, unsigned int len) { if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; @@ -7429,7 +7991,24 @@ int nft_validate_register_load(enum nft_registers reg, unsigned int len) return 0; } -EXPORT_SYMBOL_GPL(nft_validate_register_load); + +int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len) +{ + u32 reg; + int err; + + err = nft_parse_register(attr, ®); + if (err < 0) + return err; + + err = nft_validate_register_load(reg, len); + if (err < 0) + return err; + + *sreg = reg; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_register_load); /** * nft_validate_register_store - validate an expressions' register store @@ -7445,10 +8024,11 @@ EXPORT_SYMBOL_GPL(nft_validate_register_load); * A value of NULL for the data means that its runtime gathered * data. */ -int nft_validate_register_store(const struct nft_ctx *ctx, - enum nft_registers reg, - const struct nft_data *data, - enum nft_data_types type, unsigned int len) +static int nft_validate_register_store(const struct nft_ctx *ctx, + enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type, + unsigned int len) { int err; @@ -7480,7 +8060,27 @@ int nft_validate_register_store(const struct nft_ctx *ctx, return 0; } } -EXPORT_SYMBOL_GPL(nft_validate_register_store); + +int nft_parse_register_store(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *dreg, + const struct nft_data *data, + enum nft_data_types type, unsigned int len) +{ + int err; + u32 reg; + + err = nft_parse_register(attr, ®); + if (err < 0) + return err; + + err = nft_validate_register_store(ctx, reg, data, type, len); + if (err < 0) + return err; + + *dreg = reg; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_register_store); static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, @@ -7503,19 +8103,16 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (!tb[NFTA_VERDICT_CODE]) return -EINVAL; + + /* zero padding hole for memcmp */ + memset(data, 0, sizeof(*data)); data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); switch (data->verdict.code) { - default: - switch (data->verdict.code & NF_VERDICT_MASK) { - case NF_ACCEPT: - case NF_DROP: - case NF_QUEUE: - break; - default: - return -EINVAL; - } - /* fall through */ + case NF_ACCEPT: + case NF_DROP: + case NF_QUEUE: + break; case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: @@ -7530,10 +8127,13 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, return PTR_ERR(chain); if (nft_is_base_chain(chain)) return -EOPNOTSUPP; + if (!nft_use_inc(&chain->use)) + return -EMFILE; - chain->use++; data->verdict.chain = chain; break; + default: + return -EINVAL; } desc->len = sizeof(data->verdict); @@ -7543,10 +8143,13 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { + struct nft_chain *chain; + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use--; + chain = data->verdict.chain; + nft_use_dec(&chain->use); break; } } @@ -7700,32 +8303,40 @@ int __nft_release_basechain(struct nft_ctx *ctx) nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); - ctx->chain->use--; + nft_use_dec(&ctx->chain->use); nf_tables_rule_release(ctx, rule); } nft_chain_del(ctx->chain); - ctx->table->use--; + nft_use_dec(&ctx->table->use); nf_tables_chain_destroy(ctx); return 0; } EXPORT_SYMBOL_GPL(__nft_release_basechain); +static void __nft_release_hook(struct net *net, struct nft_table *table) +{ + struct nft_flowtable *flowtable; + struct nft_chain *chain; + + list_for_each_entry(chain, &table->chains, list) + __nf_tables_unregister_hook(net, table, chain, true); + list_for_each_entry(flowtable, &table->flowtables, list) + __nft_unregister_flowtable_net_hooks(net, flowtable, true); +} + static void __nft_release_hooks(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_table *table; - struct nft_chain *chain; - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hook(net, table, chain); - } + list_for_each_entry(table, &nft_net->tables, list) + __nft_release_hook(net, table); } -static void __nft_release_tables(struct net *net) +static void __nft_release_table(struct net *net, struct nft_table *table) { struct nft_flowtable *flowtable, *nf; - struct nft_table *table, *nt; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; struct nft_rule *rule, *nr; @@ -7735,75 +8346,114 @@ static void __nft_release_tables(struct net *net) .family = NFPROTO_NETDEV, }; - list_for_each_entry_safe(table, nt, &net->nft.tables, list) { - ctx.family = table->family; - ctx.table = table; - list_for_each_entry(chain, &table->chains, list) { - ctx.chain = chain; - list_for_each_entry_safe(rule, nr, &chain->rules, list) { - list_del(&rule->list); - chain->use--; - nf_tables_rule_release(&ctx, rule); - } - } - list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { - list_del(&flowtable->list); - table->use--; - nf_tables_flowtable_destroy(flowtable); - } - list_for_each_entry_safe(set, ns, &table->sets, list) { - list_del(&set->list); - table->use--; - nft_set_destroy(set); - } - list_for_each_entry_safe(obj, ne, &table->objects, list) { - nft_obj_del(obj); - table->use--; - nft_obj_destroy(&ctx, obj); - } - list_for_each_entry_safe(chain, nc, &table->chains, list) { - ctx.chain = chain; - nft_chain_del(chain); - table->use--; - nf_tables_chain_destroy(&ctx); + ctx.family = table->family; + ctx.table = table; + list_for_each_entry(chain, &table->chains, list) { + ctx.chain = chain; + list_for_each_entry_safe(rule, nr, &chain->rules, list) { + list_del(&rule->list); + nft_use_dec(&chain->use); + nf_tables_rule_release(&ctx, rule); } - list_del(&table->list); - nf_tables_table_destroy(&ctx); } + list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { + list_del(&flowtable->list); + nft_use_dec(&table->use); + nf_tables_flowtable_destroy(flowtable); + } + list_for_each_entry_safe(set, ns, &table->sets, list) { + list_del(&set->list); + nft_use_dec(&table->use); + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(&ctx, set); + + nft_set_destroy(&ctx, set); + } + list_for_each_entry_safe(obj, ne, &table->objects, list) { + nft_obj_del(obj); + nft_use_dec(&table->use); + nft_obj_destroy(&ctx, obj); + } + list_for_each_entry_safe(chain, nc, &table->chains, list) { + ctx.chain = chain; + nft_chain_del(chain); + nft_use_dec(&table->use); + nf_tables_chain_destroy(&ctx); + } + list_del(&table->list); + nf_tables_table_destroy(&ctx); +} + +static void __nft_release_tables(struct net *net) +{ + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nft_table *table, *nt; + + list_for_each_entry_safe(table, nt, &nft_net->tables, list) + __nft_release_table(net, table); } static int __net_init nf_tables_init_net(struct net *net) { - INIT_LIST_HEAD(&net->nft.tables); - INIT_LIST_HEAD(&net->nft.commit_list); - INIT_LIST_HEAD(&net->nft.module_list); - mutex_init(&net->nft.commit_mutex); - net->nft.base_seq = 1; - net->nft.validate_state = NFT_VALIDATE_SKIP; + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + INIT_LIST_HEAD(&nft_net->tables); + INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->binding_list); + INIT_LIST_HEAD(&nft_net->module_list); + INIT_LIST_HEAD(&nft_net->notify_list); + mutex_init(&nft_net->commit_mutex); + nft_net->base_seq = 1; + nft_net->validate_state = NFT_VALIDATE_SKIP; + nft_net->gc_seq = 0; return 0; } static void __net_exit nf_tables_pre_exit_net(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + mutex_lock(&nft_net->commit_mutex); __nft_release_hooks(net); + mutex_unlock(&nft_net->commit_mutex); } static void __net_exit nf_tables_exit_net(struct net *net) { - mutex_lock(&net->nft.commit_mutex); - if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + unsigned int gc_seq; + + mutex_lock(&nft_net->commit_mutex); + + gc_seq = nft_gc_seq_begin(nft_net); + + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + if (!list_empty(&nft_net->module_list)) + nf_tables_module_autoload_cleanup(net); + __nft_release_tables(net); - mutex_unlock(&net->nft.commit_mutex); - WARN_ON_ONCE(!list_empty(&net->nft.tables)); - WARN_ON_ONCE(!list_empty(&net->nft.module_list)); + + nft_gc_seq_end(nft_net, gc_seq); + + mutex_unlock(&nft_net->commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->tables)); + WARN_ON_ONCE(!list_empty(&nft_net->module_list)); +} + +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); } static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, + .id = &nf_tables_net_id, + .size = sizeof(struct nftables_pernet), }; static int __init nf_tables_module_init(void) @@ -7865,7 +8515,9 @@ static void __exit nf_tables_module_exit(void) unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); nft_chain_route_fini(); + nf_tables_trans_destroy_flush_work(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 2d3bc22c855c..1e691eff1c40 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -7,6 +7,8 @@ #include <net/netfilter/nf_tables_offload.h> #include <net/pkt_cls.h> +extern unsigned int nf_tables_net_id; + static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; @@ -345,11 +347,12 @@ static int nft_flow_offload_chain(struct nft_chain *chain, int nft_flow_rule_offload_commit(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans; int err = 0; u8 policy; - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; @@ -400,7 +403,7 @@ int nft_flow_rule_offload_commit(struct net *net) break; } - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; @@ -419,14 +422,14 @@ int nft_flow_rule_offload_commit(struct net *net) return err; } -static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) +static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, + struct net_device *dev) { struct nft_base_chain *basechain; - struct net *net = dev_net(dev); const struct nft_table *table; struct nft_chain *chain; - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -450,18 +453,20 @@ static void nft_indr_block_cb(struct net_device *dev, flow_indr_block_bind_cb_t *cb, void *cb_priv, enum flow_block_command cmd) { + struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + chain = __nft_offload_get_chain(nft_net, dev); if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { struct nft_base_chain *basechain; basechain = nft_base_chain(chain); nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd); } - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); } static void nft_offload_chain_clean(struct nft_chain *chain) @@ -480,17 +485,19 @@ static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_offload_chain_clean(chain); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 87b36da5cd98..0cf3278007ba 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -183,7 +183,6 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) void nft_trace_notify(struct nft_traceinfo *info) { const struct nft_pktinfo *pkt = info->pkt; - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; @@ -219,15 +218,11 @@ void nft_trace_notify(struct nft_traceinfo *info) return; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE); - nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(skb, 0, 0, event, 0, info->basechain->type->family, + NFNETLINK_V0, 0); if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = info->basechain->type->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt)))) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 81c86a156c6c..3cff23c814c3 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -455,7 +455,8 @@ ack: * processed, this avoids that the same error is * reported several times when replaying the batch. */ - if (nfnl_err_add(&err_list, nlh, err, &extack) < 0) { + if (err == -ENOMEM || + nfnl_err_add(&err_list, nlh, err, &extack) < 0) { /* We failed to enqueue an error, reset the * list of errors and send OOM to userspace * pointing to the batch header. @@ -512,8 +513,6 @@ done: goto replay_abort; } } - if (ss->cleanup) - ss->cleanup(net); nfnl_err_deliver(&err_list, oskb); kfree_skb(skb); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 2481470dec36..4b46421c5e17 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -132,21 +132,16 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_acct *acct) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; u64 pkts, bytes; u32 old_flags; event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFACCT_NAME, acct->name)) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 3d5fc07b2530..cb9c2e955996 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -530,20 +530,15 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_conntrack_helper *helper) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; int status; event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFCTH_NAME, helper->name)) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index da915c224a82..a854e1560cc1 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -160,22 +160,17 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct ctnl_timeout *timeout) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->timeout.l3num)) || @@ -382,21 +377,16 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, const unsigned int *timeouts) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b36af4741ad3..80c09070ea9f 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -452,20 +452,15 @@ __build_packet_message(struct nfnl_log_net *log, { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; - nlh = nlmsg_put(inst->skb, 0, 0, - nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), - sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(inst->skb, 0, 0, + nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), + 0, pf, NFNETLINK_V0, htons(inst->group_num)); if (!nlh) return -1; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(inst->group_num); memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; @@ -688,8 +683,8 @@ nfulnl_log_packet(struct net *net, unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; + enum ip_conntrack_info ctinfo = 0; struct nf_conn *ct = NULL; - enum ip_conntrack_info uninitialized_var(ctinfo); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 79fbf37291f3..573a372e760f 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -269,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb, struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; + bool found = false; memset(&ctx, 0, sizeof(ctx)); @@ -283,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb, data->genre = f->genre; data->version = f->version; + found = true; break; } - return true; + return found; } EXPORT_SYMBOL_GPL(nf_osf_find); @@ -314,6 +316,14 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + if (f->opt_num > ARRAY_SIZE(f->opt)) + return -EINVAL; + + if (!memchr(f->genre, 0, MAXGENRELEN) || + !memchr(f->subtype, 0, MAXGENRELEN) || + !memchr(f->version, 0, MAXGENRELEN)) + return -EINVAL; + kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; @@ -438,3 +448,4 @@ module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7d3ab08a5a2d..772f8c69818c 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -383,12 +383,11 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; struct nf_conn *ct = NULL; - enum ip_conntrack_info uninitialized_var(ctinfo); + enum ip_conntrack_info ctinfo; struct nfnl_ct_hook *nfnl_ct; bool csum_verify; char *secdata = NULL; @@ -469,18 +468,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nlmsg_failure; } - nlh = nlmsg_put(skb, 0, 0, - nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), - sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(skb, 0, 0, + nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), + 0, entry->state.pf, NFNETLINK_V0, + htons(queue->queue_num)); if (!nlh) { skb_tx_error(entskb); kfree_skb(skb); goto nlmsg_failure; } - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = entry->state.pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(queue->queue_num); nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); pmsg = nla_data(nla); @@ -846,11 +842,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) } static int -nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) +nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff) { struct sk_buff *nskb; if (diff < 0) { + unsigned int min_len = skb_transport_offset(e->skb); + + if (data_len < min_len) + return -EINVAL; + if (pskb_trim(e->skb, data_len)) return -ENOMEM; } else if (diff > 0) { @@ -1183,7 +1184,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, struct nfqnl_instance *queue; unsigned int verdict; struct nf_queue_entry *entry; - enum ip_conntrack_info uninitialized_var(ctinfo); + enum ip_conntrack_info ctinfo; struct nfnl_ct_hook *nfnl_ct; struct nf_conn *ct = NULL; struct nfnl_queue_net *q = nfnl_queue_pernet(net); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 10e9d50e4e19..ccab2e66d754 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -16,8 +16,8 @@ #include <net/netfilter/nf_tables_offload.h> struct nft_bitwise { - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; u8 len; struct nft_data mask; struct nft_data xor; @@ -65,14 +65,14 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, priv->len = len; - priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); - err = nft_validate_register_load(priv->sreg, priv->len); + err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + priv->len); if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); if (err < 0) return err; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 12bed3f7bbc6..9d250bd60bb8 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -16,8 +16,8 @@ #include <net/netfilter/nf_tables.h> struct nft_byteorder { - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; enum nft_byteorder_ops op:8; u8 len; u8 size; @@ -30,28 +30,29 @@ void nft_byteorder_eval(const struct nft_expr *expr, const struct nft_byteorder *priv = nft_expr_priv(expr); u32 *src = ®s->data[priv->sreg]; u32 *dst = ®s->data[priv->dreg]; - union { u32 u32; u16 u16; } *s, *d; + u16 *s16, *d16; unsigned int i; - s = (void *)src; - d = (void *)dst; + s16 = (void *)src; + d16 = (void *)dst; switch (priv->size) { case 8: { + u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); - nft_reg_store64(&dst[i], be64_to_cpu(src64)); + nft_reg_store64(&dst64[i], be64_to_cpu(src64)); } break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); - nft_reg_store64(&dst[i], src64); + nft_reg_store64(&dst64[i], src64); } break; } @@ -61,11 +62,11 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 4; i++) - d[i].u32 = ntohl((__force __be32)s[i].u32); + dst[i] = ntohl((__force __be32)src[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 4; i++) - d[i].u32 = (__force __u32)htonl(s[i].u32); + dst[i] = (__force __u32)htonl(src[i]); break; } break; @@ -73,11 +74,11 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 2; i++) - d[i].u16 = ntohs((__force __be16)s[i].u16); + d16[i] = ntohs((__force __be16)s16[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 2; i++) - d[i].u16 = (__force __u16)htons(s[i].u16); + d16[i] = (__force __u16)htons(s16[i]); break; } break; @@ -131,20 +132,20 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, return -EINVAL; } - priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]); err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); if (err < 0) return err; priv->len = len; - err = nft_validate_register_load(priv->sreg, priv->len); + err = nft_parse_register_load(tb[NFTA_BYTEORDER_SREG], &priv->sreg, + priv->len); if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BYTEORDER_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index b5d5d071d765..916195dba6f7 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -2,6 +2,7 @@ #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/netfilter/nf_tables.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> @@ -10,6 +11,8 @@ #include <net/netfilter/nf_tables_ipv4.h> #include <net/netfilter/nf_tables_ipv6.h> +extern unsigned int nf_tables_net_id; + #ifdef CONFIG_NF_TABLES_IPV4 static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, @@ -293,6 +296,9 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, if (strcmp(basechain->dev_name, dev->name) != 0) return; + if (!basechain->ops.dev) + return; + /* UNREGISTER events are also happpening on netns exit. * * Altough nf_tables core releases all tables/chains, only @@ -315,6 +321,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; struct nft_table *table; struct nft_chain *chain, *nr; struct nft_ctx ctx = { @@ -325,8 +332,9 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - mutex_lock(&ctx.net->nft.commit_mutex); - list_for_each_entry(table, &ctx.net->nft.tables, list) { + nft_net = net_generic(ctx.net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -340,7 +348,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, nft_netdev_event(event, dev, &ctx); } } - mutex_unlock(&ctx.net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index ae730dba60c8..a7c1e7c4381a 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -17,7 +17,7 @@ struct nft_cmp_expr { struct nft_data data; - enum nft_registers sreg:8; + u8 sreg; u8 len; enum nft_cmp_ops op:8; }; @@ -86,8 +86,7 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return err; } - priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); - err = nft_validate_register_load(priv->sreg, desc.len); + err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -169,8 +168,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); - err = nft_validate_register_load(priv->sreg, desc.len); + err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index bbe03b9a03b1..fdce5012a4f3 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -192,6 +192,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; + u32 l4proto; u32 flags; int err; @@ -204,12 +205,18 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) return -EINVAL; flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); - if (flags & ~NFT_RULE_COMPAT_F_MASK) + if (flags & NFT_RULE_COMPAT_F_UNUSED || + flags & ~NFT_RULE_COMPAT_F_MASK) return -EINVAL; if (flags & NFT_RULE_COMPAT_F_INV) *inv = true; - *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + if (l4proto > U16_MAX) + return -EINVAL; + + *proto = l4proto; + return 0; } @@ -327,6 +334,22 @@ static int nft_target_validate(const struct nft_ctx *ctx, unsigned int hook_mask = 0; int ret; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && + ctx->family != NFPROTO_BRIDGE && + ctx->family != NFPROTO_ARP) + return -EOPNOTSUPP; + + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); @@ -569,6 +592,22 @@ static int nft_match_validate(const struct nft_ctx *ctx, unsigned int hook_mask = 0; int ret; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && + ctx->family != NFPROTO_BRIDGE && + ctx->family != NFPROTO_ARP) + return -EOPNOTSUPP; + + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); @@ -591,19 +630,14 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int rev, int target) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 28991730728b..0c7f091d7d54 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -27,8 +27,8 @@ struct nft_ct { enum nft_ct_keys key:8; enum ip_conntrack_dir dir:8; union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; + u8 dreg; + u8 sreg; }; }; @@ -481,6 +481,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, break; #endif case NFT_CT_ID: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = sizeof(u32); break; default: @@ -498,9 +501,8 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, } } - priv->dreg = nft_parse_register(tb[NFTA_CT_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, + NFT_DATA_VALUE, len); if (err < 0) return err; @@ -600,8 +602,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, } } - priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); - err = nft_validate_register_load(priv->sreg, len); + err = nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len); if (err < 0) goto err1; @@ -1176,7 +1177,30 @@ static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_EXPECT_L3PROTO]) priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); + switch (priv->l3num) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + if (priv->l3num == ctx->family || ctx->family == NFPROTO_INET) + break; + + return -EINVAL; + case NFPROTO_INET: /* tuple.src.l3num supports NFPROTO_IPV4/6 only */ + default: + return -EAFNOSUPPORT; + } + priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); + switch (priv->l4proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_DCCP: + case IPPROTO_SCTP: + break; + default: + return -EOPNOTSUPP; + } + priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index 6007089e1c2f..a5b560ee0337 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -14,7 +14,7 @@ #include <net/netfilter/nf_dup_netdev.h> struct nft_dup_netdev { - enum nft_registers sreg_dev:8; + u8 sreg_dev; }; static void nft_dup_netdev_eval(const struct nft_expr *expr, @@ -40,8 +40,8 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_DEV] == NULL) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + return nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, + sizeof(int)); } static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 6bcc18124e5b..e0c17217817d 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -11,13 +11,16 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +#include <net/netns/generic.h> + +extern unsigned int nf_tables_net_id; struct nft_dynset { struct nft_set *set; struct nft_set_ext_tmpl tmpl; enum nft_dynset_ops op:8; - enum nft_registers sreg_key:8; - enum nft_registers sreg_data:8; + u8 sreg_key; + u8 sreg_data; bool invert; u64 timeout; struct nft_expr *expr; @@ -129,13 +132,14 @@ static int nft_dynset_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct nft_dynset *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u64 timeout; int err; - lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); if (tb[NFTA_DYNSET_SET_NAME] == NULL || tb[NFTA_DYNSET_OP] == NULL || @@ -157,6 +161,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_OBJECT) + return -EOPNOTSUPP; + if (set->ops->update == NULL) return -EOPNOTSUPP; @@ -177,8 +184,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return err; } - priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]); - err = nft_validate_register_load(priv->sreg_key, set->klen); + err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key, + set->klen); if (err < 0) return err; @@ -188,8 +195,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; - priv->sreg_data = nft_parse_register(tb[NFTA_DYNSET_SREG_DATA]); - err = nft_validate_register_load(priv->sreg_data, set->dlen); + err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_DATA], + &priv->sreg_data, set->dlen); if (err < 0) return err; } else if (set->flags & NFT_SET_MAP) @@ -259,7 +266,7 @@ static void nft_dynset_activate(const struct nft_ctx *ctx, { struct nft_dynset *priv = nft_expr_priv(expr); - priv->set->use++; + nf_tables_activate_set(ctx, priv->set); } static void nft_dynset_destroy(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index faa0844c01fb..670dd146fb2b 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -19,8 +19,8 @@ struct nft_exthdr { u8 offset; u8 len; u8 op; - enum nft_registers dreg:8; - enum nft_registers sreg:8; + u8 dreg; + u8 sreg; u8 flags; }; @@ -353,12 +353,12 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; - priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); priv->flags = flags; priv->op = op; - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_EXTHDR_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, @@ -403,11 +403,11 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; - priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]); priv->flags = flags; priv->op = op; - return nft_validate_register_load(priv->sreg, priv->len); + return nft_parse_register_load(tb[NFTA_EXTHDR_SREG], &priv->sreg, + priv->len); } static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index cfac0964f48d..d2777aff5943 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -86,7 +86,6 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); - priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]); switch (priv->result) { case NFT_FIB_RESULT_OIF: @@ -106,8 +105,8 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; } - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + err = nft_parse_register_store(ctx, tb[NFTA_FIB_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); if (err < 0) return err; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index ff5ac173e897..850d4e92702e 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -147,6 +147,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx, { unsigned int hook_mask = (1 << NF_INET_FORWARD); + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, hook_mask); } @@ -171,8 +176,10 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (IS_ERR(flowtable)) return PTR_ERR(flowtable); + if (!nft_use_inc(&flowtable->use)) + return -EMFILE; + priv->flowtable = flowtable; - flowtable->use++; return nf_ct_netns_get(ctx->net, ctx->family); } @@ -191,7 +198,7 @@ static void nft_flow_offload_activate(const struct nft_ctx *ctx, { struct nft_flow_offload *priv = nft_expr_priv(expr); - priv->flowtable->use++; + nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 3b0dcd170551..7730409f6f09 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -18,7 +18,7 @@ #include <net/ip.h> struct nft_fwd_netdev { - enum nft_registers sreg_dev:8; + u8 sreg_dev; }; static void nft_fwd_netdev_eval(const struct nft_expr *expr, @@ -50,8 +50,8 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_FWD_SREG_DEV] == NULL) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + return nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + sizeof(int)); } static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -83,8 +83,8 @@ static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr) } struct nft_fwd_neigh { - enum nft_registers sreg_dev:8; - enum nft_registers sreg_addr:8; + u8 sreg_dev; + u8 sreg_addr; u8 nfproto; }; @@ -162,8 +162,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, !tb[NFTA_FWD_NFPROTO]) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); - priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]); priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO])); switch (priv->nfproto) { @@ -177,11 +175,13 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - err = nft_validate_register_load(priv->sreg_dev, sizeof(int)); + err = nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + sizeof(int)); if (err < 0) return err; - return nft_validate_register_load(priv->sreg_addr, addr_len); + return nft_parse_register_load(tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr, + addr_len); } static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index b836d550b919..2ff6c7759494 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -14,8 +14,8 @@ #include <linux/jhash.h> struct nft_jhash { - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; u8 len; bool autogen_seed:1; u32 modulus; @@ -38,7 +38,7 @@ static void nft_jhash_eval(const struct nft_expr *expr, } struct nft_symhash { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; u32 offset; }; @@ -83,9 +83,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx, if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); - priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len); if (err < 0) return err; @@ -94,6 +91,10 @@ static int nft_jhash_init(const struct nft_ctx *ctx, priv->len = len; + err = nft_parse_register_load(tb[NFTA_HASH_SREG], &priv->sreg, len); + if (err < 0) + return err; + priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; @@ -108,9 +109,8 @@ static int nft_jhash_init(const struct nft_ctx *ctx, get_random_bytes(&priv->seed, sizeof(priv->seed)); } - return nft_validate_register_load(priv->sreg, len) && - nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_symhash_init(const struct nft_ctx *ctx, @@ -126,8 +126,6 @@ static int nft_symhash_init(const struct nft_ctx *ctx, if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; @@ -135,8 +133,9 @@ static int nft_symhash_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + sizeof(u32)); } static int nft_jhash_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 98a8149be094..6a95d532eaec 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -48,9 +48,9 @@ static int nft_immediate_init(const struct nft_ctx *ctx, priv->dlen = desc.len; - priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, &priv->data, - desc.type, desc.len); + err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG], + &priv->dreg, &priv->data, desc.type, + desc.len); if (err < 0) goto err1; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 660bad688e2b..e0ffd463a132 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -17,8 +17,8 @@ struct nft_lookup { struct nft_set *set; - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; bool invert; struct nft_set_binding binding; }; @@ -73,8 +73,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); - err = nft_validate_register_load(priv->sreg, set->klen); + err = nft_parse_register_load(tb[NFTA_LOOKUP_SREG], &priv->sreg, + set->klen); if (err < 0) return err; @@ -97,9 +97,9 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_MAP)) return -EINVAL; - priv->dreg = nft_parse_register(tb[NFTA_LOOKUP_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - set->dtype, set->dlen); + err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], + &priv->dreg, NULL, set->dtype, + set->dlen); if (err < 0) return err; } else if (set->flags & NFT_SET_MAP) @@ -129,7 +129,7 @@ static void nft_lookup_activate(const struct nft_ctx *ctx, { struct nft_lookup *priv = nft_expr_priv(expr); - priv->set->use++; + nf_tables_activate_set(ctx, priv->set); } static void nft_lookup_destroy(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 39dc94f2491e..c2f04885347e 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -15,8 +15,8 @@ struct nft_masq { u32 flags; - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_proto_min; + u8 sreg_proto_max; }; static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { @@ -54,19 +54,15 @@ static int nft_masq_init(const struct nft_ctx *ctx, } if (tb[NFTA_MASQ_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_MASQ_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index dda1e55d5801..ac7d3c78501b 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -247,7 +247,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; case NFT_META_TIME_NS: - nft_reg_store64(dest, ktime_get_real_ns()); + nft_reg_store64((u64 *)dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: nft_reg_store8(dest, nft_meta_weekday(ktime_get_real_seconds())); @@ -380,9 +380,8 @@ int nft_meta_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } EXPORT_SYMBOL_GPL(nft_meta_get_init); @@ -475,8 +474,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); - err = nft_validate_register_load(priv->sreg, len); + err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) return err; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 0c5bc3c37ecf..69fcea16218b 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -21,10 +21,10 @@ #include <net/ip.h> struct nft_nat { - enum nft_registers sreg_addr_min:8; - enum nft_registers sreg_addr_max:8; - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_addr_min; + u8 sreg_addr_max; + u8 sreg_proto_min; + u8 sreg_proto_max; enum nf_nat_manip_type type:8; u8 family; u16 flags; @@ -88,6 +88,11 @@ static int nft_nat_validate(const struct nft_ctx *ctx, struct nft_nat *priv = nft_expr_priv(expr); int err; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; @@ -154,18 +159,15 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { - priv->sreg_addr_min = - nft_parse_register(tb[NFTA_NAT_REG_ADDR_MIN]); - err = nft_validate_register_load(priv->sreg_addr_min, alen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN], + &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { - priv->sreg_addr_max = - nft_parse_register(tb[NFTA_NAT_REG_ADDR_MAX]); - - err = nft_validate_register_load(priv->sreg_addr_max, - alen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX], + &priv->sreg_addr_max, + alen); if (err < 0) return err; } else { @@ -175,19 +177,15 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_NAT_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 48edb9d5f012..7bbca252e7fc 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -16,7 +16,7 @@ static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); struct nft_ng_inc { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; atomic_t counter; u32 offset; @@ -66,11 +66,10 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); atomic_set(&priv->counter, priv->modulus - 1); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, @@ -100,7 +99,7 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) } struct nft_ng_random { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; u32 offset; }; @@ -140,10 +139,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, prandom_init_once(&nft_numgen_prandom_state); - priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); - - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index bfd18d2b65a2..f5028479c028 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -41,8 +41,10 @@ static int nft_objref_init(const struct nft_ctx *ctx, if (IS_ERR(obj)) return -ENOENT; + if (!nft_use_inc(&obj->use)) + return -EMFILE; + nft_objref_priv(expr) = obj; - obj->use++; return 0; } @@ -71,7 +73,7 @@ static void nft_objref_deactivate(const struct nft_ctx *ctx, if (phase == NFT_TRANS_COMMIT) return; - obj->use--; + nft_use_dec(&obj->use); } static void nft_objref_activate(const struct nft_ctx *ctx, @@ -79,7 +81,7 @@ static void nft_objref_activate(const struct nft_ctx *ctx, { struct nft_object *obj = nft_objref_priv(expr); - obj->use++; + nft_use_inc_restore(&obj->use); } static struct nft_expr_type nft_objref_type; @@ -95,7 +97,7 @@ static const struct nft_expr_ops nft_objref_ops = { struct nft_objref_map { struct nft_set *set; - enum nft_registers sreg:8; + u8 sreg; struct nft_set_binding binding; }; @@ -137,8 +139,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; - priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]); - err = nft_validate_register_load(priv->sreg, set->klen); + err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg, + set->klen); if (err < 0) return err; @@ -180,7 +182,7 @@ static void nft_objref_map_activate(const struct nft_ctx *ctx, { struct nft_objref_map *priv = nft_expr_priv(expr); - priv->set->use++; + nf_tables_activate_set(ctx, priv->set); } static void nft_objref_map_destroy(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 4911f8eb394f..b7c2bc01f8a2 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -6,7 +6,7 @@ #include <linux/netfilter/nfnetlink_osf.h> struct nft_osf { - enum nft_registers dreg:8; + u8 dreg; u8 ttl; u32 flags; }; @@ -83,9 +83,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->flags = flags; } - priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); + err = nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, + NFT_OSF_MAXGENRELEN); if (err < 0) return err; @@ -115,9 +115,21 @@ static int nft_osf_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { - return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_FORWARD)); + unsigned int hooks; + + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); } static struct nft_expr_type nft_osf_type; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index cf0512fc648e..a4f9a150812a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -84,7 +84,7 @@ void nft_payload_eval(const struct nft_expr *expr, switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: - if (!skb_mac_header_was_set(skb)) + if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) == 0) goto err; if (skb_vlan_tag_present(skb)) { @@ -135,10 +135,10 @@ static int nft_payload_init(const struct nft_ctx *ctx, priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); - priv->dreg = nft_parse_register(tb[NFTA_PAYLOAD_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -558,18 +558,23 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_payload_set *priv = nft_expr_priv(expr); + u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE; + int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); - priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]); if (tb[NFTA_PAYLOAD_CSUM_TYPE]) - priv->csum_type = - ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); - if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) - priv->csum_offset = - ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET])); + csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); + if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_CSUM_OFFSET], U8_MAX, + &csum_offset); + if (err < 0) + return err; + + priv->csum_offset = csum_offset; + } if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) { u32 flags; @@ -580,15 +585,17 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, priv->csum_flags = flags; } - switch (priv->csum_type) { + switch (csum_type) { case NFT_PAYLOAD_CSUM_NONE: case NFT_PAYLOAD_CSUM_INET: break; default: return -EOPNOTSUPP; } + priv->csum_type = csum_type; - return nft_validate_register_load(priv->sreg, priv->len); + return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg, + priv->len); } static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -624,6 +631,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, { enum nft_payload_bases base; unsigned int offset, len; + int err; if (tb[NFTA_PAYLOAD_BASE] == NULL || tb[NFTA_PAYLOAD_OFFSET] == NULL || @@ -649,8 +657,13 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); - len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset); + if (err < 0) + return ERR_PTR(err); + + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_LEN], U8_MAX, &len); + if (err < 0) + return ERR_PTR(err); if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && base != NFT_PAYLOAD_LL_HEADER) diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 5ece0a6aa8c3..94a4f0a5a28e 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -19,10 +19,10 @@ static u32 jhash_initval __read_mostly; struct nft_queue { - enum nft_registers sreg_qnum:8; - u16 queuenum; - u16 queues_total; - u16 flags; + u8 sreg_qnum; + u16 queuenum; + u16 queues_total; + u16 flags; }; static void nft_queue_eval(const struct nft_expr *expr, @@ -111,8 +111,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, struct nft_queue *priv = nft_expr_priv(expr); int err; - priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]); - err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32)); + err = nft_parse_register_load(tb[NFTA_QUEUE_SREG_QNUM], + &priv->sreg_qnum, sizeof(u32)); if (err < 0) return err; diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 89efcc5a533d..e4a1c44d7f51 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -15,7 +15,7 @@ struct nft_range_expr { struct nft_data data_from; struct nft_data data_to; - enum nft_registers sreg:8; + u8 sreg; u8 len; enum nft_range_ops op:8; }; @@ -86,8 +86,8 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr goto err2; } - priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]); - err = nft_validate_register_load(priv->sreg, desc_from.len); + err = nft_parse_register_load(tb[NFTA_RANGE_SREG], &priv->sreg, + desc_from.len); if (err < 0) goto err2; diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 43eeb1f609f1..7179dada5757 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -14,8 +14,8 @@ #include <net/netfilter/nf_tables.h> struct nft_redir { - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_proto_min; + u8 sreg_proto_max; u16 flags; }; @@ -50,24 +50,22 @@ static int nft_redir_init(const struct nft_ctx *ctx, plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_REDIR_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { priv->sreg_proto_max = priv->sreg_proto_min; } + + priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_REDIR_FLAGS]) { @@ -102,25 +100,37 @@ nla_put_failure: return -1; } -static void nft_redir_ipv4_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_redir_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_ipv4_multi_range_compat mr; + const struct nft_redir *priv = nft_expr_priv(expr); + struct nf_nat_range2 range; - memset(&mr, 0, sizeof(mr)); + memset(&range, 0, sizeof(range)); + range.flags = priv->flags; if (priv->sreg_proto_min) { - mr.range[0].min.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - mr.range[0].max.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + range.min_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_max]); } - mr.range[0].flags |= priv->flags; - - regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range, + nft_hook(pkt)); + break; +#ifdef CONFIG_NF_TABLES_IPV6 + case NFPROTO_IPV6: + regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, + nft_hook(pkt)); + break; +#endif + default: + WARN_ON_ONCE(1); + break; + } } static void @@ -133,7 +143,7 @@ static struct nft_expr_type nft_redir_ipv4_type; static const struct nft_expr_ops nft_redir_ipv4_ops = { .type = &nft_redir_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), - .eval = nft_redir_ipv4_eval, + .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_ipv4_destroy, .dump = nft_redir_dump, @@ -150,28 +160,6 @@ static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { }; #ifdef CONFIG_NF_TABLES_IPV6 -static void nft_redir_ipv6_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_range2 range; - - memset(&range, 0, sizeof(range)); - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - } - - range.flags |= priv->flags; - - regs->verdict.code = - nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt)); -} - static void nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -182,7 +170,7 @@ static struct nft_expr_type nft_redir_ipv6_type; static const struct nft_expr_ops nft_redir_ipv6_ops = { .type = &nft_redir_ipv6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), - .eval = nft_redir_ipv6_eval, + .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_ipv6_destroy, .dump = nft_redir_dump, @@ -200,20 +188,6 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { #endif #ifdef CONFIG_NF_TABLES_INET -static void nft_redir_inet_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - return nft_redir_ipv4_eval(expr, regs, pkt); - case NFPROTO_IPV6: - return nft_redir_ipv6_eval(expr, regs, pkt); - } - - WARN_ON_ONCE(1); -} - static void nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -224,7 +198,7 @@ static struct nft_expr_type nft_redir_inet_type; static const struct nft_expr_ops nft_redir_inet_ops = { .type = &nft_redir_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), - .eval = nft_redir_inet_eval, + .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_inet_destroy, .dump = nft_redir_dump, @@ -236,7 +210,7 @@ static struct nft_expr_type nft_redir_inet_type __read_mostly = { .name = "redir", .ops = &nft_redir_inet_ops, .policy = nft_redir_policy, - .maxattr = NFTA_MASQ_MAX, + .maxattr = NFTA_REDIR_MAX, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 7cfcb0e2f7ee..f4a96164a5a1 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -15,7 +15,7 @@ struct nft_rt { enum nft_rt_keys key:8; - enum nft_registers dreg:8; + u8 dreg; }; static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst) @@ -141,9 +141,8 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_RT_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_rt_get_dump(struct sk_buff *skb, @@ -167,6 +166,11 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp const struct nft_rt *priv = nft_expr_priv(expr); unsigned int hooks; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + switch (priv->key) { case NFT_RT_NEXTHOP4: case NFT_RT_NEXTHOP6: diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 087a056e34d1..b0f6b1490e1a 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -270,13 +270,14 @@ static int nft_bitmap_init(const struct nft_set *set, return 0; } -static void nft_bitmap_destroy(const struct nft_set *set) +static void nft_bitmap_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be, *n; list_for_each_entry_safe(be, n, &priv->list, head) - nft_set_elem_destroy(set, be, true); + nf_tables_set_elem_destroy(ctx, set, be); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index e7eb56b4b89e..0581d5499c5a 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -17,6 +17,9 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +#include <net/netns/generic.h> + +extern unsigned int nf_tables_net_id; /* We target a hash table size of 4, element hint is 75% of final size */ #define NFT_RHASH_ELEMENT_HINT 3 @@ -59,6 +62,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; + if (nft_set_elem_is_dead(&he->ext)) + return 1; if (nft_set_elem_expired(&he->ext)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) @@ -187,7 +192,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, struct nft_rhash_elem *he = elem->priv; nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); } static bool nft_rhash_flush(const struct net *net, @@ -195,12 +199,9 @@ static bool nft_rhash_flush(const struct net *net, { struct nft_rhash_elem *he = priv; - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) { - nft_set_elem_change_active(net, set, &he->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &he->ext); + + return true; } static void *nft_rhash_deactivate(const struct net *net, @@ -217,9 +218,8 @@ static void *nft_rhash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); - if (he != NULL && - !nft_rhash_flush(net, set, he)) - he = NULL; + if (he) + nft_set_elem_change_active(net, set, &he->ext); rcu_read_unlock(); @@ -251,7 +251,9 @@ static bool nft_rhash_delete(const struct nft_set *set, if (he == NULL) return false; - return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; + nft_set_elem_dead(&he->ext); + + return true; } static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, @@ -277,8 +279,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&he->ext)) - goto cont; if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; @@ -297,49 +297,77 @@ cont: static void nft_rhash_gc(struct work_struct *work) { + struct nftables_pernet *nft_net; struct nft_set *set; struct nft_rhash_elem *he; struct nft_rhash *priv; - struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; + struct nft_trans_gc *gc; + struct net *net; + u32 gc_seq; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + nft_net = net_generic(net, nf_tables_net_id); + gc_seq = READ_ONCE(nft_net->gc_seq); + + if (nft_set_gc_is_pending(set)) + goto done; + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - break; - continue; + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + + /* Ruleset has been updated, try later. */ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; } + if (nft_set_elem_is_dead(&he->ext)) + goto dead_elem; + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) { struct nft_expr *expr = nft_set_ext_expr(&he->ext); if (expr->ops->gc && expr->ops->gc(read_pnet(&set->net), expr)) - goto gc; + goto needs_gc_run; } if (!nft_set_elem_expired(&he->ext)) continue; -gc: - if (nft_set_elem_mark_busy(&he->ext)) - continue; - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - break; - rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); +needs_gc_run: + nft_set_elem_dead(&he->ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + nft_trans_gc_elem_add(gc, he); } + +try_later: + /* catchall list iteration requires rcu read side lock. */ rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - nft_set_gc_batch_complete(gcb); + if (gc) + nft_trans_gc_queue_async_done(gc); + +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -374,25 +402,36 @@ static int nft_rhash_init(const struct nft_set *set, return err; INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); - if (set->flags & NFT_SET_TIMEOUT) + if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL)) nft_rhash_gc_init(set); return 0; } +struct nft_rhash_ctx { + const struct nft_ctx ctx; + const struct nft_set *set; +}; + static void nft_rhash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy(arg, ptr, true); + struct nft_rhash_ctx *rhash_ctx = arg; + + nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, ptr); } -static void nft_rhash_destroy(const struct nft_set *set) +static void nft_rhash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_ctx rhash_ctx = { + .ctx = *ctx, + .set = set, + }; cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, - (void *)set); + (void *)&rhash_ctx); } /* Number of buckets is stored in u32, so cap our result to 1U<<31 */ @@ -621,7 +660,8 @@ static int nft_hash_init(const struct nft_set *set, return 0; } -static void nft_hash_destroy(const struct nft_set *set) +static void nft_hash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; @@ -631,7 +671,7 @@ static void nft_hash_destroy(const struct nft_set *set) for (i = 0; i < priv->buckets; i++) { hlist_for_each_entry_safe(he, next, &priv->table[i], node) { hlist_del_rcu(&he->node); - nft_set_elem_destroy(set, he, true); + nf_tables_set_elem_destroy(ctx, set, he); } } } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index ee7c29e0a9d7..2b5f65a424b7 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -14,6 +14,9 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +#include <net/netns/generic.h> + +extern unsigned int nf_tables_net_id; struct nft_rbtree { struct rb_root root; @@ -38,10 +41,18 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) return !nft_rbtree_interval_end(rbe); } -static bool nft_rbtree_equal(const struct nft_set *set, const void *this, - const struct nft_rbtree_elem *interval) +static int nft_rbtree_cmp(const struct nft_set *set, + const struct nft_rbtree_elem *e1, + const struct nft_rbtree_elem *e2) +{ + return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), + set->klen); +} + +static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) { - return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; + return nft_set_elem_expired(&rbe->ext) || + nft_set_elem_is_dead(&rbe->ext); } static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, @@ -52,7 +63,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set const struct nft_rbtree_elem *rbe, *interval = NULL; u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - const void *this; int d; parent = rcu_dereference_raw(priv->root.rb_node); @@ -62,12 +72,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set rbe = rb_entry(parent, struct nft_rbtree_elem, node); - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); if (interval && - nft_rbtree_equal(set, this, interval) && + !nft_rbtree_cmp(set, rbe, interval) && nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; @@ -80,7 +89,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set continue; } - if (nft_set_elem_expired(&rbe->ext)) + if (nft_rbtree_elem_expired(rbe)) return false; if (nft_rbtree_interval_end(rbe)) { @@ -98,7 +107,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && + !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; @@ -214,43 +223,257 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set_elem elem = { + .priv = rbe, + }; + + nft_setelem_data_deactivate(net, set, &elem); + rb_erase(&rbe->node, &priv->root); +} + +static int nft_rbtree_gc_elem(const struct nft_set *__set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set *set = (struct nft_set *)__set; + struct rb_node *prev = rb_prev(&rbe->node); + struct net *net = read_pnet(&set->net); + struct nft_rbtree_elem *rbe_prev; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); + if (!gc) + return -ENOMEM; + + /* search for end interval coming before this element. + * end intervals don't carry a timeout extension, they + * are coupled with the interval start element. + */ + while (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + if (nft_rbtree_interval_end(rbe_prev) && + nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) + break; + + prev = rb_prev(prev); + } + + if (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + nft_rbtree_gc_remove(net, set, priv, rbe_prev); + + /* There is always room in this trans gc for this element, + * memory allocation never actually happens, hence, the warning + * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, + * this is synchronous gc which never fails. + */ + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; + + nft_trans_gc_elem_add(gc, rbe_prev); + } + + nft_rbtree_gc_remove(net, set, priv, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; + + nft_trans_gc_elem_add(gc, rbe); + + nft_trans_gc_queue_sync_done(gc); + + return 0; +} + +static bool nft_rbtree_update_first(const struct nft_set *set, + struct nft_rbtree_elem *rbe, + struct rb_node *first) +{ + struct nft_rbtree_elem *first_elem; + + first_elem = rb_entry(first, struct nft_rbtree_elem, node); + /* this element is closest to where the new element is to be inserted: + * update the first element for the node list path. + */ + if (nft_rbtree_cmp(set, rbe, first_elem) < 0) + return true; + + return false; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) { + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); + u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - struct nft_rbtree_elem *rbe; - struct rb_node *parent, **p; - int d; + int d, err; + /* Descend the tree to search for an existing element greater than the + * key value to insert that is greater than the new element. This is the + * first element to walk the ordered elements to find possible overlap. + */ parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), - nft_set_ext_key(&new->ext), - set->klen); - if (d < 0) + d = nft_rbtree_cmp(set, rbe, new); + + if (d < 0) { p = &parent->rb_left; - else if (d > 0) + } else if (d > 0) { + if (!first || + nft_rbtree_update_first(set, rbe, first)) + first = &rbe->node; + p = &parent->rb_right; - else { - if (nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(new)) { + } else { + if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; - } else if (nft_rbtree_interval_start(rbe) && - nft_rbtree_interval_end(new)) { + else p = &parent->rb_right; - } else if (nft_set_elem_active(&rbe->ext, genmask)) { - *ext = &rbe->ext; - return -EEXIST; - } else { - p = &parent->rb_left; + } + } + + if (!first) + first = rb_first(&priv->root); + + /* Detect overlap by going through the list of valid tree nodes. + * Values stored in the tree are in reversed order, starting from + * highest to lowest value. + */ + for (node = first; node != NULL; node = next) { + next = rb_next(node); + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + + /* perform garbage collection to avoid bogus overlap reports + * but skip new elements in this transaction. + */ + if (nft_set_elem_expired(&rbe->ext) && + nft_set_elem_active(&rbe->ext, cur_genmask)) { + err = nft_rbtree_gc_elem(set, priv, rbe); + if (err < 0) + return err; + + continue; + } + + d = nft_rbtree_cmp(set, rbe, new); + if (d == 0) { + /* Matching end element: no need to look for an + * overlapping greater or equal element. + */ + if (nft_rbtree_interval_end(rbe)) { + rbe_le = rbe; + break; + } + + /* first element that is greater or equal to key value. */ + if (!rbe_ge) { + rbe_ge = rbe; + continue; + } + + /* this is a closer more or equal element, update it. */ + if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { + rbe_ge = rbe; + continue; + } + + /* element is equal to key value, make sure flags are + * the same, an existing more or equal start element + * must not be replaced by more or equal end element. + */ + if ((nft_rbtree_interval_start(new) && + nft_rbtree_interval_start(rbe_ge)) || + (nft_rbtree_interval_end(new) && + nft_rbtree_interval_end(rbe_ge))) { + rbe_ge = rbe; + continue; } + } else if (d > 0) { + /* annotate element greater than the new element. */ + rbe_ge = rbe; + continue; + } else if (d < 0) { + /* annotate element less than the new element. */ + rbe_le = rbe; + break; } } + + /* - new start element matching existing start element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && + nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { + *ext = &rbe_ge->ext; + return -EEXIST; + } + + /* - new end element matching existing end element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && + nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { + *ext = &rbe_le->ext; + return -EEXIST; + } + + /* - new start element with existing closest, less or equal key value + * being a start element: partial overlap, reported as -ENOTEMPTY. + * Anonymous sets allow for two consecutive start element since they + * are constant, skip them to avoid bogus overlap reports. + */ + if (!nft_set_is_anonymous(set) && rbe_le && + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) + return -ENOTEMPTY; + + /* - new end element with existing closest, less or equal key value + * being a end element: partial overlap, reported as -ENOTEMPTY. + */ + if (rbe_le && + nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + + /* - new end element with existing closest, greater or equal key value + * being an end element: partial overlap, reported as -ENOTEMPTY + */ + if (rbe_ge && + nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + + /* Accepted element: pick insertion point depending on key value */ + parent = NULL; + p = &priv->root.rb_node; + while (*p != NULL) { + parent = *p; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + d = nft_rbtree_cmp(set, rbe, new); + + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p = &parent->rb_right; + else if (nft_rbtree_interval_end(rbe)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; @@ -294,7 +517,6 @@ static void nft_rbtree_activate(const struct net *net, struct nft_rbtree_elem *rbe = elem->priv; nft_set_elem_change_active(net, set, &rbe->ext); - nft_set_elem_clear_busy(&rbe->ext); } static bool nft_rbtree_flush(const struct net *net, @@ -302,12 +524,9 @@ static bool nft_rbtree_flush(const struct net *net, { struct nft_rbtree_elem *rbe = priv; - if (!nft_set_elem_mark_busy(&rbe->ext) || - !nft_is_active(net, &rbe->ext)) { - nft_set_elem_change_active(net, set, &rbe->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &rbe->ext); + + return true; } static void *nft_rbtree_deactivate(const struct net *net, @@ -338,6 +557,8 @@ static void *nft_rbtree_deactivate(const struct net *net, nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; + } else if (nft_set_elem_expired(&rbe->ext)) { + break; } else if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = parent->rb_left; continue; @@ -364,8 +585,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&rbe->ext)) - goto cont; if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; @@ -384,58 +603,82 @@ cont: static void nft_rbtree_gc(struct work_struct *work) { - struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; - struct nft_set_gc_batch *gcb = NULL; + struct nft_rbtree_elem *rbe, *rbe_end = NULL; + struct nftables_pernet *nft_net; struct nft_rbtree *priv; + struct nft_trans_gc *gc; struct rb_node *node; struct nft_set *set; + unsigned int gc_seq; + struct net *net; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + nft_net = net_generic(net, nf_tables_net_id); + gc_seq = READ_ONCE(nft_net->gc_seq); - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); + if (nft_set_gc_is_pending(set)) + goto done; + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; + + read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + + /* Ruleset has been updated, try later. */ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (nft_set_elem_is_dead(&rbe->ext)) + goto dead_elem; + + /* elements are reversed in the rbtree for historical reasons, + * from highest to lowest value, that is why end element is + * always visited before the start element. + */ if (nft_rbtree_interval_end(rbe)) { rbe_end = rbe; continue; } + if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) + + nft_set_elem_dead(&rbe->ext); + + if (!rbe_end) continue; - if (rbe_prev) { - rb_erase(&rbe_prev->node, &priv->root); - rbe_prev = NULL; - } - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (!gcb) - break; + nft_set_elem_dead(&rbe_end->ext); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - rbe_prev = rbe; + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - if (rbe_end) { - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_end); - rb_erase(&rbe_end->node, &priv->root); - rbe_end = NULL; - } - node = rb_next(node); - if (!node) - break; + nft_trans_gc_elem_add(gc, rbe_end); + rbe_end = NULL; +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + nft_trans_gc_elem_add(gc, rbe); } - if (rbe_prev) - rb_erase(&rbe_prev->node, &priv->root); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); - nft_set_gc_batch_complete(gcb); +try_later: + read_unlock_bh(&priv->lock); + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -464,7 +707,8 @@ static int nft_rbtree_init(const struct nft_set *set, return 0; } -static void nft_rbtree_destroy(const struct nft_set *set) +static void nft_rbtree_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; @@ -475,7 +719,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_set_elem_destroy(set, rbe, true); + nf_tables_set_elem_destroy(ctx, set, rbe); } } diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 4026ec38526f..c7b78e4ef459 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -10,7 +10,7 @@ struct nft_socket { enum nft_socket_keys key:8; union { - enum nft_registers dreg:8; + u8 dreg; }; }; @@ -119,9 +119,8 @@ static int nft_socket_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_SOCKET_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_socket_dump(struct sk_buff *skb, @@ -140,6 +139,11 @@ static int nft_socket_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 15abb0e49603..2e026f903a4c 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -186,7 +186,6 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx, break; #endif case NFPROTO_INET: - case NFPROTO_BRIDGE: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; @@ -219,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) break; #endif case NFPROTO_INET: - case NFPROTO_BRIDGE: nf_synproxy_ipv4_fini(snet, ctx->net); nf_synproxy_ipv6_fini(snet, ctx->net); break; @@ -253,6 +251,11 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD)); } diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index b97ab1198b03..d9604a316600 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -13,9 +13,9 @@ #endif struct nft_tproxy { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_port:8; - u8 family; + u8 sreg_addr; + u8 sreg_port; + u8 family; }; static void nft_tproxy_eval_v4(const struct nft_expr *expr, @@ -254,15 +254,15 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, } if (tb[NFTA_TPROXY_REG_ADDR]) { - priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, alen); + err = nft_parse_register_load(tb[NFTA_TPROXY_REG_ADDR], + &priv->sreg_addr, alen); if (err < 0) return err; } if (tb[NFTA_TPROXY_REG_PORT]) { - priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]); - err = nft_validate_register_load(priv->sreg_port, sizeof(u16)); + err = nft_parse_register_load(tb[NFTA_TPROXY_REG_PORT], + &priv->sreg_port, sizeof(u16)); if (err < 0) return err; } @@ -289,6 +289,18 @@ static int nft_tproxy_dump(struct sk_buff *skb, return 0; } +static int nft_tproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + + return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING); +} + static struct nft_expr_type nft_tproxy_type; static const struct nft_expr_ops nft_tproxy_ops = { .type = &nft_tproxy_type, @@ -296,6 +308,7 @@ static const struct nft_expr_ops nft_tproxy_ops = { .eval = nft_tproxy_eval, .init = nft_tproxy_init, .dump = nft_tproxy_dump, + .validate = nft_tproxy_validate, }; static struct nft_expr_type nft_tproxy_type __read_mostly = { diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 1effd4878619..b2070f9f98ff 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -14,7 +14,7 @@ struct nft_tunnel { enum nft_tunnel_keys key:8; - enum nft_registers dreg:8; + u8 dreg; enum nft_tunnel_mode mode:8; }; @@ -92,8 +92,6 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]); - if (tb[NFTA_TUNNEL_MODE]) { priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE])); if (priv->mode > NFT_TUNNEL_MODE_MAX) @@ -102,8 +100,8 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, priv->mode = NFT_TUNNEL_MODE_NONE; } - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_TUNNEL_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_tunnel_get_dump(struct sk_buff *skb, @@ -134,6 +132,7 @@ static const struct nft_expr_ops nft_tunnel_get_ops = { static struct nft_expr_type nft_tunnel_type __read_mostly = { .name = "tunnel", + .family = NFPROTO_NETDEV, .ops = &nft_tunnel_get_ops, .policy = nft_tunnel_policy, .maxattr = NFTA_TUNNEL_MAX, diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 06d5cabf1d7c..7f762fc42891 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -24,7 +24,7 @@ static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { struct nft_xfrm { enum nft_xfrm_keys key:8; - enum nft_registers dreg:8; + u8 dreg; u8 dir; u8 spnum; }; @@ -86,9 +86,8 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx, priv->spnum = spnum; - priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } /* Return true if key asks for daddr/saddr and current @@ -234,6 +233,11 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + switch (priv->dir) { case XFRM_POLICY_IN: hooks = (1 << NF_INET_FORWARD) | diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index 353ca7801251..ff66b56a3f97 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -46,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -/* FIXME: Take multiple ranges --RR */ static int redirect_tg4_check(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; @@ -65,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par) static unsigned int redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par)); + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range2 range = { + .flags = mr->range[0].flags, + .min_proto = mr->range[0].min, + .max_proto = mr->range[0].max, + }; + + return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par)); } static struct xt_target redirect_tg_reg[] __read_mostly = { diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index e1990baf3a3b..dc9485854002 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -71,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Passive OS fingerprint matching."); MODULE_ALIAS("ipt_osf"); MODULE_ALIAS("ip6t_osf"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index e85ce69924ae..50332888c8d2 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -76,18 +76,23 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) */ return false; - filp = sk->sk_socket->file; - if (filp == NULL) + read_lock_bh(&sk->sk_callback_lock); + filp = sk->sk_socket ? sk->sk_socket->file : NULL; + if (filp == NULL) { + read_unlock_bh(&sk->sk_callback_lock); return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; + } if (info->match & XT_OWNER_UID) { kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); if ((uid_gte(filp->f_cred->fsuid, uid_min) && uid_lte(filp->f_cred->fsuid, uid_max)) ^ - !(info->invert & XT_OWNER_UID)) + !(info->invert & XT_OWNER_UID)) { + read_unlock_bh(&sk->sk_callback_lock); return false; + } } if (info->match & XT_OWNER_GID) { @@ -112,10 +117,13 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } } - if (match ^ !(info->invert & XT_OWNER_GID)) + if (match ^ !(info->invert & XT_OWNER_GID)) { + read_unlock_bh(&sk->sk_callback_lock); return false; + } } + read_unlock_bh(&sk->sk_callback_lock); return true; } diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 3469b6073610..6fc0deb11aff 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -561,7 +561,7 @@ recent_mt_proc_write(struct file *file, const char __user *input, { struct recent_table *t = PDE_DATA(file_inode(file)); struct recent_entry *e; - char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; + char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")]; const char *c = buf; union nf_inet_addr addr = {}; u_int16_t family; diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 680015ba7cb6..d4bf089c9e3f 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -150,6 +150,8 @@ static int sctp_mt_check(const struct xt_mtchk_param *par) { const struct xt_sctp_info *info = par->matchinfo; + if (info->flag_count > ARRAY_SIZE(info->flag_info)) + return -EINVAL; if (info->flags & ~XT_SCTP_VALID_FLAGS) return -EINVAL; if (info->invflags & ~XT_SCTP_VALID_FLAGS) diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c index 177b40d08098..117d4615d668 100644 --- a/net/netfilter/xt_u32.c +++ b/net/netfilter/xt_u32.c @@ -96,11 +96,32 @@ static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par) return ret ^ data->invert; } +static int u32_mt_checkentry(const struct xt_mtchk_param *par) +{ + const struct xt_u32 *data = par->matchinfo; + const struct xt_u32_test *ct; + unsigned int i; + + if (data->ntests > ARRAY_SIZE(data->tests)) + return -EINVAL; + + for (i = 0; i < data->ntests; ++i) { + ct = &data->tests[i]; + + if (ct->nnums > ARRAY_SIZE(ct->location) || + ct->nvalues > ARRAY_SIZE(ct->value)) + return -EINVAL; + } + + return 0; +} + static struct xt_match xt_u32_mt_reg __read_mostly = { .name = "u32", .revision = 0, .family = NFPROTO_UNSPEC, .match = u32_mt, + .checkentry = u32_mt_checkentry, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, }; diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 249da67d50a2..1bb2a7404dc4 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -54,6 +54,28 @@ static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = { [NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 }, }; +static const struct netlbl_calipso_ops *calipso_ops; + +/** + * netlbl_calipso_ops_register - Register the CALIPSO operations + * @ops: ops to register + * + * Description: + * Register the CALIPSO packet engine operations. + * + */ +const struct netlbl_calipso_ops * +netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops) +{ + return xchg(&calipso_ops, ops); +} +EXPORT_SYMBOL(netlbl_calipso_ops_register); + +static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) +{ + return READ_ONCE(calipso_ops); +} + /* NetLabel Command Handlers */ /** @@ -96,16 +118,19 @@ static int netlbl_calipso_add_pass(struct genl_info *info, * */ static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) - { int ret_val = -EINVAL; struct netlbl_audit audit_info; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (!info->attrs[NLBL_CALIPSO_A_DOI] || !info->attrs[NLBL_CALIPSO_A_MTYPE]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + if (!ops) + return -EOPNOTSUPP; + + netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { case CALIPSO_MAP_PASS: ret_val = netlbl_calipso_add_pass(info, &audit_info); @@ -287,7 +312,7 @@ static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_CALIPSO_A_DOI]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, @@ -362,27 +387,6 @@ int __init netlbl_calipso_genl_init(void) return genl_register_family(&netlbl_calipso_gnl_family); } -static const struct netlbl_calipso_ops *calipso_ops; - -/** - * netlbl_calipso_ops_register - Register the CALIPSO operations - * - * Description: - * Register the CALIPSO packet engine operations. - * - */ -const struct netlbl_calipso_ops * -netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops) -{ - return xchg(&calipso_ops, ops); -} -EXPORT_SYMBOL(netlbl_calipso_ops_register); - -static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) -{ - return READ_ONCE(calipso_ops); -} - /** * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine * @doi_def: the DOI structure diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 1778e4e8ce24..4197a9bcaa96 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -410,7 +410,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) !info->attrs[NLBL_CIPSOV4_A_MTYPE]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE])) { case CIPSO_V4_MAP_TRANS: ret_val = netlbl_cipsov4_add_std(info, &audit_info); @@ -709,7 +709,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_CIPSOV4_A_DOI]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 91b35b7c80d8..96059c99b915 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -857,7 +857,8 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, offset -= iter->startbit; idx = offset / NETLBL_CATMAP_MAPSIZE; - iter->bitmap[idx] |= bitmap << (offset % NETLBL_CATMAP_MAPSIZE); + iter->bitmap[idx] |= (NETLBL_CATMAP_MAPTYPE)bitmap + << (offset % NETLBL_CATMAP_MAPSIZE); return 0; } diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index a92ed37d0922..e2801210467f 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -435,7 +435,7 @@ static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info) (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } @@ -458,7 +458,7 @@ static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_MGMT_A_DOMAIN]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]); return netlbl_domhsh_remove(domain, AF_UNSPEC, &audit_info); @@ -558,7 +558,7 @@ static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info) (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } @@ -577,7 +577,7 @@ static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info) { struct netlbl_audit audit_info; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_domhsh_remove_default(AF_UNSPEC, &audit_info); } diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 7b62cdea6163..f4d9a5c796f8 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -813,7 +813,7 @@ static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); if (value == 1 || value == 0) { - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); netlbl_unlabel_acceptflg_set(value, &audit_info); return 0; } @@ -896,7 +896,7 @@ static int netlbl_unlabel_staticadd(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -946,7 +946,7 @@ static int netlbl_unlabel_staticadddef(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -993,7 +993,7 @@ static int netlbl_unlabel_staticremove(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -1033,7 +1033,7 @@ static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h index 3c67afce64f1..32d8f92c9a20 100644 --- a/net/netlabel/netlabel_user.h +++ b/net/netlabel/netlabel_user.h @@ -28,11 +28,9 @@ /** * netlbl_netlink_auditinfo - Fetch the audit information from a NETLINK msg - * @skb: the packet * @audit_info: NetLabel audit information */ -static inline void netlbl_netlink_auditinfo(struct sk_buff *skb, - struct netlbl_audit *audit_info) +static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info) { security_task_getsecid(current, &audit_info->secid); audit_info->loginuid = audit_get_loginuid(current); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 86b70385dce3..3808b12da7f6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -156,7 +156,7 @@ static inline u32 netlink_group_mask(u32 group) static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, gfp_t gfp_mask) { - unsigned int len = skb_end_offset(skb); + unsigned int len = skb->len; struct sk_buff *new; new = alloc_skb(len, gfp_mask); @@ -365,7 +365,7 @@ static void netlink_skb_destructor(struct sk_buff *skb) if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) - vfree(skb->head); + vfree_atomic(skb->head); skb->head = NULL; } @@ -569,7 +569,9 @@ static int netlink_insert(struct sock *sk, u32 portid) if (nlk_sk(sk)->bound) goto err; - nlk_sk(sk)->portid = portid; + /* portid can be read locklessly from netlink_getname(). */ + WRITE_ONCE(nlk_sk(sk)->portid, portid); + sock_hold(sk); err = __netlink_insert(table, sk); @@ -1018,7 +1020,6 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; } - netlink_lock_table(); if (nlk->netlink_bind && groups) { int group; @@ -1030,13 +1031,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (!err) continue; netlink_undo_bind(group, groups, sk); - goto unlock; + return err; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ + netlink_lock_table(); if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : @@ -1078,9 +1080,11 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, return -EINVAL; if (addr->sa_family == AF_UNSPEC) { - sk->sk_state = NETLINK_UNCONNECTED; - nlk->dst_portid = 0; - nlk->dst_group = 0; + /* paired with READ_ONCE() in netlink_getsockbyportid() */ + WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); + /* dst_portid and dst_group can be read locklessly */ + WRITE_ONCE(nlk->dst_portid, 0); + WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) @@ -1101,9 +1105,11 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, err = netlink_autobind(sock); if (err == 0) { - sk->sk_state = NETLINK_CONNECTED; - nlk->dst_portid = nladdr->nl_pid; - nlk->dst_group = ffs(nladdr->nl_groups); + /* paired with READ_ONCE() in netlink_getsockbyportid() */ + WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); + /* dst_portid and dst_group can be read locklessly */ + WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); + WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; @@ -1120,10 +1126,12 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_pad = 0; if (peer) { - nladdr->nl_pid = nlk->dst_portid; - nladdr->nl_groups = netlink_group_mask(nlk->dst_group); + /* Paired with WRITE_ONCE() in netlink_connect() */ + nladdr->nl_pid = READ_ONCE(nlk->dst_portid); + nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { - nladdr->nl_pid = nlk->portid; + /* Paired with WRITE_ONCE() in netlink_insert() */ + nladdr->nl_pid = READ_ONCE(nlk->portid); netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); @@ -1150,8 +1158,9 @@ static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); - if (sock->sk_state == NETLINK_CONNECTED && - nlk->dst_portid != nlk_sk(ssk)->portid) { + /* dst_portid and sk_state can be changed in netlink_connect() */ + if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && + READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } @@ -1592,6 +1601,7 @@ out: int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; + unsigned long flags; struct sock *sk; int ret = 0; @@ -1601,12 +1611,12 @@ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) /* sk->sk_err wants a positive error value */ info.code = -code; - read_lock(&nl_table_lock); + read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); - read_unlock(&nl_table_lock); + read_unlock_irqrestore(&nl_table_lock, flags); return ret; } EXPORT_SYMBOL(netlink_set_err); @@ -1734,7 +1744,8 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - int len, val, err; + unsigned int flag; + int len, val; if (level != SOL_NETLINK) return -ENOPROTOOPT; @@ -1746,39 +1757,17 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETLINK_PKTINFO: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_RECV_PKTINFO; break; case NETLINK_BROADCAST_ERROR: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_RECV_NO_ENOBUFS; break; case NETLINK_LIST_MEMBERSHIPS: { - int pos, idx, shift; + int pos, idx, shift, err = 0; - err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) @@ -1792,43 +1781,35 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, break; } } - if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) + if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); - break; + return err; } case NETLINK_CAP_ACK: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_EXT_ACK ? 1 : 0; - if (put_user(len, optlen) || put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_STRICT_CHK ? 1 : 0; - if (put_user(len, optlen) || put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_STRICT_CHK; break; default: - err = -ENOPROTOOPT; + return -ENOPROTOOPT; } - return err; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = nlk->flags & flag ? 1 : 0; + + if (put_user(len, optlen) || + copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) @@ -1887,8 +1868,9 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { - dst_portid = nlk->dst_portid; - dst_group = nlk->dst_group; + /* Paired with WRITE_ONCE() in netlink_connect() */ + dst_portid = READ_ONCE(nlk->dst_portid); + dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ @@ -2010,7 +1992,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, skb_free_datagram(sk, skb); - if (nlk->cb_running && + if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { @@ -2303,7 +2285,7 @@ static int netlink_dump(struct sock *sk) if (cb->done) cb->done(cb); - nlk->cb_running = false; + WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; mutex_unlock(nlk->cb_mutex); @@ -2366,7 +2348,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, goto error_put; } - nlk->cb_running = true; + WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; mutex_unlock(nlk->cb_mutex); @@ -2655,7 +2637,7 @@ static int netlink_seq_show(struct seq_file *seq, void *v) nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), - nlk->cb_running, + READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) diff --git a/net/netlink/diag.c b/net/netlink/diag.c index c6255eac305c..e4f21b1067bc 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -94,6 +94,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; struct netlink_sock *nlsk; + unsigned long flags; struct sock *sk; int num = 2; int ret = 0; @@ -152,7 +153,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, num++; mc_list: - read_lock(&nl_table_lock); + read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &tbl->mc_list) { if (sk_hashed(sk)) continue; @@ -167,13 +168,13 @@ mc_list: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - sock_i_ino(sk)) < 0) { + __sock_i_ino(sk)) < 0) { ret = 1; break; } num++; } - read_unlock(&nl_table_lock); + read_unlock_irqrestore(&nl_table_lock, flags); done: cb->args[0] = num; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 102b8d6b5612..a03e16e06e29 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -989,11 +989,46 @@ static struct genl_family genl_ctrl __ro_after_init = { .netnsok = true, }; +static int genl_bind(struct net *net, int group) +{ + const struct genl_family *family; + unsigned int id; + int ret = 0; + + genl_lock_all(); + + idr_for_each_entry(&genl_fam_idr, family, id) { + const struct genl_multicast_group *grp; + int i; + + if (family->n_mcgrps == 0) + continue; + + i = group - family->mcgrp_offset; + if (i < 0 || i >= family->n_mcgrps) + continue; + + grp = &family->mcgrps[i]; + if ((grp->flags & GENL_UNS_ADMIN_PERM) && + !ns_capable(net->user_ns, CAP_NET_ADMIN)) + ret = -EPERM; + if (grp->cap_sys_admin && + !ns_capable(net->user_ns, CAP_SYS_ADMIN)) + ret = -EPERM; + + break; + } + + genl_unlock_all(); + return ret; +} + static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, + .bind = genl_bind, }; /* we'll bump the group number right afterwards */ diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 58d5373c513c..abb69c149644 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -378,6 +378,11 @@ static int nr_listen(struct socket *sock, int backlog) struct sock *sk = sock->sk; lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; @@ -426,16 +431,16 @@ static int nr_create(struct net *net, struct socket *sock, int protocol, nr_init_timers(sk); nr->t1 = - msecs_to_jiffies(sysctl_netrom_transport_timeout); + msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_timeout)); nr->t2 = - msecs_to_jiffies(sysctl_netrom_transport_acknowledge_delay); + msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_acknowledge_delay)); nr->n2 = - msecs_to_jiffies(sysctl_netrom_transport_maximum_tries); + msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_maximum_tries)); nr->t4 = - msecs_to_jiffies(sysctl_netrom_transport_busy_delay); + msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_busy_delay)); nr->idle = - msecs_to_jiffies(sysctl_netrom_transport_no_activity_timeout); - nr->window = sysctl_netrom_transport_requested_window_size; + msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_no_activity_timeout)); + nr->window = READ_ONCE(sysctl_netrom_transport_requested_window_size); nr->bpqext = 1; nr->state = NR_STATE_0; @@ -633,6 +638,11 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, goto out_release; } + if (sock->state == SS_CONNECTING) { + err = -EALREADY; + goto out_release; + } + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; @@ -922,7 +932,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) * G8PZT's Xrouter which is sending packets with command type 7 * as an extension of the protocol. */ - if (sysctl_netrom_reset_circuit && + if (READ_ONCE(sysctl_netrom_reset_circuit) && (frametype != NR_RESET || flags != 0)) nr_transmit_reset(skb, 1); diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 29e418c8c6c3..4caee8754b79 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -81,7 +81,7 @@ static int nr_header(struct sk_buff *skb, struct net_device *dev, buff[6] |= AX25_SSSID_SPARE; buff += AX25_ADDR_LEN; - *buff++ = sysctl_netrom_network_ttl_initialiser; + *buff++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); *buff++ = NR_PROTO_IP; *buff++ = NR_PROTO_IP; diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 2bef3779f893..8cbb57678a9e 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -97,7 +97,7 @@ static int nr_state1_machine(struct sock *sk, struct sk_buff *skb, break; case NR_RESET: - if (sysctl_netrom_reset_circuit) + if (READ_ONCE(sysctl_netrom_reset_circuit)) nr_disconnect(sk, ECONNRESET); break; @@ -128,7 +128,7 @@ static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, break; case NR_RESET: - if (sysctl_netrom_reset_circuit) + if (READ_ONCE(sysctl_netrom_reset_circuit)) nr_disconnect(sk, ECONNRESET); break; @@ -263,7 +263,7 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype break; case NR_RESET: - if (sysctl_netrom_reset_circuit) + if (READ_ONCE(sysctl_netrom_reset_circuit)) nr_disconnect(sk, ECONNRESET); break; diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c index 44929657f5b7..5e531394a724 100644 --- a/net/netrom/nr_out.c +++ b/net/netrom/nr_out.c @@ -204,7 +204,7 @@ void nr_transmit_buffer(struct sock *sk, struct sk_buff *skb) dptr[6] |= AX25_SSSID_SPARE; dptr += AX25_ADDR_LEN; - *dptr++ = sysctl_netrom_network_ttl_initialiser; + *dptr++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); if (!nr_route_frame(skb, NULL)) { kfree_skb(skb); diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 89cd9de21594..37cfa880c2d0 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -153,7 +153,7 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, nr_neigh->digipeat = NULL; nr_neigh->ax25 = NULL; nr_neigh->dev = dev; - nr_neigh->quality = sysctl_netrom_default_path_quality; + nr_neigh->quality = READ_ONCE(sysctl_netrom_default_path_quality); nr_neigh->locked = 0; nr_neigh->count = 0; nr_neigh->number = nr_neigh_no++; @@ -725,7 +725,7 @@ void nr_link_failed(ax25_cb *ax25, int reason) nr_neigh->ax25 = NULL; ax25_cb_put(ax25); - if (++nr_neigh->failed < sysctl_netrom_link_fails_count) { + if (++nr_neigh->failed < READ_ONCE(sysctl_netrom_link_fails_count)) { nr_neigh_put(nr_neigh); return; } @@ -763,7 +763,7 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) if (ax25 != NULL) { ret = nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat, ax25->ax25_dev->dev, 0, - sysctl_netrom_obsolescence_count_initialiser); + READ_ONCE(sysctl_netrom_obsolescence_count_initialiser)); if (ret) return ret; } @@ -777,7 +777,7 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) return ret; } - if (!sysctl_netrom_routing_control && ax25 != NULL) + if (!READ_ONCE(sysctl_netrom_routing_control) && ax25 != NULL) return 0; /* Its Time-To-Live has expired */ diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 3f99b432ea70..c3bbd5880850 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -123,7 +123,7 @@ void nr_write_internal(struct sock *sk, int frametype) unsigned char *dptr; int len, timeout; - len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; + len = NR_TRANSPORT_LEN; switch (frametype & 0x0F) { case NR_CONNREQ: @@ -141,7 +141,8 @@ void nr_write_internal(struct sock *sk, int frametype) return; } - if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + skb = alloc_skb(NR_NETWORK_LEN + len, GFP_ATOMIC); + if (!skb) return; /* @@ -149,7 +150,7 @@ void nr_write_internal(struct sock *sk, int frametype) */ skb_reserve(skb, NR_NETWORK_LEN); - dptr = skb_put(skb, skb_tailroom(skb)); + dptr = skb_put(skb, len); switch (frametype & 0x0F) { case NR_CONNREQ: @@ -181,7 +182,8 @@ void nr_write_internal(struct sock *sk, int frametype) *dptr++ = nr->my_id; *dptr++ = frametype; *dptr++ = nr->window; - if (nr->bpqext) *dptr++ = sysctl_netrom_network_ttl_initialiser; + if (nr->bpqext) + *dptr++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); break; case NR_DISCREQ: @@ -235,7 +237,7 @@ void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags) dptr[6] |= AX25_SSSID_SPARE; dptr += AX25_ADDR_LEN; - *dptr++ = sysctl_netrom_network_ttl_initialiser; + *dptr++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); if (mine) { *dptr++ = 0; diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index a8da88db7893..4e7c968cde2d 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -121,6 +121,7 @@ static void nr_heartbeat_expiry(struct timer_list *t) is accepted() it isn't 'dead' so doesn't get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { + sock_hold(sk); bh_unlock_sock(sk); nr_destroy_socket(sk); goto out; diff --git a/net/nfc/core.c b/net/nfc/core.c index 2d4729d1f0eb..fef112fb4993 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -634,7 +634,7 @@ error: return rc; } -int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gb, u8 gb_len) +int nfc_set_remote_general_bytes(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len); @@ -663,7 +663,7 @@ int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb) EXPORT_SYMBOL(nfc_tm_data_received); int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode, - u8 *gb, size_t gb_len) + const u8 *gb, size_t gb_len) { int rc; diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 0eb4ddc056e7..02909e3e91ef 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -123,7 +123,7 @@ static bool llc_shdlc_x_lteq_y_lt_z(int x, int y, int z) return ((y >= x) || (y < z)) ? true : false; } -static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc, +static struct sk_buff *llc_shdlc_alloc_skb(const struct llc_shdlc *shdlc, int payload_len) { struct sk_buff *skb; @@ -137,7 +137,7 @@ static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc, } /* immediately sends an S frame. */ -static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc, +static int llc_shdlc_send_s_frame(const struct llc_shdlc *shdlc, enum sframe_type sframe_type, int nr) { int r; @@ -159,7 +159,7 @@ static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc, } /* immediately sends an U frame. skb may contain optional payload */ -static int llc_shdlc_send_u_frame(struct llc_shdlc *shdlc, +static int llc_shdlc_send_u_frame(const struct llc_shdlc *shdlc, struct sk_buff *skb, enum uframe_modifier uframe_modifier) { @@ -361,7 +361,7 @@ static void llc_shdlc_connect_complete(struct llc_shdlc *shdlc, int r) wake_up(shdlc->connect_wq); } -static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc) +static int llc_shdlc_connect_initiate(const struct llc_shdlc *shdlc) { struct sk_buff *skb; @@ -377,7 +377,7 @@ static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc) return llc_shdlc_send_u_frame(shdlc, skb, U_FRAME_RSET); } -static int llc_shdlc_connect_send_ua(struct llc_shdlc *shdlc) +static int llc_shdlc_connect_send_ua(const struct llc_shdlc *shdlc) { struct sk_buff *skb; diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index 97853c9cefc7..a81893bc06ce 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -202,7 +202,6 @@ void nfc_llcp_sock_link(struct llcp_sock_list *l, struct sock *s); void nfc_llcp_sock_unlink(struct llcp_sock_list *l, struct sock *s); void nfc_llcp_socket_remote_param_init(struct nfc_llcp_sock *sock); struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev); -struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local); int nfc_llcp_local_put(struct nfc_llcp_local *local); u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local, struct nfc_llcp_sock *sock); @@ -221,15 +220,15 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *sk, struct socket *newsock); /* TLV API */ int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, - u8 *tlv_array, u16 tlv_array_len); + const u8 *tlv_array, u16 tlv_array_len); int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, - u8 *tlv_array, u16 tlv_array_len); + const u8 *tlv_array, u16 tlv_array_len); /* Commands API */ void nfc_llcp_recv(void *data, struct sk_buff *skb, int err); -u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length); +u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length); struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap); -struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len); void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp); void nfc_llcp_free_sdp_tlv_list(struct hlist_head *sdp_head); diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 475061c79c44..5b8754ae7d3a 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -15,7 +15,7 @@ #include "nfc.h" #include "llcp.h" -static u8 llcp_tlv_length[LLCP_TLV_MAX] = { +static const u8 llcp_tlv_length[LLCP_TLV_MAX] = { 0, 1, /* VERSION */ 2, /* MIUX */ @@ -29,7 +29,7 @@ static u8 llcp_tlv_length[LLCP_TLV_MAX] = { }; -static u8 llcp_tlv8(u8 *tlv, u8 type) +static u8 llcp_tlv8(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; @@ -37,7 +37,7 @@ static u8 llcp_tlv8(u8 *tlv, u8 type) return tlv[2]; } -static u16 llcp_tlv16(u8 *tlv, u8 type) +static u16 llcp_tlv16(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; @@ -46,37 +46,37 @@ static u16 llcp_tlv16(u8 *tlv, u8 type) } -static u8 llcp_tlv_version(u8 *tlv) +static u8 llcp_tlv_version(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_VERSION); } -static u16 llcp_tlv_miux(u8 *tlv) +static u16 llcp_tlv_miux(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_MIUX) & 0x7ff; } -static u16 llcp_tlv_wks(u8 *tlv) +static u16 llcp_tlv_wks(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_WKS); } -static u16 llcp_tlv_lto(u8 *tlv) +static u16 llcp_tlv_lto(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_LTO); } -static u8 llcp_tlv_opt(u8 *tlv) +static u8 llcp_tlv_opt(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_OPT); } -static u8 llcp_tlv_rw(u8 *tlv) +static u8 llcp_tlv_rw(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_RW) & 0xf; } -u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length) +u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length) { u8 *tlv, length; @@ -130,7 +130,7 @@ struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap) return sdres; } -struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len) { struct nfc_llcp_sdp_tlv *sdreq; @@ -190,9 +190,10 @@ void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head) } int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, - u8 *tlv_array, u16 tlv_array_len) + const u8 *tlv_array, u16 tlv_array_len) { - u8 *tlv = tlv_array, type, length, offset = 0; + const u8 *tlv = tlv_array; + u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); @@ -239,9 +240,10 @@ int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, } int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, - u8 *tlv_array, u16 tlv_array_len) + const u8 *tlv_array, u16 tlv_array_len) { - u8 *tlv = tlv_array, type, length, offset = 0; + const u8 *tlv = tlv_array; + u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); @@ -295,7 +297,7 @@ static struct sk_buff *llcp_add_header(struct sk_buff *pdu, return pdu; } -static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, u8 *tlv, +static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, const u8 *tlv, u8 tlv_length) { /* XXX Add an skb length check */ @@ -359,6 +361,7 @@ int nfc_llcp_send_symm(struct nfc_dev *dev) struct sk_buff *skb; struct nfc_llcp_local *local; u16 size = 0; + int err; pr_debug("Sending SYMM\n"); @@ -370,8 +373,10 @@ int nfc_llcp_send_symm(struct nfc_dev *dev) size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; skb = alloc_skb(size, GFP_KERNEL); - if (skb == NULL) - return -ENOMEM; + if (skb == NULL) { + err = -ENOMEM; + goto out; + } skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); @@ -381,17 +386,22 @@ int nfc_llcp_send_symm(struct nfc_dev *dev) nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_TX); - return nfc_data_exchange(dev, local->target_idx, skb, + err = nfc_data_exchange(dev, local->target_idx, skb, nfc_llcp_recv, local); +out: + nfc_llcp_local_put(local); + return err; } int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; - u8 *service_name_tlv = NULL, service_name_tlv_length; - u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length, rw; + const u8 *service_name_tlv = NULL; + const u8 *miux_tlv = NULL; + const u8 *rw_tlv = NULL; + u8 service_name_tlv_length = 0; + u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; @@ -465,8 +475,9 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; - u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length, rw; + const u8 *miux_tlv = NULL; + const u8 *rw_tlv = NULL; + u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index cc997518f79d..da3cb0d29b97 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -17,6 +17,8 @@ static u8 llcp_magic[3] = {0x46, 0x66, 0x6d}; static LIST_HEAD(llcp_devices); +/* Protects llcp_devices list */ +static DEFINE_SPINLOCK(llcp_devices_lock); static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb); @@ -143,8 +145,15 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device, write_unlock(&local->raw_sockets.lock); } -struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) +static struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) { + /* Since using nfc_llcp_local may result in usage of nfc_dev, whenever + * we hold a reference to local, we also need to hold a reference to + * the device to avoid UAF. + */ + if (!nfc_get_device(local->dev->idx)) + return NULL; + kref_get(&local->ref); return local; @@ -159,6 +168,7 @@ static void local_cleanup(struct nfc_llcp_local *local) cancel_work_sync(&local->rx_work); cancel_work_sync(&local->timeout_work); kfree_skb(local->rx_pending); + local->rx_pending = NULL; del_timer_sync(&local->sdreq_timer); cancel_work_sync(&local->sdreq_timeout_work); nfc_llcp_free_sdp_tlv_list(&local->pending_sdreqs); @@ -170,17 +180,24 @@ static void local_release(struct kref *ref) local = container_of(ref, struct nfc_llcp_local, ref); - list_del(&local->list); local_cleanup(local); kfree(local); } int nfc_llcp_local_put(struct nfc_llcp_local *local) { + struct nfc_dev *dev; + int ret; + if (local == NULL) return 0; - return kref_put(&local->ref, local_release); + dev = local->dev; + + ret = kref_put(&local->ref, local_release); + nfc_put_device(dev); + + return ret; } static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, @@ -203,17 +220,13 @@ static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, if (tmp_sock->ssap == ssap && tmp_sock->dsap == dsap) { llcp_sock = tmp_sock; + sock_hold(&llcp_sock->sk); break; } } read_unlock(&local->sockets.lock); - if (llcp_sock == NULL) - return NULL; - - sock_hold(&llcp_sock->sk); - return llcp_sock; } @@ -283,12 +296,33 @@ static void nfc_llcp_sdreq_timer(struct timer_list *t) struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev) { struct nfc_llcp_local *local; + struct nfc_llcp_local *res = NULL; + spin_lock(&llcp_devices_lock); list_for_each_entry(local, &llcp_devices, list) - if (local->dev == dev) + if (local->dev == dev) { + res = nfc_llcp_local_get(local); + break; + } + spin_unlock(&llcp_devices_lock); + + return res; +} + +static struct nfc_llcp_local *nfc_llcp_remove_local(struct nfc_dev *dev) +{ + struct nfc_llcp_local *local, *tmp; + + spin_lock(&llcp_devices_lock); + list_for_each_entry_safe(local, tmp, &llcp_devices, list) + if (local->dev == dev) { + list_del(&local->list); + spin_unlock(&llcp_devices_lock); return local; + } + spin_unlock(&llcp_devices_lock); - pr_debug("No device found\n"); + pr_warn("Shutting down device not found\n"); return NULL; } @@ -301,7 +335,7 @@ static char *wks[] = { "urn:nfc:sn:snep", }; -static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len) +static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len) { int sap, num_wks; @@ -325,7 +359,8 @@ static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len) static struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, - u8 *sn, size_t sn_len) + const u8 *sn, size_t sn_len, + bool needref) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; @@ -361,6 +396,8 @@ struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, if (memcmp(sn, tmp_sock->service_name, sn_len) == 0) { llcp_sock = tmp_sock; + if (needref) + sock_hold(&llcp_sock->sk); break; } } @@ -402,7 +439,8 @@ u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local, * to this service name. */ if (nfc_llcp_sock_from_sn(local, sock->service_name, - sock->service_name_len) != NULL) { + sock->service_name_len, + false) != NULL) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; @@ -522,7 +560,7 @@ static int nfc_llcp_build_gb(struct nfc_llcp_local *local) { u8 *gb_cur, version, version_length; u8 lto_length, wks_length, miux_length; - u8 *version_tlv = NULL, *lto_tlv = NULL, + const u8 *version_tlv = NULL, *lto_tlv = NULL, *wks_tlv = NULL, *miux_tlv = NULL; __be16 wks = cpu_to_be16(local->local_wks); u8 gb_len = 0; @@ -609,12 +647,15 @@ u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len) *general_bytes_len = local->gb_len; + nfc_llcp_local_put(local); + return local->gb; } -int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) +int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { struct nfc_llcp_local *local; + int err; if (gb_len < 3 || gb_len > NFC_MAX_GT_LEN) return -EINVAL; @@ -631,35 +672,39 @@ int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) if (memcmp(local->remote_gb, llcp_magic, 3)) { pr_err("MAC does not support LLCP\n"); - return -EINVAL; + err = -EINVAL; + goto out; } - return nfc_llcp_parse_gb_tlv(local, + err = nfc_llcp_parse_gb_tlv(local, &local->remote_gb[3], local->remote_gb_len - 3); +out: + nfc_llcp_local_put(local); + return err; } -static u8 nfc_llcp_dsap(struct sk_buff *pdu) +static u8 nfc_llcp_dsap(const struct sk_buff *pdu) { return (pdu->data[0] & 0xfc) >> 2; } -static u8 nfc_llcp_ptype(struct sk_buff *pdu) +static u8 nfc_llcp_ptype(const struct sk_buff *pdu) { return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6); } -static u8 nfc_llcp_ssap(struct sk_buff *pdu) +static u8 nfc_llcp_ssap(const struct sk_buff *pdu) { return pdu->data[1] & 0x3f; } -static u8 nfc_llcp_ns(struct sk_buff *pdu) +static u8 nfc_llcp_ns(const struct sk_buff *pdu) { return pdu->data[2] >> 4; } -static u8 nfc_llcp_nr(struct sk_buff *pdu) +static u8 nfc_llcp_nr(const struct sk_buff *pdu) { return pdu->data[2] & 0xf; } @@ -801,23 +846,15 @@ out: } static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, - u8 *sn, size_t sn_len) + const u8 *sn, size_t sn_len) { - struct nfc_llcp_sock *llcp_sock; - - llcp_sock = nfc_llcp_sock_from_sn(local, sn, sn_len); - - if (llcp_sock == NULL) - return NULL; - - sock_hold(&llcp_sock->sk); - - return llcp_sock; + return nfc_llcp_sock_from_sn(local, sn, sn_len, true); } -static u8 *nfc_llcp_connect_sn(struct sk_buff *skb, size_t *sn_len) +static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len) { - u8 *tlv = &skb->data[2], type, length; + u8 type, length; + const u8 *tlv = &skb->data[2]; size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0; while (offset < tlv_array_len) { @@ -875,7 +912,7 @@ static void nfc_llcp_recv_ui(struct nfc_llcp_local *local, } static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct sock *new_sk, *parent; struct nfc_llcp_sock *sock, *new_sock; @@ -893,7 +930,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, goto fail; } } else { - u8 *sn; + const u8 *sn; size_t sn_len; sn = nfc_llcp_connect_sn(skb, &sn_len); @@ -946,8 +983,17 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, } new_sock = nfc_llcp_sock(new_sk); - new_sock->dev = local->dev; + new_sock->local = nfc_llcp_local_get(local); + if (!new_sock->local) { + reason = LLCP_DM_REJ; + sock_put(&new_sock->sk); + release_sock(&sock->sk); + sock_put(&sock->sk); + goto fail; + } + + new_sock->dev = local->dev; new_sock->rw = sock->rw; new_sock->miux = sock->miux; new_sock->nfc_protocol = sock->nfc_protocol; @@ -1112,7 +1158,7 @@ static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local, } static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1155,7 +1201,8 @@ static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, nfc_llcp_sock_put(llcp_sock); } -static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb) +static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1188,7 +1235,8 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb) nfc_llcp_sock_put(llcp_sock); } -static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb) +static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1226,12 +1274,13 @@ static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb) } static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; - u8 dsap, ssap, *tlv, type, length, tid, sap; + u8 dsap, ssap, type, length, tid, sap; + const u8 *tlv; u16 tlv_len, offset; - char *service_name; + const char *service_name; size_t service_name_len; struct nfc_llcp_sdp_tlv *sdp; HLIST_HEAD(llc_sdres_list); @@ -1273,7 +1322,8 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, } llcp_sock = nfc_llcp_sock_from_sn(local, service_name, - service_name_len); + service_name_len, + true); if (!llcp_sock) { sap = 0; goto add_snl; @@ -1293,6 +1343,7 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, if (sap == LLCP_SAP_MAX) { sap = 0; + nfc_llcp_sock_put(llcp_sock); goto add_snl; } @@ -1310,6 +1361,7 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, pr_debug("%p %d\n", llcp_sock, sap); + nfc_llcp_sock_put(llcp_sock); add_snl: sdp = nfc_llcp_build_sdres_tlv(tid, sap); if (sdp == NULL) @@ -1522,6 +1574,8 @@ int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb) __nfc_llcp_recv(local, skb); + nfc_llcp_local_put(local); + return 0; } @@ -1538,6 +1592,8 @@ void nfc_llcp_mac_is_down(struct nfc_dev *dev) /* Close and purge all existing sockets */ nfc_llcp_socket_release(local, true, 0); + + nfc_llcp_local_put(local); } void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, @@ -1563,6 +1619,8 @@ void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, mod_timer(&local->link_timer, jiffies + msecs_to_jiffies(local->remote_lto)); } + + nfc_llcp_local_put(local); } int nfc_llcp_register_device(struct nfc_dev *ndev) @@ -1573,7 +1631,16 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) if (local == NULL) return -ENOMEM; - local->dev = ndev; + /* As we are going to initialize local's refcount, we need to get the + * nfc_dev to avoid UAF, otherwise there is no point in continuing. + * See nfc_llcp_local_get(). + */ + local->dev = nfc_get_device(ndev->idx); + if (!local->dev) { + kfree(local); + return -ENODEV; + } + INIT_LIST_HEAD(&local->list); kref_init(&local->ref); mutex_init(&local->sdp_lock); @@ -1606,14 +1673,16 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) timer_setup(&local->sdreq_timer, nfc_llcp_sdreq_timer, 0); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); + spin_lock(&llcp_devices_lock); list_add(&local->list, &llcp_devices); + spin_unlock(&llcp_devices_lock); return 0; } void nfc_llcp_unregister_device(struct nfc_dev *dev) { - struct nfc_llcp_local *local = nfc_llcp_find_local(dev); + struct nfc_llcp_local *local = nfc_llcp_remove_local(dev); if (local == NULL) { pr_debug("No such device\n"); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index bd2174699af9..aea337d81702 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -99,7 +99,7 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) } llcp_sock->dev = dev; - llcp_sock->local = nfc_llcp_local_get(local); + llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; llcp_sock->service_name_len = min_t(unsigned int, llcp_addr.service_name_len, @@ -181,7 +181,7 @@ static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr, } llcp_sock->dev = dev; - llcp_sock->local = nfc_llcp_local_get(local); + llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; nfc_llcp_sock_link(&local->raw_sockets, sk); @@ -698,24 +698,22 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, if (dev->dep_link_up == false) { ret = -ENOLINK; device_unlock(&dev->dev); - goto put_dev; + goto sock_llcp_put_local; } device_unlock(&dev->dev); if (local->rf_mode == NFC_RF_INITIATOR && addr->target_idx != local->target_idx) { ret = -ENOLINK; - goto put_dev; + goto sock_llcp_put_local; } llcp_sock->dev = dev; - llcp_sock->local = nfc_llcp_local_get(local); + llcp_sock->local = local; llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { - nfc_llcp_local_put(llcp_sock->local); - llcp_sock->local = NULL; ret = -ENOMEM; - goto put_dev; + goto sock_llcp_nullify; } llcp_sock->reserved_ssap = llcp_sock->ssap; @@ -760,8 +758,13 @@ sock_unlink: sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); - nfc_llcp_local_put(llcp_sock->local); + +sock_llcp_nullify: llcp_sock->local = NULL; + llcp_sock->dev = NULL; + +sock_llcp_put_local: + nfc_llcp_local_put(local); put_dev: nfc_put_device(dev); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index b2e922fcc70d..701c3752bda0 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -530,7 +530,7 @@ static int nci_open_device(struct nci_dev *ndev) skb_queue_purge(&ndev->tx_q); ndev->ops->close(ndev); - ndev->flags = 0; + ndev->flags &= BIT(NCI_UNREG); } done: @@ -894,6 +894,11 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, return -EINVAL; } + if (protocol >= NFC_PROTO_MAX) { + pr_err("the requested nfc protocol is invalid\n"); + return -EINVAL; + } + if (!(nci_target->supported_protocols & (1 << protocol))) { pr_err("target does not support the requested protocol 0x%x\n", protocol); @@ -1192,6 +1197,10 @@ void nci_free_device(struct nci_dev *ndev) { nfc_free_device(ndev->nfc_dev); nci_hci_deallocate(ndev); + + /* drop partial rx data packet if present */ + if (ndev->rx_data_reassembly) + kfree_skb(ndev->rx_data_reassembly); kfree(ndev); } EXPORT_SYMBOL(nci_free_device); @@ -1490,6 +1499,11 @@ static void nci_rx_work(struct work_struct *work) nfc_send_to_raw_sock(ndev->nfc_dev, skb, RAW_PAYLOAD_NCI, NFC_DIRECTION_RX); + if (!nci_plen(skb->data)) { + kfree_skb(skb); + break; + } + /* Process frame */ switch (nci_mt(skb->data)) { case NCI_MT_RSP_PKT: diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index b002e18f38c8..b4548d887489 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -279,8 +279,10 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) nci_plen(skb->data)); conn_info = nci_get_conn_info_by_conn_id(ndev, nci_conn_id(skb->data)); - if (!conn_info) + if (!conn_info) { + kfree_skb(skb); return; + } /* strip the nci data header */ skb_pull(skb, NCI_DATA_HDR_SIZE); diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 33e1170817f0..f8b20cddd5c9 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -218,6 +218,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev, target->sens_res = nfca_poll->sens_res; target->sel_res = nfca_poll->sel_res; target->nfcid1_len = nfca_poll->nfcid1_len; + if (target->nfcid1_len > ARRAY_SIZE(target->nfcid1)) + return -EPROTO; if (target->nfcid1_len > 0) { memcpy(target->nfcid1, nfca_poll->nfcid1, target->nfcid1_len); @@ -226,6 +228,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev, nfcb_poll = (struct rf_tech_specific_params_nfcb_poll *)params; target->sensb_res_len = nfcb_poll->sensb_res_len; + if (target->sensb_res_len > ARRAY_SIZE(target->sensb_res)) + return -EPROTO; if (target->sensb_res_len > 0) { memcpy(target->sensb_res, nfcb_poll->sensb_res, target->sensb_res_len); @@ -234,6 +238,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev, nfcf_poll = (struct rf_tech_specific_params_nfcf_poll *)params; target->sensf_res_len = nfcf_poll->sensf_res_len; + if (target->sensf_res_len > ARRAY_SIZE(target->sensf_res)) + return -EPROTO; if (target->sensf_res_len > 0) { memcpy(target->sensf_res, nfcf_poll->sensf_res, target->sensf_res_len); diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index 9dd8a1096916..96f071792a04 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -150,6 +150,8 @@ static int send_acknowledge(struct nci_spi *nspi, u8 acknowledge) int ret; skb = nci_skb_alloc(nspi->ndev, 0, GFP_KERNEL); + if (!skb) + return -ENOMEM; /* add the NCI SPI header to the start of the buffer */ hdr = skb_push(skb, NCI_SPI_HDR_LEN); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 9e94f732e717..5b55466fe315 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1047,11 +1047,14 @@ static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; - goto exit; + goto put_local; } rc = nfc_genl_send_params(msg, local, info->snd_portid, info->snd_seq); +put_local: + nfc_llcp_local_put(local); + exit: device_unlock(&dev->dev); @@ -1113,7 +1116,7 @@ static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NFC_ATTR_LLC_PARAM_LTO]) { if (dev->dep_link_up) { rc = -EINPROGRESS; - goto exit; + goto put_local; } local->lto = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_LTO]); @@ -1125,6 +1128,9 @@ static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) local->miux = cpu_to_be16(miux); +put_local: + nfc_llcp_local_put(local); + exit: device_unlock(&dev->dev); @@ -1180,7 +1186,7 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) if (rc != 0) { rc = -EINVAL; - goto exit; + goto put_local; } if (!sdp_attrs[NFC_SDP_ATTR_URI]) @@ -1199,7 +1205,7 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len); if (sdreq == NULL) { rc = -ENOMEM; - goto exit; + goto put_local; } tlvs_len += sdreq->tlv_len; @@ -1209,10 +1215,14 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) if (hlist_empty(&sdreq_list)) { rc = -EINVAL; - goto exit; + goto put_local; } rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len); + +put_local: + nfc_llcp_local_put(local); + exit: device_unlock(&dev->dev); @@ -1450,8 +1460,12 @@ static int nfc_se_io(struct nfc_dev *dev, u32 se_idx, rc = dev->ops->se_io(dev, se_idx, apdu, apdu_length, cb, cb_context); + device_unlock(&dev->dev); + return rc; + error: device_unlock(&dev->dev); + kfree(cb_context); return rc; } @@ -1505,6 +1519,7 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; + int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || @@ -1518,25 +1533,37 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) if (!dev) return -ENODEV; - if (!dev->ops || !dev->ops->se_io) - return -ENOTSUPP; + if (!dev->ops || !dev->ops->se_io) { + rc = -EOPNOTSUPP; + goto put_dev; + } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); - if (apdu_len == 0) - return -EINVAL; + if (apdu_len == 0) { + rc = -EINVAL; + goto put_dev; + } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); - if (!apdu) - return -EINVAL; + if (!apdu) { + rc = -EINVAL; + goto put_dev; + } ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + if (!ctx) { + rc = -ENOMEM; + goto put_dev; + } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; - return nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); + rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); + +put_dev: + nfc_put_device(dev); + return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, @@ -1559,14 +1586,21 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); - if (!dev || !dev->vendor_cmds || !dev->n_vendor_cmds) + if (!dev) return -ENODEV; + if (!dev->vendor_cmds || !dev->n_vendor_cmds) { + err = -ENODEV; + goto put_dev; + } + if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); - if (data_len == 0) - return -EINVAL; + if (data_len == 0) { + err = -EINVAL; + goto put_dev; + } } else { data = NULL; data_len = 0; @@ -1581,10 +1615,14 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; - return err; + goto put_dev; } - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + +put_dev: + nfc_put_device(dev); + return err; } /* message building helper */ diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index 889fefd64e56..0b1e6466f4fb 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -48,10 +48,11 @@ void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode); int nfc_llcp_register_device(struct nfc_dev *dev); void nfc_llcp_unregister_device(struct nfc_dev *dev); -int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len); +int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len); u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len); int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb); struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev); +int nfc_llcp_local_put(struct nfc_llcp_local *local); int __init nfc_llcp_init(void); void nfc_llcp_exit(void); void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp); diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index e9ca007718b7..0f23e5e8e03e 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -77,13 +77,12 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; unsigned int nsh_len, mac_len; __be16 proto; - int nhoff; skb_reset_network_header(skb); - nhoff = skb->network_header - skb->mac_header; mac_len = skb->mac_len; if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN))) @@ -108,15 +107,14 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, htons(ETH_P_NSH), nsh_len, - skb->network_header - nhoff, - mac_len); + mac_offset, mac_len); goto out; } for (skb = segs; skb; skb = skb->next) { skb->protocol = htons(ETH_P_NSH); __skb_push(skb, nsh_len); - skb_set_mac_header(skb, -nhoff); + skb->mac_header = mac_offset; skb->network_header = skb->mac_header + mac_len; skb->mac_len = mac_len; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 78448b6888dd..48522c688c3e 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1687,8 +1687,9 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (ct_info.timeout[0]) { if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, ct_info.timeout)) - pr_info_ratelimited("Failed to associated timeout " - "policy `%s'\n", ct_info.timeout); + OVS_NLERR(log, + "Failed to associated timeout policy '%s'", + ct_info.timeout); else ct_info.nf_ct_timeout = rcu_dereference( nf_ct_timeout_find(ct_info.ct)->timeout); @@ -1896,9 +1897,9 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net) for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) { struct hlist_head *head = &info->limits[i]; struct ovs_ct_limit *ct_limit; + struct hlist_node *next; - hlist_for_each_entry_rcu(ct_limit, head, hlist_node, - lockdep_ovsl_is_held()) + hlist_for_each_entry_safe(ct_limit, next, head, hlist_node) kfree_rcu(ct_limit, rcu); } kfree(ovs_net->ct_limit_info->limits); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 4f097bd3339e..4c537e74b18c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -236,10 +236,17 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) upcall.portid = ovs_vport_find_upcall_portid(p, skb); upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); - if (unlikely(error)) - kfree_skb(skb); - else + switch (error) { + case 0: + case -EAGAIN: + case -ERESTARTSYS: + case -EINTR: consume_skb(skb); + break; + default: + kfree_skb(skb); + break; + } stats_counter = &stats->n_missed; goto out; } @@ -525,8 +532,9 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, out: if (err) skb_tx_error(skb); - kfree_skb(user_skb); - kfree_skb(nskb); + consume_skb(user_skb); + consume_skb(nskb); + return err; } @@ -902,6 +910,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; + struct sw_flow_key *key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -929,30 +938,32 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &new_flow->key, false, &mask); + key = kzalloc(sizeof(*key), GFP_KERNEL); + if (!key) { + error = -ENOMEM; + goto err_kfree_flow; + } + + ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) - goto err_kfree_flow; + goto err_kfree_key; + + ovs_flow_mask_key(&new_flow->key, key, true, &mask); /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &new_flow->key, log); + key, log); if (error) - goto err_kfree_flow; - - /* unmasked key is needed to match when ufid is not used. */ - if (ovs_identifier_is_key(&new_flow->id)) - match.key = new_flow->id.unmasked_key; - - ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); + goto err_kfree_key; /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); - goto err_kfree_flow; + goto err_kfree_key; } reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, @@ -973,7 +984,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); + flow = ovs_flow_tbl_lookup(&dp->table, key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); @@ -1043,6 +1054,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); + + kfree(key); return 0; err_unlock_ovs: @@ -1050,6 +1063,8 @@ err_unlock_ovs: kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); +err_kfree_key: + kfree(key); err_kfree_flow: ovs_flow_free(new_flow, false); error: @@ -1552,7 +1567,8 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *in if (IS_ERR(dp)) return; - WARN(dp->user_features, "Dropping previously announced user features\n"); + pr_warn("%s: Dropping previously announced user features\n", + ovs_dp_name(dp)); dp->user_features = 0; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a2696acbcd9d..9fc020fd7ecc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -266,7 +266,8 @@ static void packet_cached_dev_reset(struct packet_sock *po) static bool packet_use_direct_xmit(const struct packet_sock *po) { - return po->xmit == packet_direct_xmit; + /* Paired with WRITE_ONCE() in packet_setsockopt() */ + return READ_ONCE(po->xmit) == packet_direct_xmit; } static u16 packet_pick_tx_queue(struct sk_buff *skb) @@ -362,18 +363,20 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; + /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: - h.h1->tp_status = status; + WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: - h.h2->tp_status = status; + WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: - h.h3->tp_status = status; + WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: @@ -390,17 +393,19 @@ static int __packet_get_status(const struct packet_sock *po, void *frame) smp_rmb(); + /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); - return h.h1->tp_status; + return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); - return h.h2->tp_status; + return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); - return h.h3->tp_status; + return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -1845,7 +1850,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, */ spkt->spkt_family = dev->type; - strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); + strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); spkt->spkt_protocol = skb->protocol; /* @@ -1864,12 +1869,20 @@ oom: static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) { + int depth; + if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && sock->type == SOCK_RAW) { skb_reset_mac_header(skb); skb->protocol = dev_parse_header_protocol(skb); } + /* Move network header to the right position for VLAN tagged packets */ + if (likely(skb->dev->type == ARPHRD_ETHER) && + eth_type_vlan(skb->protocol) && + vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) + skb_set_network_header(skb, depth); + skb_probe_transport_header(skb); } @@ -1964,7 +1977,7 @@ retry: goto retry; } - if (!dev_validate_header(dev, skb->data, len)) { + if (!dev_validate_header(dev, skb->data, len) || !skb->len) { err = -EINVAL; goto out_unlock; } @@ -2114,7 +2127,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_hatype = dev->type; sll->sll_pkttype = skb->pkt_type; - if (unlikely(po->origdev)) + if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; @@ -2222,8 +2235,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && - (skb->ip_summed == CHECKSUM_COMPLETE || - skb_csum_unnecessary(skb))) + skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (snaplen > res) @@ -2383,7 +2395,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, sll->sll_hatype = dev->type; sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; - if (unlikely(po->origdev)) + if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; @@ -2790,7 +2802,8 @@ tpacket_error: packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; - err = po->xmit(skb); + /* Paired with WRITE_ONCE() in packet_setsockopt() */ + err = READ_ONCE(po->xmit)(skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); @@ -2960,8 +2973,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (err) goto out_free; - if (sock->type == SOCK_RAW && - !dev_validate_header(dev, skb->data, len)) { + if ((sock->type == SOCK_RAW && + !dev_validate_header(dev, skb->data, len)) || !skb->len) { err = -EINVAL; goto out_free; } @@ -2980,6 +2993,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; + if (unlikely(extra_len == 4)) + skb->no_fcs = 1; + + packet_parse_headers(skb, sock); + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) @@ -2988,12 +3006,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) virtio_net_hdr_set_proto(skb, &vnet_hdr); } - packet_parse_headers(skb, sock); - - if (unlikely(extra_len == 4)) - skb->no_fcs = 1; - - err = po->xmit(skb); + /* Paired with WRITE_ONCE() in packet_setsockopt() */ + err = READ_ONCE(po->xmit)(skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); @@ -3118,6 +3132,9 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, lock_sock(sk); spin_lock(&po->bind_lock); + if (!proto) + proto = po->num; + rcu_read_lock(); if (po->fanout) { @@ -3220,7 +3237,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); name[sizeof(uaddr->sa_data)] = 0; - return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); + return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -3237,8 +3254,7 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len if (sll->sll_family != AF_PACKET) return -EINVAL; - return packet_do_bind(sk, NULL, sll->sll_ifindex, - sll->sll_protocol ? : pkt_sk(sk)->num); + return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { @@ -3444,15 +3460,14 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } - if (pkt_sk(sk)->auxdata) { + if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && - (skb->ip_summed == CHECKSUM_COMPLETE || - skb_csum_unnecessary(skb))) + skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; aux.tp_len = origlen; @@ -3496,7 +3511,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) - strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); + strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); return sizeof(*uaddr); @@ -3828,9 +3843,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; - lock_sock(sk); - po->auxdata = !!val; - release_sock(sk); + packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val); return 0; } case PACKET_ORIGDEV: @@ -3842,9 +3855,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; - lock_sock(sk); - po->origdev = !!val; - release_sock(sk); + packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val); return 0; } case PACKET_VNET_HDR: @@ -3910,7 +3921,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (val < 0 || val > 1) return -EINVAL; - po->prot_hook.ignore_outgoing = !!val; + WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val); return 0; } case PACKET_TX_HAS_OFF: @@ -3941,7 +3952,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; - po->xmit = val ? packet_direct_xmit : dev_queue_xmit; + /* Paired with all lockless reads of po->xmit */ + WRITE_ONCE(po->xmit, val ? packet_direct_xmit : dev_queue_xmit); return 0; } default: @@ -3992,10 +4004,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, break; case PACKET_AUXDATA: - val = po->auxdata; + val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV: - val = po->origdev; + val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: val = po->has_vnet_hdr; @@ -4041,7 +4053,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, 0); break; case PACKET_IGNORE_OUTGOING: - val = po->prot_hook.ignore_outgoing; + val = READ_ONCE(po->prot_hook.ignore_outgoing); break; case PACKET_ROLLOVER_STATS: if (!po->rollover) @@ -4228,7 +4240,7 @@ static void packet_mm_open(struct vm_area_struct *vma) struct sock *sk = sock->sk; if (sk) - atomic_inc(&pkt_sk(sk)->mapped); + atomic_long_inc(&pkt_sk(sk)->mapped); } static void packet_mm_close(struct vm_area_struct *vma) @@ -4238,7 +4250,7 @@ static void packet_mm_close(struct vm_area_struct *vma) struct sock *sk = sock->sk; if (sk) - atomic_dec(&pkt_sk(sk)->mapped); + atomic_long_dec(&pkt_sk(sk)->mapped); } static const struct vm_operations_struct packet_mmap_ops = { @@ -4333,7 +4345,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = -EBUSY; if (!closing) { - if (atomic_read(&po->mapped)) + if (atomic_long_read(&po->mapped)) goto out; if (packet_read_pending(rb)) goto out; @@ -4436,7 +4448,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = -EBUSY; mutex_lock(&po->pg_vec_lock); - if (closing || atomic_read(&po->mapped) == 0) { + if (closing || atomic_long_read(&po->mapped) == 0) { err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); @@ -4454,9 +4466,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, po->prot_hook.func = (po->rx_ring.pg_vec) ? tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); - if (atomic_read(&po->mapped)) - pr_err("packet_mmap: vma is busy: %d\n", - atomic_read(&po->mapped)); + if (atomic_long_read(&po->mapped)) + pr_err("packet_mmap: vma is busy: %ld\n", + atomic_long_read(&po->mapped)); } mutex_unlock(&po->pg_vec_lock); @@ -4534,7 +4546,7 @@ static int packet_mmap(struct file *file, struct socket *sock, } } - atomic_inc(&po->mapped); + atomic_long_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; diff --git a/net/packet/diag.c b/net/packet/diag.c index 07812ae5ca07..a68a84574c73 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -23,9 +23,9 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) pinfo.pdi_flags = 0; if (po->running) pinfo.pdi_flags |= PDI_RUNNING; - if (po->auxdata) + if (packet_sock_flag(po, PACKET_SOCK_AUXDATA)) pinfo.pdi_flags |= PDI_AUXDATA; - if (po->origdev) + if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) pinfo.pdi_flags |= PDI_ORIGDEV; if (po->has_vnet_hdr) pinfo.pdi_flags |= PDI_VNETHDR; @@ -143,7 +143,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, rp = nlmsg_data(nlh); rp->pdiag_family = AF_PACKET; rp->pdiag_type = sk->sk_type; - rp->pdiag_num = ntohs(po->num); + rp->pdiag_num = ntohs(READ_ONCE(po->num)); rp->pdiag_ino = sk_ino; sock_diag_save_cookie(sk, rp->pdiag_cookie); diff --git a/net/packet/internal.h b/net/packet/internal.h index 907f4cd2a718..878fcdca8c25 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -115,10 +115,9 @@ struct packet_sock { int copy_thresh; spinlock_t bind_lock; struct mutex pg_vec_lock; + unsigned long flags; unsigned int running; /* bind_lock must be held */ - unsigned int auxdata:1, /* writer must hold sock lock */ - origdev:1, - has_vnet_hdr:1, + unsigned int has_vnet_hdr:1, /* writer must hold sock lock */ tp_loss:1, tp_tx_has_off:1; int pressure; @@ -126,7 +125,7 @@ struct packet_sock { __be16 num; struct packet_rollover *rollover; struct packet_mclist *mclist; - atomic_t mapped; + atomic_long_t mapped; enum tpacket_versions tp_version; unsigned int tp_hdrlen; unsigned int tp_reserve; @@ -143,4 +142,25 @@ static struct packet_sock *pkt_sk(struct sock *sk) return (struct packet_sock *)sk; } +enum packet_sock_flags { + PACKET_SOCK_ORIGDEV, + PACKET_SOCK_AUXDATA, +}; + +static inline void packet_sock_flag_set(struct packet_sock *po, + enum packet_sock_flags flag, + bool val) +{ + if (val) + set_bit(flag, &po->flags); + else + clear_bit(flag, &po->flags); +} + +static inline bool packet_sock_flag(const struct packet_sock *po, + enum packet_sock_flags flag) +{ + return test_bit(flag, &po->flags); +} + #endif diff --git a/net/psample/psample.c b/net/psample/psample.c index 6f2fbc6b9eb2..6d29983cbd07 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -28,7 +28,8 @@ enum psample_nl_multicast_groups { static const struct genl_multicast_group psample_nl_mcgrps[] = { [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME }, - [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME }, + [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME, + .flags = GENL_UNS_ADMIN_PERM }, }; static struct genl_family psample_nl_family __ro_after_init; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 1a5bf3fa4578..af22f47c8612 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -421,7 +421,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, rs->rs_rx_traces = trace.rx_traces; for (i = 0; i < rs->rs_rx_traces; i++) { - if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) { + if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { rs->rs_rx_traces = 0; return -EFAULT; } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index a0f99bbf362c..6dd3f45df9ca 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -363,6 +363,7 @@ static int acquire_refill(struct rds_connection *conn) static void release_refill(struct rds_connection *conn) { clear_bit(RDS_RECV_REFILL, &conn->c_flags); + smp_mb__after_atomic(); /* We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. We don't want to walk diff --git a/net/rds/message.c b/net/rds/message.c index 92b6b22884d4..75043742d243 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -104,9 +104,9 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, spin_lock_irqsave(&q->lock, flags); head = &q->zcookie_head; if (!list_empty(head)) { - info = list_entry(head, struct rds_msg_zcopy_info, - rs_zcookie_next); - if (info && rds_zcookie_add(info, cookie)) { + info = list_first_entry(head, struct rds_msg_zcopy_info, + rs_zcookie_next); + if (rds_zcookie_add(info, cookie)) { spin_unlock_irqrestore(&q->lock, flags); kfree(rds_info_from_znotifier(znotif)); /* caller invokes rds_wake_sk_sleep() */ @@ -118,7 +118,7 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, ck = &info->zcookies; memset(ck, 0, sizeof(*ck)); WARN_ON(!rds_zcookie_add(info, cookie)); - list_add_tail(&q->zcookie_head, &info->rs_zcookie_next); + list_add_tail(&info->rs_zcookie_next, &q->zcookie_head); spin_unlock_irqrestore(&q->lock, flags); /* caller invokes rds_wake_sk_sleep() */ diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 1c42a600fe7f..ab785a406fad 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -279,6 +279,9 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, put_page(sg_page(&sg[i])); kfree(sg); ret = PTR_ERR(trans_private); + /* Trigger connection so that its ready for the next retry */ + if (ret == -ENODEV && cp) + rds_conn_connect_if_down(cp->cp_conn); goto out; } diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 5f741e51b4ba..bb38124a5d3d 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -86,10 +86,12 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ADDR_RESOLVED: - rdma_set_service_type(cm_id, conn->c_tos); - /* XXX do we need to clean up if this fails? */ - ret = rdma_resolve_route(cm_id, + if (conn) { + rdma_set_service_type(cm_id, conn->c_tos); + /* XXX do we need to clean up if this fails? */ + ret = rdma_resolve_route(cm_id, RDS_RDMA_RESOLVE_TIMEOUT_MS); + } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: diff --git a/net/rds/send.c b/net/rds/send.c index 68e2bdb08fd0..c0cebf4b4fe5 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -103,13 +103,12 @@ EXPORT_SYMBOL_GPL(rds_send_path_reset); static int acquire_in_xmit(struct rds_conn_path *cp) { - return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0; + return test_and_set_bit_lock(RDS_IN_XMIT, &cp->cp_flags) == 0; } static void release_in_xmit(struct rds_conn_path *cp) { - clear_bit(RDS_IN_XMIT, &cp->cp_flags); - smp_mb__after_atomic(); + clear_bit_unlock(RDS_IN_XMIT, &cp->cp_flags); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. We don't want to walk @@ -1314,12 +1313,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Parse any control messages the user may have included. */ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct); - if (ret) { - /* Trigger connection so that its ready for the next retry */ - if (ret == -EAGAIN) - rds_conn_connect_if_down(conn); + if (ret) goto out; - } if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", diff --git a/net/rds/tcp.c b/net/rds/tcp.c index d55d81b01d37..bfb8d4d6a994 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -176,10 +176,10 @@ void rds_tcp_reset_callbacks(struct socket *sock, */ atomic_set(&cp->cp_state, RDS_CONN_RESETTING); wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); - lock_sock(osock->sk); /* reset receive side state for rds_tcp_data_recv() for osock */ cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); + lock_sock(osock->sk); if (tc->t_tinc) { rds_inc_put(&tc->t_tinc->ti_inc); tc->t_tinc = NULL; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 008f50fb25dd..63efe60fda1f 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -141,7 +141,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) addrlen = sizeof(sin); } - ret = sock->ops->bind(sock, addr, addrlen); + ret = kernel_bind(sock, addr, addrlen); if (ret) { rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); @@ -169,7 +169,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK); + ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 26a3e18e460d..dfeceed3b533 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -332,7 +332,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) addr_len = sizeof(*sin); } - ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len); + ret = kernel_bind(sock, (struct sockaddr *)&ss, addr_len); if (ret < 0) { rdsdebug("could not bind %s listener socket: %d\n", isv6 ? "IPv6" : "IPv4", ret); diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index f5afc9bcdee6..f74baefd855d 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -98,13 +98,13 @@ static int rfkill_gpio_probe(struct platform_device *pdev) rfkill->clk = devm_clk_get(&pdev->dev, NULL); - gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_OUT_LOW); + gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_ASIS); if (IS_ERR(gpio)) return PTR_ERR(gpio); rfkill->reset_gpio = gpio; - gpio = devm_gpiod_get_optional(&pdev->dev, "shutdown", GPIOD_OUT_LOW); + gpio = devm_gpiod_get_optional(&pdev->dev, "shutdown", GPIOD_ASIS); if (IS_ERR(gpio)) return PTR_ERR(gpio); @@ -116,6 +116,14 @@ static int rfkill_gpio_probe(struct platform_device *pdev) return -EINVAL; } + ret = gpiod_direction_output(rfkill->reset_gpio, true); + if (ret) + return ret; + + ret = gpiod_direction_output(rfkill->shutdown_gpio, true); + if (ret) + return ret; + rfkill->rfkill_dev = rfkill_alloc(rfkill->name, &pdev->dev, rfkill->type, &rfkill_gpio_ops, rfkill); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 6a0df7c8a939..9b36fb6aa3e1 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -159,20 +159,47 @@ void rose_kill_by_neigh(struct rose_neigh *neigh) */ static void rose_kill_by_device(struct net_device *dev) { - struct sock *s; + struct sock *sk, *array[16]; + struct rose_sock *rose; + bool rescan; + int i, cnt; +start: + rescan = false; + cnt = 0; spin_lock_bh(&rose_list_lock); - sk_for_each(s, &rose_list) { - struct rose_sock *rose = rose_sk(s); + sk_for_each(sk, &rose_list) { + rose = rose_sk(sk); + if (rose->device == dev) { + if (cnt == ARRAY_SIZE(array)) { + rescan = true; + break; + } + sock_hold(sk); + array[cnt++] = sk; + } + } + spin_unlock_bh(&rose_list_lock); + for (i = 0; i < cnt; i++) { + sk = array[cnt]; + rose = rose_sk(sk); + lock_sock(sk); + spin_lock_bh(&rose_list_lock); if (rose->device == dev) { - rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); + rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); if (rose->neighbour) rose->neighbour->use--; + dev_put(rose->device); rose->device = NULL; } + spin_unlock_bh(&rose_list_lock); + release_sock(sk); + sock_put(sk); + cond_resched(); } - spin_unlock_bh(&rose_list_lock); + if (rescan) + goto start; } /* @@ -464,6 +491,12 @@ static int rose_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; + lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { struct rose_sock *rose = rose_sk(sk); @@ -473,8 +506,10 @@ static int rose_listen(struct socket *sock, int backlog) memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; + release_sock(sk); return 0; } + release_sock(sk); return -EOPNOTSUPP; } @@ -569,6 +604,8 @@ static struct sock *rose_make_new(struct sock *osk) rose->idle = orose->idle; rose->defer = orose->defer; rose->device = orose->device; + if (rose->device) + dev_hold(rose->device); rose->qbitincl = orose->qbitincl; return sk; @@ -622,6 +659,10 @@ static int rose_release(struct socket *sock) break; } + spin_lock_bh(&rose_list_lock); + dev_put(rose->device); + rose->device = NULL; + spin_unlock_bh(&rose_list_lock); sock->sk = NULL; release_sock(sk); sock_put(sk); @@ -698,7 +739,6 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le struct rose_sock *rose = rose_sk(sk); struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; unsigned char cause, diagnostic; - struct net_device *dev; ax25_uid_assoc *user; int n, err = 0; @@ -755,9 +795,12 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le } if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ + struct net_device *dev; + sock_reset_flag(sk, SOCK_ZAPPED); - if ((dev = rose_dev_first()) == NULL) { + dev = rose_dev_first(); + if (!dev) { err = -ENETUNREACH; goto out_release; } @@ -765,6 +808,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le user = ax25_findbyuid(current_euid()); if (!user) { err = -EINVAL; + dev_put(dev); goto out_release; } @@ -1270,9 +1314,11 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: { struct sk_buff *skb; long amount = 0L; - /* These two are safe on a single CPU system as only user tasks fiddle here */ + + spin_lock_irq(&sk->sk_receive_queue.lock); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; + spin_unlock_irq(&sk->sk_receive_queue.lock); return put_user(amount, (unsigned int __user *) argp); } diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index f6102e6f5161..730d2205f197 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -236,6 +236,9 @@ void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, uns unsigned char *dptr; int len; + if (!neigh->dev) + return; + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3; if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 11c45c8c6c16..036d92c0ad79 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -96,7 +96,8 @@ static void rose_loopback_timer(struct timer_list *unused) } if (frametype == ROSE_CALL_REQUEST) { - if (!rose_loopback_neigh->dev) { + if (!rose_loopback_neigh->dev && + !rose_loopback_neigh->loopback) { kfree_skb(skb); continue; } diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 5f32113c9bbd..64d441d3b653 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -613,6 +613,8 @@ struct net_device *rose_dev_first(void) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; } + if (first) + dev_hold(first); rcu_read_unlock(); return first; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8574e7066d94..b5f173960725 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -166,7 +166,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); now = ktime_get_real(); - max_age = ktime_sub(now, jiffies_to_usecs(call->peer->rto_j)); + max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j)); spin_lock_bh(&call->lock); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index b5864683f200..49669055ee9d 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -41,6 +41,14 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, _enter("%d", conn->debug_id); + if (sp && sp->hdr.type == RXRPC_PACKET_TYPE_ACK) { + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &pkt.ack, sizeof(pkt.ack)) < 0) + return; + if (pkt.ack.reason == RXRPC_ACK_PING_RESPONSE) + return; + } + chan = &conn->channels[channel]; /* If the last call got moved on whilst we were waiting to run, just diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 21da48e3d2e5..7ad4b4e9341e 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -25,7 +25,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *peer, struct rxrpc_conn_proto k; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rb_node *p; - unsigned int seq = 0; + unsigned int seq = 1; k.epoch = sp->hdr.epoch; k.cid = sp->hdr.cid & RXRPC_CIDMASK; @@ -35,6 +35,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *peer, * under just the RCU read lock, so we have to check for * changes. */ + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&peer->service_conn_lock, &seq); p = rcu_dereference_raw(peer->service_conns.rb_node); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 01135e54d95d..fc784fcc3a94 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -448,6 +448,9 @@ static void rxrpc_local_processor(struct work_struct *work) container_of(work, struct rxrpc_local, processor); bool again; + if (local->dead) + return; + trace_rxrpc_local(local->debug_id, rxrpc_local_processing, atomic_read(&local->usage), NULL); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 6202d2e32914..09fcc54245c7 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -93,7 +93,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, *_hard_ack = hard_ack; *_top = top; - pkt->ack.bufferSpace = htons(8); + pkt->ack.bufferSpace = htons(0); pkt->ack.maxSkew = htons(0); pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_highest_seq); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 52a24d4ef5d8..2ba61971f623 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -451,7 +451,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, * directly into the target buffer. */ sg = _sg; - nsg = skb_shinfo(skb)->nr_frags; + nsg = skb_shinfo(skb)->nr_frags + 1; if (nsg <= 4) { nsg = 4; } else { diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 22f020099214..3439d14168e8 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -718,7 +718,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) - goto error_put; + goto out_put_unlock; call->tx_total_len = p.call.tx_total_len; } } @@ -738,7 +738,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* Fall through */ case 1: if (p.call.timeouts.hard > 0) { - j = msecs_to_jiffies(p.call.timeouts.hard); + j = p.call.timeouts.hard * HZ; now = jiffies; j += now; WRITE_ONCE(call->expect_term_by, j); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2985509147a2..49521aa33ab9 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -45,23 +45,6 @@ if NET_SCHED comment "Queueing/Scheduling" -config NET_SCH_CBQ - tristate "Class Based Queueing (CBQ)" - ---help--- - Say Y here if you want to use the Class-Based Queueing (CBQ) packet - scheduling algorithm. This algorithm classifies the waiting packets - into a tree-like hierarchy of classes; the leaves of this tree are - in turn scheduled by separate algorithms. - - See the top of <file:net/sched/sch_cbq.c> for more details. - - CBQ is a commonly used scheduler, so if you're unsure, you should - say Y here. Then say Y to all the queueing algorithms below that you - want to use as leaf disciplines. - - To compile this code as a module, choose M here: the - module will be called sch_cbq. - config NET_SCH_HTB tristate "Hierarchical Token Bucket (HTB)" ---help--- @@ -85,20 +68,6 @@ config NET_SCH_HFSC To compile this code as a module, choose M here: the module will be called sch_hfsc. -config NET_SCH_ATM - tristate "ATM Virtual Circuits (ATM)" - depends on ATM - ---help--- - Say Y here if you want to use the ATM pseudo-scheduler. This - provides a framework for invoking classifiers, which in turn - select classes of this queuing discipline. Each class maps - the flow(s) it is handling to a given virtual circuit. - - See the top of <file:net/sched/sch_atm.c> for more details. - - To compile this code as a module, choose M here: the - module will be called sch_atm. - config NET_SCH_PRIO tristate "Multi Band Priority Queueing (PRIO)" ---help--- @@ -217,17 +186,6 @@ config NET_SCH_GRED To compile this code as a module, choose M here: the module will be called sch_gred. -config NET_SCH_DSMARK - tristate "Differentiated Services marker (DSMARK)" - ---help--- - Say Y if you want to schedule packets according to the - Differentiated Services architecture proposed in RFC 2475. - Technical information on this method, with pointers to associated - RFCs, is available at <http://www.gta.ufrj.br/diffserv/>. - - To compile this code as a module, choose M here: the - module will be called sch_dsmark. - config NET_SCH_NETEM tristate "Network emulator (NETEM)" ---help--- @@ -469,17 +427,6 @@ config NET_CLS_BASIC To compile this code as a module, choose M here: the module will be called cls_basic. -config NET_CLS_TCINDEX - tristate "Traffic-Control Index (TCINDEX)" - select NET_CLS - ---help--- - Say Y here if you want to be able to classify packets based on - traffic control indices. You will want this feature if you want - to implement Differentiated Services together with DSMARK. - - To compile this code as a module, choose M here: the - module will be called cls_tcindex. - config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" depends on INET @@ -525,34 +472,6 @@ config CLS_U32_MARK ---help--- Say Y here to be able to use netfilter marks as u32 key. -config NET_CLS_RSVP - tristate "IPv4 Resource Reservation Protocol (RSVP)" - select NET_CLS - ---help--- - The Resource Reservation Protocol (RSVP) permits end systems to - request a minimum and maximum data flow rate for a connection; this - is important for real time data such as streaming sound or video. - - Say Y here if you want to be able to classify outgoing packets based - on their RSVP requests. - - To compile this code as a module, choose M here: the - module will be called cls_rsvp. - -config NET_CLS_RSVP6 - tristate "IPv6 Resource Reservation Protocol (RSVP6)" - select NET_CLS - ---help--- - The Resource Reservation Protocol (RSVP) permits end systems to - request a minimum and maximum data flow rate for a connection; this - is important for real time data such as streaming sound or video. - - Say Y here if you want to be able to classify outgoing packets based - on their RSVP requests and you are using the IPv6 protocol. - - To compile this code as a module, choose M here: the - module will be called cls_rsvp6. - config NET_CLS_FLOW tristate "Flow classifier" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 415d1e1f237e..e5ea44ec13c5 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -31,20 +31,17 @@ obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_ACT_CT) += act_ct.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o -obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o -obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o -obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o @@ -65,9 +62,6 @@ obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o -obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o -obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o -obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b87d2a1ee0b1..e3f28cb03f7e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -244,7 +244,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, goto out; } - if (unlikely(!(dev->flags & IFF_UP))) { + if (unlikely(!(dev->flags & IFF_UP)) || !netif_carrier_ok(dev)) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); goto out; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 0fccae356dc1..197915332b42 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -116,6 +116,11 @@ static int valid_label(const struct nlattr *attr, { const u32 *label = nla_data(attr); + if (nla_len(attr) != sizeof(*label)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid MPLS label length"); + return -EINVAL; + } + if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) { NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range"); return -EINVAL; @@ -128,7 +133,8 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 }, [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), [TCA_MPLS_PROTO] = { .type = NLA_U16 }, - [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label), + [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + valid_label), [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7), [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1), [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index f095a0fb75c6..bf74f3f4c752 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -26,6 +26,7 @@ static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, + [TCA_PEDIT_PARMS_EX] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED }, }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 214f4efdd992..909a685b9162 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -54,8 +54,8 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, sample_policy, NULL); if (ret < 0) return ret; - if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] || - !tb[TCA_SAMPLE_PSAMPLE_GROUP]) + + if (!tb[TCA_SAMPLE_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_SAMPLE_PARMS]); @@ -79,6 +79,13 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + + if (!tb[TCA_SAMPLE_RATE] || !tb[TCA_SAMPLE_PSAMPLE_GROUP]) { + NL_SET_ERR_MSG(extack, "sample rate and group are required"); + err = -EINVAL; + goto release_idr; + } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index f60d349542b1..6009eef8e98a 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -219,13 +219,13 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod *d = to_skbmod(a); unsigned char *b = skb_tail_pointer(skb); struct tcf_skbmod_params *p; - struct tc_skbmod opt = { - .index = d->tcf_index, - .refcnt = refcount_read(&d->tcf_refcnt) - ref, - .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - }; + struct tc_skbmod opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + opt.index = d->tcf_index; + opt.refcnt = refcount_read(&d->tcf_refcnt) - ref, + opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind; spin_lock_bh(&d->tcf_lock); opt.action = d->tcf_action; p = rcu_dereference_protected(d->skbmod_p, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 919c7fa5f02d..30e30b5a9baf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -40,8 +40,6 @@ #include <net/tc_act/tc_mpls.h> #include <net/flow_offload.h> -extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; - /* The list of all installed classifier types */ static LIST_HEAD(tcf_proto_base); @@ -520,8 +518,8 @@ static void __tcf_chain_put(struct tcf_chain *chain, bool by_act, { struct tcf_block *block = chain->block; const struct tcf_proto_ops *tmplt_ops; + unsigned int refcnt, non_act_refcnt; bool free_block = false; - unsigned int refcnt; void *tmplt_priv; mutex_lock(&block->lock); @@ -541,13 +539,15 @@ static void __tcf_chain_put(struct tcf_chain *chain, bool by_act, * save these to temporary variables. */ refcnt = --chain->refcnt; + non_act_refcnt = refcnt - chain->action_refcnt; tmplt_ops = chain->tmplt_ops; tmplt_priv = chain->tmplt_priv; - /* The last dropped non-action reference will trigger notification. */ - if (refcnt - chain->action_refcnt == 0 && !by_act) { - tc_chain_notify_delete(tmplt_ops, tmplt_priv, chain->index, - block, NULL, 0, 0, false); + if (non_act_refcnt == chain->explicitly_created && !by_act) { + if (non_act_refcnt == 0) + tc_chain_notify_delete(tmplt_ops, tmplt_priv, + chain->index, block, NULL, 0, 0, + false); /* Last reference to chain, no need to lock. */ chain->flushing = false; } @@ -1079,7 +1079,7 @@ static int __tcf_qdisc_find(struct net *net, struct Qdisc **q, /* Find qdisc */ if (!*parent) { - *q = dev->qdisc; + *q = rcu_dereference(dev->qdisc); *parent = (*q)->handle; } else { *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); @@ -1500,6 +1500,7 @@ static int tcf_block_bind(struct tcf_block *block, err_unroll: list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + list_del(&block_cb->driver_list); if (i-- > 0) { list_del(&block_cb->list); tcf_block_playback_offloads(block, block_cb->cb, @@ -2098,6 +2099,7 @@ replay: } if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { + tfilter_put(tp, fh); NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); err = -EINVAL; goto errout; @@ -2551,7 +2553,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) parent = tcm->tcm_parent; if (!parent) - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) @@ -2732,6 +2734,7 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, return PTR_ERR(ops); if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); + module_put(ops->owner); return -EOPNOTSUPP; } @@ -2937,7 +2940,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) parent = tcm->tcm_parent; if (!parent) { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); parent = q->handle; } else { q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index ab53a93b2f2b..87398af2715a 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -225,7 +225,7 @@ static u32 flow_get_skgid(const struct sk_buff *skb) static u32 flow_get_vlan_tag(const struct sk_buff *skb) { - u16 uninitialized_var(tag); + u16 tag; if (vlan_get_tag(skb, &tag) < 0) return 0; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 007fbc199352..c92318f68f92 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -270,14 +270,16 @@ static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask, return NULL; } -static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, - struct fl_flow_key *mkey, - struct fl_flow_key *key) +static noinline_for_stack +struct cls_fl_filter *fl_mask_lookup(struct fl_flow_mask *mask, struct fl_flow_key *key) { + struct fl_flow_key mkey; + + fl_set_masked_key(&mkey, key, mask); if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE)) - return fl_lookup_range(mask, mkey, key); + return fl_lookup_range(mask, &mkey, key); - return __fl_lookup(mask, mkey); + return __fl_lookup(mask, &mkey); } static u16 fl_ct_info_to_flower_map[] = { @@ -297,7 +299,6 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - struct fl_flow_key skb_mkey; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@ -317,9 +318,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, ARRAY_SIZE(fl_ct_info_to_flower_map)); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); - fl_set_masked_key(&skb_mkey, &skb_key, mask); - - f = fl_lookup(mask, &skb_mkey, &skb_key); + f = fl_mask_lookup(mask, &skb_key); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); @@ -720,7 +719,8 @@ static void fl_set_key_val(struct nlattr **tb, } static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, - struct fl_flow_key *mask) + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) { fl_set_key_val(tb, &key->tp_range.tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst, @@ -735,13 +735,32 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src, TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src)); - if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && - htons(key->tp_range.tp_max.dst) <= - htons(key->tp_range.tp_min.dst)) || - (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src && - htons(key->tp_range.tp_max.src) <= - htons(key->tp_range.tp_min.src))) + if (mask->tp_range.tp_min.dst != mask->tp_range.tp_max.dst) { + NL_SET_ERR_MSG(extack, + "Both min and max destination ports must be specified"); return -EINVAL; + } + if (mask->tp_range.tp_min.src != mask->tp_range.tp_max.src) { + NL_SET_ERR_MSG(extack, + "Both min and max source ports must be specified"); + return -EINVAL; + } + if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && + htons(key->tp_range.tp_max.dst) <= + htons(key->tp_range.tp_min.dst)) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_PORT_DST_MIN], + "Invalid destination port range (min must be strictly smaller than max)"); + return -EINVAL; + } + if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src && + htons(key->tp_range.tp_max.src) <= + htons(key->tp_range.tp_min.src)) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_PORT_SRC_MIN], + "Invalid source port range (min must be strictly smaller than max)"); + return -EINVAL; + } return 0; } @@ -870,6 +889,9 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, if (option_len > sizeof(struct geneve_opt)) data_len = option_len - sizeof(struct geneve_opt); + if (key->enc_opts.len > FLOW_DIS_TUN_OPTS_MAX - 4) + return -ERANGE; + opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len]; memset(opt, 0xff, option_len); opt->length = data_len / 4; @@ -1209,7 +1231,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (key->basic.ip_proto == IPPROTO_TCP || key->basic.ip_proto == IPPROTO_UDP || key->basic.ip_proto == IPPROTO_SCTP) { - ret = fl_set_key_port_range(tb, key, mask); + ret = fl_set_key_port_range(tb, key, mask, extack); if (ret) return ret; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index ec945294626a..08c41f1976c4 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -210,11 +210,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, if (err < 0) return err; - if (tb[TCA_FW_CLASSID]) { - f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); - tcf_bind_filter(tp, &f->res, base); - } - if (tb[TCA_FW_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); @@ -231,6 +226,11 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, } else if (head->mask != 0xFFFFFFFF) return err; + if (tb[TCA_FW_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + return 0; } @@ -266,7 +266,6 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return -ENOBUFS; fnew->id = f->id; - fnew->res = f->res; fnew->ifindex = f->ifindex; fnew->tp = f->tp; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 5efa3e7ace15..1ad4b3e60eb3 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -424,6 +424,11 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, return -EINVAL; } + if (!nhandle) { + NL_SET_ERR_MSG(extack, "Replacing with handle of 0 is invalid"); + return -EINVAL; + } + h1 = to_hash(nhandle); b = rtnl_dereference(head->table[h1]); if (!b) { @@ -477,6 +482,11 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, int err; bool new = true; + if (!handle) { + NL_SET_ERR_MSG(extack, "Creating with handle of 0 is invalid"); + return -EINVAL; + } + if (opt == NULL) return handle ? -EINVAL : 0; @@ -501,7 +511,6 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (fold) { f->id = fold->id; f->iif = fold->iif; - f->res = fold->res; f->handle = fold->handle; f->tp = fold->tp; @@ -526,7 +535,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); - if (fold && fold->handle && f->handle != fold->handle) { + if (fold) { th = to_hash(fold->handle); h = from_hash(fold->handle >> 16); b = rtnl_dereference(head->table[th]); diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c deleted file mode 100644 index de1c1d4da597..000000000000 --- a/net/sched/cls_rsvp.c +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <net/ip.h> -#include <net/netlink.h> -#include <net/act_api.h> -#include <net/pkt_cls.h> - -#define RSVP_DST_LEN 1 -#define RSVP_ID "rsvp" -#define RSVP_OPS cls_rsvp_ops - -#include "cls_rsvp.h" -MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h deleted file mode 100644 index d36949d9382c..000000000000 --- a/net/sched/cls_rsvp.h +++ /dev/null @@ -1,777 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -/* - Comparing to general packet classification problem, - RSVP needs only sevaral relatively simple rules: - - * (dst, protocol) are always specified, - so that we are able to hash them. - * src may be exact, or may be wildcard, so that - we can keep a hash table plus one wildcard entry. - * source port (or flow label) is important only if src is given. - - IMPLEMENTATION. - - We use a two level hash table: The top level is keyed by - destination address and protocol ID, every bucket contains a list - of "rsvp sessions", identified by destination address, protocol and - DPI(="Destination Port ID"): triple (key, mask, offset). - - Every bucket has a smaller hash table keyed by source address - (cf. RSVP flowspec) and one wildcard entry for wildcard reservations. - Every bucket is again a list of "RSVP flows", selected by - source address and SPI(="Source Port ID" here rather than - "security parameter index"): triple (key, mask, offset). - - - NOTE 1. All the packets with IPv6 extension headers (but AH and ESP) - and all fragmented packets go to the best-effort traffic class. - - - NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires - only one "Generalized Port Identifier". So that for classic - ah, esp (and udp,tcp) both *pi should coincide or one of them - should be wildcard. - - At first sight, this redundancy is just a waste of CPU - resources. But DPI and SPI add the possibility to assign different - priorities to GPIs. Look also at note 4 about tunnels below. - - - NOTE 3. One complication is the case of tunneled packets. - We implement it as following: if the first lookup - matches a special session with "tunnelhdr" value not zero, - flowid doesn't contain the true flow ID, but the tunnel ID (1...255). - In this case, we pull tunnelhdr bytes and restart lookup - with tunnel ID added to the list of keys. Simple and stupid 8)8) - It's enough for PIMREG and IPIP. - - - NOTE 4. Two GPIs make it possible to parse even GRE packets. - F.e. DPI can select ETH_P_IP (and necessary flags to make - tunnelhdr correct) in GRE protocol field and SPI matches - GRE key. Is it not nice? 8)8) - - - Well, as result, despite its simplicity, we get a pretty - powerful classification engine. */ - - -struct rsvp_head { - u32 tmap[256/32]; - u32 hgenerator; - u8 tgenerator; - struct rsvp_session __rcu *ht[256]; - struct rcu_head rcu; -}; - -struct rsvp_session { - struct rsvp_session __rcu *next; - __be32 dst[RSVP_DST_LEN]; - struct tc_rsvp_gpi dpi; - u8 protocol; - u8 tunnelid; - /* 16 (src,sport) hash slots, and one wildcard source slot */ - struct rsvp_filter __rcu *ht[16 + 1]; - struct rcu_head rcu; -}; - - -struct rsvp_filter { - struct rsvp_filter __rcu *next; - __be32 src[RSVP_DST_LEN]; - struct tc_rsvp_gpi spi; - u8 tunnelhdr; - - struct tcf_result res; - struct tcf_exts exts; - - u32 handle; - struct rsvp_session *sess; - struct rcu_work rwork; -}; - -static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) -{ - unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1]; - - h ^= h>>16; - h ^= h>>8; - return (h ^ protocol ^ tunnelid) & 0xFF; -} - -static inline unsigned int hash_src(__be32 *src) -{ - unsigned int h = (__force __u32)src[RSVP_DST_LEN-1]; - - h ^= h>>16; - h ^= h>>8; - h ^= h>>4; - return h & 0xF; -} - -#define RSVP_APPLY_RESULT() \ -{ \ - int r = tcf_exts_exec(skb, &f->exts, res); \ - if (r < 0) \ - continue; \ - else if (r > 0) \ - return r; \ -} - -static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - struct rsvp_head *head = rcu_dereference_bh(tp->root); - struct rsvp_session *s; - struct rsvp_filter *f; - unsigned int h1, h2; - __be32 *dst, *src; - u8 protocol; - u8 tunnelid = 0; - u8 *xprt; -#if RSVP_DST_LEN == 4 - struct ipv6hdr *nhptr; - - if (!pskb_network_may_pull(skb, sizeof(*nhptr))) - return -1; - nhptr = ipv6_hdr(skb); -#else - struct iphdr *nhptr; - - if (!pskb_network_may_pull(skb, sizeof(*nhptr))) - return -1; - nhptr = ip_hdr(skb); -#endif -restart: - -#if RSVP_DST_LEN == 4 - src = &nhptr->saddr.s6_addr32[0]; - dst = &nhptr->daddr.s6_addr32[0]; - protocol = nhptr->nexthdr; - xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr); -#else - src = &nhptr->saddr; - dst = &nhptr->daddr; - protocol = nhptr->protocol; - xprt = ((u8 *)nhptr) + (nhptr->ihl<<2); - if (ip_is_fragment(nhptr)) - return -1; -#endif - - h1 = hash_dst(dst, protocol, tunnelid); - h2 = hash_src(src); - - for (s = rcu_dereference_bh(head->ht[h1]); s; - s = rcu_dereference_bh(s->next)) { - if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] && - protocol == s->protocol && - !(s->dpi.mask & - (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) && -#if RSVP_DST_LEN == 4 - dst[0] == s->dst[0] && - dst[1] == s->dst[1] && - dst[2] == s->dst[2] && -#endif - tunnelid == s->tunnelid) { - - for (f = rcu_dereference_bh(s->ht[h2]); f; - f = rcu_dereference_bh(f->next)) { - if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && - !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key)) -#if RSVP_DST_LEN == 4 - && - src[0] == f->src[0] && - src[1] == f->src[1] && - src[2] == f->src[2] -#endif - ) { - *res = f->res; - RSVP_APPLY_RESULT(); - -matched: - if (f->tunnelhdr == 0) - return 0; - - tunnelid = f->res.classid; - nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr)); - goto restart; - } - } - - /* And wildcard bucket... */ - for (f = rcu_dereference_bh(s->ht[16]); f; - f = rcu_dereference_bh(f->next)) { - *res = f->res; - RSVP_APPLY_RESULT(); - goto matched; - } - return -1; - } - } - return -1; -} - -static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_session *s; - struct rsvp_filter __rcu **ins; - struct rsvp_filter *pins; - unsigned int h1 = h & 0xFF; - unsigned int h2 = (h >> 8) & 0xFF; - - for (s = rtnl_dereference(head->ht[h1]); s; - s = rtnl_dereference(s->next)) { - for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ; - ins = &pins->next, pins = rtnl_dereference(*ins)) { - if (pins->handle == h) { - RCU_INIT_POINTER(n->next, pins->next); - rcu_assign_pointer(*ins, n); - return; - } - } - } - - /* Something went wrong if we are trying to replace a non-existant - * node. Mind as well halt instead of silently failing. - */ - BUG_ON(1); -} - -static void *rsvp_get(struct tcf_proto *tp, u32 handle) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_session *s; - struct rsvp_filter *f; - unsigned int h1 = handle & 0xFF; - unsigned int h2 = (handle >> 8) & 0xFF; - - if (h2 > 16) - return NULL; - - for (s = rtnl_dereference(head->ht[h1]); s; - s = rtnl_dereference(s->next)) { - for (f = rtnl_dereference(s->ht[h2]); f; - f = rtnl_dereference(f->next)) { - if (f->handle == handle) - return f; - } - } - return NULL; -} - -static int rsvp_init(struct tcf_proto *tp) -{ - struct rsvp_head *data; - - data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL); - if (data) { - rcu_assign_pointer(tp->root, data); - return 0; - } - return -ENOBUFS; -} - -static void __rsvp_delete_filter(struct rsvp_filter *f) -{ - tcf_exts_destroy(&f->exts); - tcf_exts_put_net(&f->exts); - kfree(f); -} - -static void rsvp_delete_filter_work(struct work_struct *work) -{ - struct rsvp_filter *f = container_of(to_rcu_work(work), - struct rsvp_filter, - rwork); - rtnl_lock(); - __rsvp_delete_filter(f); - rtnl_unlock(); -} - -static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) -{ - tcf_unbind_filter(tp, &f->res); - /* all classifiers are required to call tcf_exts_destroy() after rcu - * grace period, since converted-to-rcu actions are relying on that - * in cleanup() callback - */ - if (tcf_exts_get_net(&f->exts)) - tcf_queue_work(&f->rwork, rsvp_delete_filter_work); - else - __rsvp_delete_filter(f); -} - -static void rsvp_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) -{ - struct rsvp_head *data = rtnl_dereference(tp->root); - int h1, h2; - - if (data == NULL) - return; - - for (h1 = 0; h1 < 256; h1++) { - struct rsvp_session *s; - - while ((s = rtnl_dereference(data->ht[h1])) != NULL) { - RCU_INIT_POINTER(data->ht[h1], s->next); - - for (h2 = 0; h2 <= 16; h2++) { - struct rsvp_filter *f; - - while ((f = rtnl_dereference(s->ht[h2])) != NULL) { - rcu_assign_pointer(s->ht[h2], f->next); - rsvp_delete_filter(tp, f); - } - } - kfree_rcu(s, rcu); - } - } - kfree_rcu(data, rcu); -} - -static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last, - bool rtnl_held, struct netlink_ext_ack *extack) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_filter *nfp, *f = arg; - struct rsvp_filter __rcu **fp; - unsigned int h = f->handle; - struct rsvp_session __rcu **sp; - struct rsvp_session *nsp, *s = f->sess; - int i, h1; - - fp = &s->ht[(h >> 8) & 0xFF]; - for (nfp = rtnl_dereference(*fp); nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) { - if (nfp == f) { - RCU_INIT_POINTER(*fp, f->next); - rsvp_delete_filter(tp, f); - - /* Strip tree */ - - for (i = 0; i <= 16; i++) - if (s->ht[i]) - goto out; - - /* OK, session has no flows */ - sp = &head->ht[h & 0xFF]; - for (nsp = rtnl_dereference(*sp); nsp; - sp = &nsp->next, nsp = rtnl_dereference(*sp)) { - if (nsp == s) { - RCU_INIT_POINTER(*sp, s->next); - kfree_rcu(s, rcu); - goto out; - } - } - - break; - } - } - -out: - *last = true; - for (h1 = 0; h1 < 256; h1++) { - if (rcu_access_pointer(head->ht[h1])) { - *last = false; - break; - } - } - - return 0; -} - -static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) -{ - struct rsvp_head *data = rtnl_dereference(tp->root); - int i = 0xFFFF; - - while (i-- > 0) { - u32 h; - - if ((data->hgenerator += 0x10000) == 0) - data->hgenerator = 0x10000; - h = data->hgenerator|salt; - if (!rsvp_get(tp, h)) - return h; - } - return 0; -} - -static int tunnel_bts(struct rsvp_head *data) -{ - int n = data->tgenerator >> 5; - u32 b = 1 << (data->tgenerator & 0x1F); - - if (data->tmap[n] & b) - return 0; - data->tmap[n] |= b; - return 1; -} - -static void tunnel_recycle(struct rsvp_head *data) -{ - struct rsvp_session __rcu **sht = data->ht; - u32 tmap[256/32]; - int h1, h2; - - memset(tmap, 0, sizeof(tmap)); - - for (h1 = 0; h1 < 256; h1++) { - struct rsvp_session *s; - for (s = rtnl_dereference(sht[h1]); s; - s = rtnl_dereference(s->next)) { - for (h2 = 0; h2 <= 16; h2++) { - struct rsvp_filter *f; - - for (f = rtnl_dereference(s->ht[h2]); f; - f = rtnl_dereference(f->next)) { - if (f->tunnelhdr == 0) - continue; - data->tgenerator = f->res.classid; - tunnel_bts(data); - } - } - } - } - - memcpy(data->tmap, tmap, sizeof(tmap)); -} - -static u32 gen_tunnel(struct rsvp_head *data) -{ - int i, k; - - for (k = 0; k < 2; k++) { - for (i = 255; i > 0; i--) { - if (++data->tgenerator == 0) - data->tgenerator = 1; - if (tunnel_bts(data)) - return data->tgenerator; - } - tunnel_recycle(data); - } - return 0; -} - -static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { - [TCA_RSVP_CLASSID] = { .type = NLA_U32 }, - [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) }, - [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) }, - [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, -}; - -static int rsvp_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, - struct netlink_ext_ack *extack) -{ - struct rsvp_head *data = rtnl_dereference(tp->root); - struct rsvp_filter *f, *nfp; - struct rsvp_filter __rcu **fp; - struct rsvp_session *nsp, *s; - struct rsvp_session __rcu **sp; - struct tc_rsvp_pinfo *pinfo = NULL; - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_RSVP_MAX + 1]; - struct tcf_exts e; - unsigned int h1, h2; - __be32 *dst; - int err; - - if (opt == NULL) - return handle ? -EINVAL : 0; - - err = nla_parse_nested_deprecated(tb, TCA_RSVP_MAX, opt, rsvp_policy, - NULL); - if (err < 0) - return err; - - err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); - if (err < 0) - return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, true, - extack); - if (err < 0) - goto errout2; - - f = *arg; - if (f) { - /* Node exists: adjust only classid */ - struct rsvp_filter *n; - - if (f->handle != handle && handle) - goto errout2; - - n = kmemdup(f, sizeof(*f), GFP_KERNEL); - if (!n) { - err = -ENOMEM; - goto errout2; - } - - err = tcf_exts_init(&n->exts, net, TCA_RSVP_ACT, - TCA_RSVP_POLICE); - if (err < 0) { - kfree(n); - goto errout2; - } - - if (tb[TCA_RSVP_CLASSID]) { - n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - tcf_bind_filter(tp, &n->res, base); - } - - tcf_exts_change(&n->exts, &e); - rsvp_replace(tp, n, handle); - return 0; - } - - /* Now more serious part... */ - err = -EINVAL; - if (handle) - goto errout2; - if (tb[TCA_RSVP_DST] == NULL) - goto errout2; - - err = -ENOBUFS; - f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL); - if (f == NULL) - goto errout2; - - err = tcf_exts_init(&f->exts, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); - if (err < 0) - goto errout; - h2 = 16; - if (tb[TCA_RSVP_SRC]) { - memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); - h2 = hash_src(f->src); - } - if (tb[TCA_RSVP_PINFO]) { - pinfo = nla_data(tb[TCA_RSVP_PINFO]); - f->spi = pinfo->spi; - f->tunnelhdr = pinfo->tunnelhdr; - } - if (tb[TCA_RSVP_CLASSID]) - f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - - dst = nla_data(tb[TCA_RSVP_DST]); - h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); - - err = -ENOMEM; - if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) - goto errout; - - if (f->tunnelhdr) { - err = -EINVAL; - if (f->res.classid > 255) - goto errout; - - err = -ENOMEM; - if (f->res.classid == 0 && - (f->res.classid = gen_tunnel(data)) == 0) - goto errout; - } - - for (sp = &data->ht[h1]; - (s = rtnl_dereference(*sp)) != NULL; - sp = &s->next) { - if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && - pinfo && pinfo->protocol == s->protocol && - memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && -#if RSVP_DST_LEN == 4 - dst[0] == s->dst[0] && - dst[1] == s->dst[1] && - dst[2] == s->dst[2] && -#endif - pinfo->tunnelid == s->tunnelid) { - -insert: - /* OK, we found appropriate session */ - - fp = &s->ht[h2]; - - f->sess = s; - if (f->tunnelhdr == 0) - tcf_bind_filter(tp, &f->res, base); - - tcf_exts_change(&f->exts, &e); - - fp = &s->ht[h2]; - for (nfp = rtnl_dereference(*fp); nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) { - __u32 mask = nfp->spi.mask & f->spi.mask; - - if (mask != f->spi.mask) - break; - } - RCU_INIT_POINTER(f->next, nfp); - rcu_assign_pointer(*fp, f); - - *arg = f; - return 0; - } - } - - /* No session found. Create new one. */ - - err = -ENOBUFS; - s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL); - if (s == NULL) - goto errout; - memcpy(s->dst, dst, sizeof(s->dst)); - - if (pinfo) { - s->dpi = pinfo->dpi; - s->protocol = pinfo->protocol; - s->tunnelid = pinfo->tunnelid; - } - sp = &data->ht[h1]; - for (nsp = rtnl_dereference(*sp); nsp; - sp = &nsp->next, nsp = rtnl_dereference(*sp)) { - if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask) - break; - } - RCU_INIT_POINTER(s->next, nsp); - rcu_assign_pointer(*sp, s); - - goto insert; - -errout: - tcf_exts_destroy(&f->exts); - kfree(f); -errout2: - tcf_exts_destroy(&e); - return err; -} - -static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg, - bool rtnl_held) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - unsigned int h, h1; - - if (arg->stop) - return; - - for (h = 0; h < 256; h++) { - struct rsvp_session *s; - - for (s = rtnl_dereference(head->ht[h]); s; - s = rtnl_dereference(s->next)) { - for (h1 = 0; h1 <= 16; h1++) { - struct rsvp_filter *f; - - for (f = rtnl_dereference(s->ht[h1]); f; - f = rtnl_dereference(f->next)) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; - return; - } - arg->count++; - } - } - } - } -} - -static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) -{ - struct rsvp_filter *f = fh; - struct rsvp_session *s; - struct nlattr *nest; - struct tc_rsvp_pinfo pinfo; - - if (f == NULL) - return skb->len; - s = f->sess; - - t->tcm_handle = f->handle; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst)) - goto nla_put_failure; - pinfo.dpi = s->dpi; - pinfo.spi = f->spi; - pinfo.protocol = s->protocol; - pinfo.tunnelid = s->tunnelid; - pinfo.tunnelhdr = f->tunnelhdr; - pinfo.pad = 0; - if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo)) - goto nla_put_failure; - if (f->res.classid && - nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid)) - goto nla_put_failure; - if (((f->handle >> 8) & 0xFF) != 16 && - nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) - goto nla_put_failure; - - if (tcf_exts_dump(skb, &f->exts) < 0) - goto nla_put_failure; - - nla_nest_end(skb, nest); - - if (tcf_exts_dump_stats(skb, &f->exts) < 0) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q, - unsigned long base) -{ - struct rsvp_filter *f = fh; - - if (f && f->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &f->res, base); - else - __tcf_unbind_filter(q, &f->res); - } -} - -static struct tcf_proto_ops RSVP_OPS __read_mostly = { - .kind = RSVP_ID, - .classify = rsvp_classify, - .init = rsvp_init, - .destroy = rsvp_destroy, - .get = rsvp_get, - .change = rsvp_change, - .delete = rsvp_delete, - .walk = rsvp_walk, - .dump = rsvp_dump, - .bind_class = rsvp_bind_class, - .owner = THIS_MODULE, -}; - -static int __init init_rsvp(void) -{ - return register_tcf_proto_ops(&RSVP_OPS); -} - -static void __exit exit_rsvp(void) -{ - unregister_tcf_proto_ops(&RSVP_OPS); -} - -module_init(init_rsvp) -module_exit(exit_rsvp) diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c deleted file mode 100644 index 64078846000e..000000000000 --- a/net/sched/cls_rsvp6.c +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/ipv6.h> -#include <linux/skbuff.h> -#include <net/act_api.h> -#include <net/pkt_cls.h> -#include <net/netlink.h> - -#define RSVP_DST_LEN 4 -#define RSVP_ID "rsvp6" -#define RSVP_OPS cls_rsvp6_ops - -#include "cls_rsvp.h" -MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c deleted file mode 100644 index 684187a1fdb9..000000000000 --- a/net/sched/cls_tcindex.c +++ /dev/null @@ -1,738 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/sched/cls_tcindex.c Packet classifier for skb->tc_index - * - * Written 1998,1999 by Werner Almesberger, EPFL ICA - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/refcount.h> -#include <net/act_api.h> -#include <net/netlink.h> -#include <net/pkt_cls.h> -#include <net/sch_generic.h> - -/* - * Passing parameters to the root seems to be done more awkwardly than really - * necessary. At least, u32 doesn't seem to use such dirty hacks. To be - * verified. FIXME. - */ - -#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ -#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ - - -struct tcindex_data; - -struct tcindex_filter_result { - struct tcf_exts exts; - struct tcf_result res; - struct tcindex_data *p; - struct rcu_work rwork; -}; - -struct tcindex_filter { - u16 key; - struct tcindex_filter_result result; - struct tcindex_filter __rcu *next; - struct rcu_work rwork; -}; - - -struct tcindex_data { - struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ - struct tcindex_filter __rcu **h; /* imperfect hash; */ - struct tcf_proto *tp; - u16 mask; /* AND key with mask */ - u32 shift; /* shift ANDed key to the right */ - u32 hash; /* hash table size; 0 if undefined */ - u32 alloc_hash; /* allocated size */ - u32 fall_through; /* 0: only classify if explicit match */ - refcount_t refcnt; /* a temporary refcnt for perfect hash */ - struct rcu_work rwork; -}; - -static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) -{ - return tcf_exts_has_actions(&r->exts) || r->res.classid; -} - -static void tcindex_data_get(struct tcindex_data *p) -{ - refcount_inc(&p->refcnt); -} - -static void tcindex_data_put(struct tcindex_data *p) -{ - if (refcount_dec_and_test(&p->refcnt)) { - kfree(p->perfect); - kfree(p->h); - kfree(p); - } -} - -static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, - u16 key) -{ - if (p->perfect) { - struct tcindex_filter_result *f = p->perfect + key; - - return tcindex_filter_is_set(f) ? f : NULL; - } else if (p->h) { - struct tcindex_filter __rcu **fp; - struct tcindex_filter *f; - - fp = &p->h[key % p->hash]; - for (f = rcu_dereference_bh_rtnl(*fp); - f; - fp = &f->next, f = rcu_dereference_bh_rtnl(*fp)) - if (f->key == key) - return &f->result; - } - - return NULL; -} - - -static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - struct tcindex_data *p = rcu_dereference_bh(tp->root); - struct tcindex_filter_result *f; - int key = (skb->tc_index & p->mask) >> p->shift; - - pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n", - skb, tp, res, p); - - f = tcindex_lookup(p, key); - if (!f) { - struct Qdisc *q = tcf_block_q(tp->chain->block); - - if (!p->fall_through) - return -1; - res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key); - res->class = 0; - pr_debug("alg 0x%x\n", res->classid); - return 0; - } - *res = f->res; - pr_debug("map 0x%x\n", res->classid); - - return tcf_exts_exec(skb, &f->exts, res); -} - - -static void *tcindex_get(struct tcf_proto *tp, u32 handle) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r; - - pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); - if (p->perfect && handle >= p->alloc_hash) - return NULL; - r = tcindex_lookup(p, handle); - return r && tcindex_filter_is_set(r) ? r : NULL; -} - -static int tcindex_init(struct tcf_proto *tp) -{ - struct tcindex_data *p; - - pr_debug("tcindex_init(tp %p)\n", tp); - p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL); - if (!p) - return -ENOMEM; - - p->mask = 0xffff; - p->hash = DEFAULT_HASH_SIZE; - p->fall_through = 1; - refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */ - - rcu_assign_pointer(tp->root, p); - return 0; -} - -static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) -{ - tcf_exts_destroy(&r->exts); - tcf_exts_put_net(&r->exts); - tcindex_data_put(r->p); -} - -static void tcindex_destroy_rexts_work(struct work_struct *work) -{ - struct tcindex_filter_result *r; - - r = container_of(to_rcu_work(work), - struct tcindex_filter_result, - rwork); - rtnl_lock(); - __tcindex_destroy_rexts(r); - rtnl_unlock(); -} - -static void __tcindex_destroy_fexts(struct tcindex_filter *f) -{ - tcf_exts_destroy(&f->result.exts); - tcf_exts_put_net(&f->result.exts); - kfree(f); -} - -static void tcindex_destroy_fexts_work(struct work_struct *work) -{ - struct tcindex_filter *f = container_of(to_rcu_work(work), - struct tcindex_filter, - rwork); - - rtnl_lock(); - __tcindex_destroy_fexts(f); - rtnl_unlock(); -} - -static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, - bool rtnl_held, struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = arg; - struct tcindex_filter __rcu **walk; - struct tcindex_filter *f = NULL; - - pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p); - if (p->perfect) { - if (!r->res.class) - return -ENOENT; - } else { - int i; - - for (i = 0; i < p->hash; i++) { - walk = p->h + i; - for (f = rtnl_dereference(*walk); f; - walk = &f->next, f = rtnl_dereference(*walk)) { - if (&f->result == r) - goto found; - } - } - return -ENOENT; - -found: - rcu_assign_pointer(*walk, rtnl_dereference(f->next)); - } - tcf_unbind_filter(tp, &r->res); - /* all classifiers are required to call tcf_exts_destroy() after rcu - * grace period, since converted-to-rcu actions are relying on that - * in cleanup() callback - */ - if (f) { - if (tcf_exts_get_net(&f->result.exts)) - tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work); - else - __tcindex_destroy_fexts(f); - } else { - tcindex_data_get(p); - - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - - *last = false; - return 0; -} - -static void tcindex_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - tcindex_data_put(p); -} - -static inline int -valid_perfect_hash(struct tcindex_data *p) -{ - return p->hash > (p->mask >> p->shift); -} - -static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { - [TCA_TCINDEX_HASH] = { .type = NLA_U32 }, - [TCA_TCINDEX_MASK] = { .type = NLA_U16 }, - [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 }, - [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 }, - [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, -}; - -static int tcindex_filter_result_init(struct tcindex_filter_result *r, - struct tcindex_data *p, - struct net *net) -{ - memset(r, 0, sizeof(*r)); - r->p = p; - return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, - TCA_TCINDEX_POLICE); -} - -static void tcindex_free_perfect_hash(struct tcindex_data *cp); - -static void tcindex_partial_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - rtnl_lock(); - if (p->perfect) - tcindex_free_perfect_hash(p); - kfree(p); - rtnl_unlock(); -} - -static void tcindex_free_perfect_hash(struct tcindex_data *cp) -{ - int i; - - for (i = 0; i < cp->hash; i++) - tcf_exts_destroy(&cp->perfect[i].exts); - kfree(cp->perfect); -} - -static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) -{ - int i, err = 0; - - cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), - GFP_KERNEL | __GFP_NOWARN); - if (!cp->perfect) - return -ENOMEM; - - for (i = 0; i < cp->hash; i++) { - err = tcf_exts_init(&cp->perfect[i].exts, net, - TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - goto errout; - cp->perfect[i].p = cp; - } - - return 0; - -errout: - tcindex_free_perfect_hash(cp); - return err; -} - -static int -tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, - u32 handle, struct tcindex_data *p, - struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) -{ - struct tcindex_filter_result new_filter_result, *old_r = r; - struct tcindex_data *cp = NULL, *oldp; - struct tcindex_filter *f = NULL; /* make gcc behave */ - struct tcf_result cr = {}; - int err, balloc = 0; - struct tcf_exts e; - - err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - return err; - err = tcf_exts_validate(net, tp, tb, est, &e, ovr, true, extack); - if (err < 0) - goto errout; - - err = -ENOMEM; - /* tcindex_data attributes must look atomic to classifier/lookup so - * allocate new tcindex data and RCU assign it onto root. Keeping - * perfect hash and hash pointers from old data. - */ - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) - goto errout; - - cp->mask = p->mask; - cp->shift = p->shift; - cp->hash = p->hash; - cp->alloc_hash = p->alloc_hash; - cp->fall_through = p->fall_through; - cp->tp = tp; - refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */ - - if (tb[TCA_TCINDEX_HASH]) - cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - - if (tb[TCA_TCINDEX_MASK]) - cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - - if (tb[TCA_TCINDEX_SHIFT]) { - cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - if (cp->shift > 16) { - err = -EINVAL; - goto errout; - } - } - if (!cp->hash) { - /* Hash not specified, use perfect hash if the upper limit - * of the hashing index is below the threshold. - */ - if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) - cp->hash = (cp->mask >> cp->shift) + 1; - else - cp->hash = DEFAULT_HASH_SIZE; - } - - if (p->perfect) { - int i; - - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout; - cp->alloc_hash = cp->hash; - for (i = 0; i < min(cp->hash, p->hash); i++) - cp->perfect[i].res = p->perfect[i].res; - balloc = 1; - } - cp->h = p->h; - - err = tcindex_filter_result_init(&new_filter_result, cp, net); - if (err < 0) - goto errout_alloc; - if (old_r) - cr = r->res; - - err = -EBUSY; - - /* Hash already allocated, make sure that we still meet the - * requirements for the allocated hash. - */ - if (cp->perfect) { - if (!valid_perfect_hash(cp) || - cp->hash > cp->alloc_hash) - goto errout_alloc; - } else if (cp->h && cp->hash != cp->alloc_hash) { - goto errout_alloc; - } - - err = -EINVAL; - if (tb[TCA_TCINDEX_FALL_THROUGH]) - cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - - if (!cp->perfect && !cp->h) - cp->alloc_hash = cp->hash; - - /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) - * but then, we'd fail handles that may become valid after some future - * mask change. While this is extremely unlikely to ever matter, - * the check below is safer (and also more backwards-compatible). - */ - if (cp->perfect || valid_perfect_hash(cp)) - if (handle >= cp->alloc_hash) - goto errout_alloc; - - - err = -ENOMEM; - if (!cp->perfect && !cp->h) { - if (valid_perfect_hash(cp)) { - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout_alloc; - balloc = 1; - } else { - struct tcindex_filter __rcu **hash; - - hash = kcalloc(cp->hash, - sizeof(struct tcindex_filter *), - GFP_KERNEL); - - if (!hash) - goto errout_alloc; - - cp->h = hash; - balloc = 2; - } - } - - if (cp->perfect) - r = cp->perfect + handle; - else - r = tcindex_lookup(cp, handle) ? : &new_filter_result; - - if (r == &new_filter_result) { - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (!f) - goto errout_alloc; - f->key = handle; - f->next = NULL; - err = tcindex_filter_result_init(&f->result, cp, net); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - - if (tb[TCA_TCINDEX_CLASSID]) { - cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); - tcf_bind_filter(tp, &cr, base); - } - - if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r, cp, net); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - - oldp = p; - r->res = cr; - tcf_exts_change(&r->exts, &e); - - rcu_assign_pointer(tp->root, cp); - - if (r == &new_filter_result) { - struct tcindex_filter *nfp; - struct tcindex_filter __rcu **fp; - - f->result.res = r->res; - tcf_exts_change(&f->result.exts, &r->exts); - - fp = cp->h + (handle % cp->hash); - for (nfp = rtnl_dereference(*fp); - nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) - ; /* nothing */ - - rcu_assign_pointer(*fp, f); - } else { - tcf_exts_destroy(&new_filter_result.exts); - } - - if (oldp) - tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work); - return 0; - -errout_alloc: - if (balloc == 1) - tcindex_free_perfect_hash(cp); - else if (balloc == 2) - kfree(cp->h); - tcf_exts_destroy(&new_filter_result.exts); -errout: - kfree(cp); - tcf_exts_destroy(&e); - return err; -} - -static int -tcindex_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) -{ - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = *arg; - int err; - - pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," - "p %p,r %p,*arg %p\n", - tp, handle, tca, arg, opt, p, r, arg ? *arg : NULL); - - if (!opt) - return 0; - - err = nla_parse_nested_deprecated(tb, TCA_TCINDEX_MAX, opt, - tcindex_policy, NULL); - if (err < 0) - return err; - - return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], ovr, extack); -} - -static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, - bool rtnl_held) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter *f, *next; - int i; - - pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p); - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - if (!p->perfect[i].res.class) - continue; - if (walker->count >= walker->skip) { - if (walker->fn(tp, p->perfect + i, walker) < 0) { - walker->stop = 1; - return; - } - } - walker->count++; - } - } - if (!p->h) - return; - for (i = 0; i < p->hash; i++) { - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - if (walker->count >= walker->skip) { - if (walker->fn(tp, &f->result, walker) < 0) { - walker->stop = 1; - return; - } - } - walker->count++; - } - } -} - -static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - int i; - - pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); - - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - struct tcindex_filter_result *r = p->perfect + i; - - /* tcf_queue_work() does not guarantee the ordering we - * want, so we have to take this refcnt temporarily to - * ensure 'p' is freed after all tcindex_filter_result - * here. Imperfect hash does not need this, because it - * uses linked lists rather than an array. - */ - tcindex_data_get(p); - - tcf_unbind_filter(tp, &r->res); - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, - tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - } - - for (i = 0; p->h && i < p->hash; i++) { - struct tcindex_filter *f, *next; - bool last; - - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - tcindex_delete(tp, &f->result, &last, rtnl_held, NULL); - } - } - - tcf_queue_work(&p->rwork, tcindex_destroy_work); -} - - -static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = fh; - struct nlattr *nest; - - pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n", - tp, fh, skb, t, p, r); - pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (!fh) { - t->tcm_handle = ~0; /* whatever ... */ - if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || - nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || - nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || - nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) - goto nla_put_failure; - nla_nest_end(skb, nest); - } else { - if (p->perfect) { - t->tcm_handle = r - p->perfect; - } else { - struct tcindex_filter *f; - struct tcindex_filter __rcu **fp; - int i; - - t->tcm_handle = 0; - for (i = 0; !t->tcm_handle && i < p->hash; i++) { - fp = &p->h[i]; - for (f = rtnl_dereference(*fp); - !t->tcm_handle && f; - fp = &f->next, f = rtnl_dereference(*fp)) { - if (&f->result == r) - t->tcm_handle = f->key; - } - } - } - pr_debug("handle = %d\n", t->tcm_handle); - if (r->res.class && - nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) - goto nla_put_failure; - - if (tcf_exts_dump(skb, &r->exts) < 0) - goto nla_put_failure; - nla_nest_end(skb, nest); - - if (tcf_exts_dump_stats(skb, &r->exts) < 0) - goto nla_put_failure; - } - - return skb->len; - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl, - void *q, unsigned long base) -{ - struct tcindex_filter_result *r = fh; - - if (r && r->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &r->res, base); - else - __tcf_unbind_filter(q, &r->res); - } -} - -static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { - .kind = "tcindex", - .classify = tcindex_classify, - .init = tcindex_init, - .destroy = tcindex_destroy, - .get = tcindex_get, - .change = tcindex_change, - .delete = tcindex_delete, - .walk = tcindex_walk, - .dump = tcindex_dump, - .bind_class = tcindex_bind_class, - .owner = THIS_MODULE, -}; - -static int __init init_tcindex(void) -{ - return register_tcf_proto_ops(&cls_tcindex_ops); -} - -static void __exit exit_tcindex(void) -{ - unregister_tcf_proto_ops(&cls_tcindex_ops); -} - -module_init(init_tcindex) -module_exit(exit_tcindex) -MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ed8d26e6468c..65598207a2fc 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -716,12 +716,18 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) { - int err; + int err, ifindex = -1; err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, true, extack); if (err < 0) return err; + if (tb[TCA_U32_INDEV]) { + ifindex = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); + if (ifindex < 0) + return -EINVAL; + } + if (tb[TCA_U32_LINK]) { u32 handle = nla_get_u32(tb[TCA_U32_LINK]); struct tc_u_hnode *ht_down = NULL, *ht_old; @@ -756,13 +762,9 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &n->res, base); } - if (tb[TCA_U32_INDEV]) { - int ret; - ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); - if (ret < 0) - return -EINVAL; - n->ifindex = ret; - } + if (ifindex >= 0) + n->ifindex = ifindex; + return 0; } @@ -812,7 +814,6 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->ifindex = n->ifindex; new->fshift = n->fshift; - new->res = n->res; new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, ht); @@ -1002,18 +1003,62 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } + /* At this point, we need to derive the new handle that will be used to + * uniquely map the identity of this table match entry. The + * identity of the entry that we need to construct is 32 bits made of: + * htid(12b):bucketid(8b):node/entryid(12b) + * + * At this point _we have the table(ht)_ in which we will insert this + * entry. We carry the table's id in variable "htid". + * Note that earlier code picked the ht selection either by a) the user + * providing the htid specified via TCA_U32_HASH attribute or b) when + * no such attribute is passed then the root ht, is default to at ID + * 0x[800][00][000]. Rule: the root table has a single bucket with ID 0. + * If OTOH the user passed us the htid, they may also pass a bucketid of + * choice. 0 is fine. For example a user htid is 0x[600][01][000] it is + * indicating hash bucketid of 1. Rule: the entry/node ID _cannot_ be + * passed via the htid, so even if it was non-zero it will be ignored. + * + * We may also have a handle, if the user passed one. The handle also + * carries the same addressing of htid(12b):bucketid(8b):node/entryid(12b). + * Rule: the bucketid on the handle is ignored even if one was passed; + * rather the value on "htid" is always assumed to be the bucketid. + */ if (handle) { + /* Rule: The htid from handle and tableid from htid must match */ if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid)) { NL_SET_ERR_MSG_MOD(extack, "Handle specified hash table address mismatch"); return -EINVAL; } - handle = htid | TC_U32_NODE(handle); - err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, handle, - GFP_KERNEL); - if (err) - return err; - } else + /* Ok, so far we have a valid htid(12b):bucketid(8b) but we + * need to finalize the table entry identification with the last + * part - the node/entryid(12b)). Rule: Nodeid _cannot be 0_ for + * entries. Rule: nodeid of 0 is reserved only for tables(see + * earlier code which processes TC_U32_DIVISOR attribute). + * Rule: The nodeid can only be derived from the handle (and not + * htid). + * Rule: if the handle specified zero for the node id example + * 0x60000000, then pick a new nodeid from the pool of IDs + * this hash table has been allocating from. + * If OTOH it is specified (i.e for example the user passed a + * handle such as 0x60000123), then we use it generate our final + * handle which is used to uniquely identify the match entry. + */ + if (!TC_U32_NODE(handle)) { + handle = gen_new_kid(ht, htid); + } else { + handle = htid | TC_U32_NODE(handle); + err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, + handle, GFP_KERNEL); + if (err) + return err; + } + } else { + /* The user did not give us a handle; lets just generate one + * from the table's pool of nodeids. + */ handle = gen_new_kid(ht, htid); + } if (tb[TCA_U32_SEL] == NULL) { NL_SET_ERR_MSG_MOD(extack, "Selector not specified"); diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 6f3c1fb2fb44..f176afb70559 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -97,8 +97,10 @@ retry: static void em_text_destroy(struct tcf_ematch *m) { - if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) + if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) { textsearch_destroy(EM_TEXT_PRIV(m)->config); + kfree(EM_TEXT_PRIV(m)); + } } static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) diff --git a/net/sched/ematch.c b/net/sched/ematch.c index dd3b8c11a2e0..43bfb33629e9 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -255,6 +255,8 @@ static int tcf_em_validate(struct tcf_proto *tp, * the value carried. */ if (em_hdr->flags & TCF_EM_SIMPLE) { + if (em->ops->datalen > 0) + goto errout; if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 6f36df85d23d..d07146a2d0bb 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -298,7 +298,7 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); if (q) goto out; @@ -317,7 +317,7 @@ struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); if (q) goto out; @@ -1071,11 +1071,12 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, skip: if (!ingress) { - notify_and_destroy(net, skb, n, classid, - dev->qdisc, new); + old = rtnl_dereference(dev->qdisc); if (new && !new->ops->attach) qdisc_refcount_inc(new); - dev->qdisc = new ? : &noop_qdisc; + rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); + + notify_and_destroy(net, skb, n, classid, old, new); if (new && new->ops->attach) new->ops->attach(new); @@ -1104,6 +1105,11 @@ skip: return -ENOENT; } + if (new && new->ops == &noqueue_qdisc_ops) { + NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class"); + return -EINVAL; + } + err = cops->graft(parent, cl, new, &old, extack); if (err) return err; @@ -1208,7 +1214,12 @@ static struct Qdisc *qdisc_create(struct net_device *dev, sch->parent = parent; if (handle == TC_H_INGRESS) { - sch->flags |= TCQ_F_INGRESS; + if (!(sch->flags & TCQ_F_INGRESS)) { + NL_SET_ERR_MSG(extack, + "Specified parent ID is reserved for ingress and clsact Qdiscs"); + err = -EINVAL; + goto err_out3; + } handle = TC_H_MAKE(TC_H_INGRESS, 0); } else { if (handle == 0) { @@ -1450,7 +1461,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } if (!q) { NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); @@ -1492,10 +1503,28 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, return 0; } +static bool req_create_or_replace(struct nlmsghdr *n) +{ + return (n->nlmsg_flags & NLM_F_CREATE && + n->nlmsg_flags & NLM_F_REPLACE); +} + +static bool req_create_exclusive(struct nlmsghdr *n) +{ + return (n->nlmsg_flags & NLM_F_CREATE && + n->nlmsg_flags & NLM_F_EXCL); +} + +static bool req_change(struct nlmsghdr *n) +{ + return (!(n->nlmsg_flags & NLM_F_CREATE) && + !(n->nlmsg_flags & NLM_F_REPLACE) && + !(n->nlmsg_flags & NLM_F_EXCL)); +} + /* * Create/change qdisc. */ - static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -1539,7 +1568,7 @@ replay: q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } /* It may be default qdisc, ignore it */ @@ -1568,11 +1597,20 @@ replay: NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; } + if (q->flags & TCQ_F_INGRESS) { + NL_SET_ERR_MSG(extack, + "Cannot regraft ingress or clsact Qdiscs"); + return -EINVAL; + } if (q == p || (p && check_loop(q, p, 0))) { NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); return -ELOOP; } + if (clid == TC_H_INGRESS) { + NL_SET_ERR_MSG(extack, "Ingress cannot graft directly"); + return -EINVAL; + } qdisc_refcount_inc(q); goto graft; } else { @@ -1583,27 +1621,35 @@ replay: * * We know, that some child q is already * attached to this parent and have choice: - * either to change it or to create/graft new one. + * 1) change it or 2) create/graft new one. + * If the requested qdisc kind is different + * than the existing one, then we choose graft. + * If they are the same then this is "change" + * operation - just let it fallthrough.. * * 1. We are allowed to create/graft only - * if CREATE and REPLACE flags are set. + * if the request is explicitly stating + * "please create if it doesn't exist". * - * 2. If EXCL is set, requestor wanted to say, - * that qdisc tcm_handle is not expected + * 2. If the request is to exclusive create + * then the qdisc tcm_handle is not expected * to exist, so that we choose create/graft too. * * 3. The last case is when no flags are set. + * This will happen when for example tc + * utility issues a "change" command. * Alas, it is sort of hole in API, we * cannot decide what to do unambiguously. - * For now we select create/graft, if - * user gave KIND, which does not match existing. + * For now we select create/graft. */ - if ((n->nlmsg_flags & NLM_F_CREATE) && - (n->nlmsg_flags & NLM_F_REPLACE) && - ((n->nlmsg_flags & NLM_F_EXCL) || - (tca[TCA_KIND] && - nla_strcmp(tca[TCA_KIND], q->ops->id)))) - goto create_n_graft; + if (tca[TCA_KIND] && + nla_strcmp(tca[TCA_KIND], q->ops->id)) { + if (req_create_or_replace(n) || + req_create_exclusive(n)) + goto create_n_graft; + else if (req_change(n)) + goto create_n_graft2; + } } } } else { @@ -1637,6 +1683,7 @@ create_n_graft: NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); return -ENOENT; } +create_n_graft2: if (clid == TC_H_INGRESS) { if (dev_ingress_queue(dev)) { q = qdisc_create(dev, dev_ingress_queue(dev), p, @@ -1761,7 +1808,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; - if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, + if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), + skb, cb, &q_idx, s_q_idx, true, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; @@ -2037,7 +2085,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, } else if (qid1) { qid = qid1; } else if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; /* Now qid is genuine qdisc handle consistent * both with parent and child. @@ -2048,7 +2096,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; } /* OK. Locate qdisc */ @@ -2209,7 +2257,8 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; t = 0; - if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0) + if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc), + skb, tcm, cb, &t, s_t, true) < 0) goto done; dev_queue = dev_ingress_queue(dev); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c deleted file mode 100644 index 6385995dc700..000000000000 --- a/net/sched/sch_atm.c +++ /dev/null @@ -1,707 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */ - -/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <linux/atmdev.h> -#include <linux/atmclip.h> -#include <linux/rtnetlink.h> -#include <linux/file.h> /* for fput */ -#include <net/netlink.h> -#include <net/pkt_sched.h> -#include <net/pkt_cls.h> - -/* - * The ATM queuing discipline provides a framework for invoking classifiers - * (aka "filters"), which in turn select classes of this queuing discipline. - * Each class maps the flow(s) it is handling to a given VC. Multiple classes - * may share the same VC. - * - * When creating a class, VCs are specified by passing the number of the open - * socket descriptor by which the calling process references the VC. The kernel - * keeps the VC open at least until all classes using it are removed. - * - * In this file, most functions are named atm_tc_* to avoid confusion with all - * the atm_* in net/atm. This naming convention differs from what's used in the - * rest of net/sched. - * - * Known bugs: - * - sometimes messes up the IP stack - * - any manipulations besides the few operations described in the README, are - * untested and likely to crash the system - * - should lock the flow while there is data in the queue (?) - */ - -#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) - -struct atm_flow_data { - struct Qdisc_class_common common; - struct Qdisc *q; /* FIFO, TBF, etc. */ - struct tcf_proto __rcu *filter_list; - struct tcf_block *block; - struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ - void (*old_pop)(struct atm_vcc *vcc, - struct sk_buff *skb); /* chaining */ - struct atm_qdisc_data *parent; /* parent qdisc */ - struct socket *sock; /* for closing */ - int ref; /* reference count */ - struct gnet_stats_basic_packed bstats; - struct gnet_stats_queue qstats; - struct list_head list; - struct atm_flow_data *excess; /* flow for excess traffic; - NULL to set CLP instead */ - int hdr_len; - unsigned char hdr[0]; /* header data; MUST BE LAST */ -}; - -struct atm_qdisc_data { - struct atm_flow_data link; /* unclassified skbs go here */ - struct list_head flows; /* NB: "link" is also on this - list */ - struct tasklet_struct task; /* dequeue tasklet */ -}; - -/* ------------------------- Class/flow operations ------------------------- */ - -static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - - list_for_each_entry(flow, &p->flows, list) { - if (flow->common.classid == classid) - return flow; - } - return NULL; -} - -static int atm_tc_graft(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - - pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n", - sch, p, flow, new, old); - if (list_empty(&flow->list)) - return -EINVAL; - if (!new) - new = &noop_qdisc; - *old = flow->q; - flow->q = new; - if (*old) - qdisc_reset(*old); - return 0; -} - -static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl) -{ - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - - pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow); - return flow ? flow->q : NULL; -} - -static unsigned long atm_tc_find(struct Qdisc *sch, u32 classid) -{ - struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid); - flow = lookup_flow(sch, classid); - pr_debug("%s: flow %p\n", __func__, flow); - return (unsigned long)flow; -} - -static unsigned long atm_tc_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) -{ - struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid); - flow = lookup_flow(sch, classid); - if (flow) - flow->ref++; - pr_debug("%s: flow %p\n", __func__, flow); - return (unsigned long)flow; -} - -/* - * atm_tc_put handles all destructions, including the ones that are explicitly - * requested (atm_tc_destroy, etc.). The assumption here is that we never drop - * anything that still seems to be in use. - */ -static void atm_tc_put(struct Qdisc *sch, unsigned long cl) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - - pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - if (--flow->ref) - return; - pr_debug("atm_tc_put: destroying\n"); - list_del_init(&flow->list); - pr_debug("atm_tc_put: qdisc %p\n", flow->q); - qdisc_put(flow->q); - tcf_block_put(flow->block); - if (flow->sock) { - pr_debug("atm_tc_put: f_count %ld\n", - file_count(flow->sock->file)); - flow->vcc->pop = flow->old_pop; - sockfd_put(flow->sock); - } - if (flow->excess) - atm_tc_put(sch, (unsigned long)flow->excess); - if (flow != &p->link) - kfree(flow); - /* - * If flow == &p->link, the qdisc no longer works at this point and - * needs to be removed. (By the caller of atm_tc_put.) - */ -} - -static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb) -{ - struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; - - pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p); - VCC2FLOW(vcc)->old_pop(vcc, skb); - tasklet_schedule(&p->task); -} - -static const u8 llc_oui_ip[] = { - 0xaa, /* DSAP: non-ISO */ - 0xaa, /* SSAP: non-ISO */ - 0x03, /* Ctrl: Unnumbered Information Command PDU */ - 0x00, /* OUI: EtherType */ - 0x00, 0x00, - 0x08, 0x00 -}; /* Ethertype IP (0800) */ - -static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = { - [TCA_ATM_FD] = { .type = NLA_U32 }, - [TCA_ATM_EXCESS] = { .type = NLA_U32 }, -}; - -static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, - struct nlattr **tca, unsigned long *arg, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)*arg; - struct atm_flow_data *excess = NULL; - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_ATM_MAX + 1]; - struct socket *sock; - int fd, error, hdr_len; - void *hdr; - - pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," - "flow %p,opt %p)\n", sch, p, classid, parent, flow, opt); - /* - * The concept of parents doesn't apply for this qdisc. - */ - if (parent && parent != TC_H_ROOT && parent != sch->handle) - return -EINVAL; - /* - * ATM classes cannot be changed. In order to change properties of the - * ATM connection, that socket needs to be modified directly (via the - * native ATM API. In order to send a flow to a different VC, the old - * class needs to be removed and a new one added. (This may be changed - * later.) - */ - if (flow) - return -EBUSY; - if (opt == NULL) - return -EINVAL; - - error = nla_parse_nested_deprecated(tb, TCA_ATM_MAX, opt, atm_policy, - NULL); - if (error < 0) - return error; - - if (!tb[TCA_ATM_FD]) - return -EINVAL; - fd = nla_get_u32(tb[TCA_ATM_FD]); - pr_debug("atm_tc_change: fd %d\n", fd); - if (tb[TCA_ATM_HDR]) { - hdr_len = nla_len(tb[TCA_ATM_HDR]); - hdr = nla_data(tb[TCA_ATM_HDR]); - } else { - hdr_len = RFC1483LLC_LEN; - hdr = NULL; /* default LLC/SNAP for IP */ - } - if (!tb[TCA_ATM_EXCESS]) - excess = NULL; - else { - excess = (struct atm_flow_data *) - atm_tc_find(sch, nla_get_u32(tb[TCA_ATM_EXCESS])); - if (!excess) - return -ENOENT; - } - pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n", - opt->nla_type, nla_len(opt), hdr_len); - sock = sockfd_lookup(fd, &error); - if (!sock) - return error; /* f_count++ */ - pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file)); - if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { - error = -EPROTOTYPE; - goto err_out; - } - /* @@@ should check if the socket is really operational or we'll crash - on vcc->send */ - if (classid) { - if (TC_H_MAJ(classid ^ sch->handle)) { - pr_debug("atm_tc_change: classid mismatch\n"); - error = -EINVAL; - goto err_out; - } - } else { - int i; - unsigned long cl; - - for (i = 1; i < 0x8000; i++) { - classid = TC_H_MAKE(sch->handle, 0x8000 | i); - cl = atm_tc_find(sch, classid); - if (!cl) - break; - } - } - pr_debug("atm_tc_change: new id %x\n", classid); - flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL); - pr_debug("atm_tc_change: flow %p\n", flow); - if (!flow) { - error = -ENOBUFS; - goto err_out; - } - - error = tcf_block_get(&flow->block, &flow->filter_list, sch, - extack); - if (error) { - kfree(flow); - goto err_out; - } - - flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, - extack); - if (!flow->q) - flow->q = &noop_qdisc; - pr_debug("atm_tc_change: qdisc %p\n", flow->q); - flow->sock = sock; - flow->vcc = ATM_SD(sock); /* speedup */ - flow->vcc->user_back = flow; - pr_debug("atm_tc_change: vcc %p\n", flow->vcc); - flow->old_pop = flow->vcc->pop; - flow->parent = p; - flow->vcc->pop = sch_atm_pop; - flow->common.classid = classid; - flow->ref = 1; - flow->excess = excess; - list_add(&flow->list, &p->link.list); - flow->hdr_len = hdr_len; - if (hdr) - memcpy(flow->hdr, hdr, hdr_len); - else - memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip)); - *arg = (unsigned long)flow; - return 0; -err_out: - sockfd_put(sock); - return error; -} - -static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - - pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - if (list_empty(&flow->list)) - return -EINVAL; - if (rcu_access_pointer(flow->filter_list) || flow == &p->link) - return -EBUSY; - /* - * Reference count must be 2: one for "keepalive" (set at class - * creation), and one for the reference held when calling delete. - */ - if (flow->ref < 2) { - pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref); - return -EINVAL; - } - if (flow->ref > 2) - return -EBUSY; /* catch references via excess, etc. */ - atm_tc_put(sch, arg); - return 0; -} - -static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); - if (walker->stop) - return; - list_for_each_entry(flow, &p->flows, list) { - if (walker->count >= walker->skip && - walker->fn(sch, (unsigned long)flow, walker) < 0) { - walker->stop = 1; - break; - } - walker->count++; - } -} - -static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - - pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - return flow ? flow->block : p->link.block; -} - -/* --------------------------- Qdisc operations ---------------------------- */ - -static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - struct tcf_result res; - int result; - int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - - pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); - result = TC_ACT_OK; /* be nice to gcc */ - flow = NULL; - if (TC_H_MAJ(skb->priority) != sch->handle || - !(flow = (struct atm_flow_data *)atm_tc_find(sch, skb->priority))) { - struct tcf_proto *fl; - - list_for_each_entry(flow, &p->flows, list) { - fl = rcu_dereference_bh(flow->filter_list); - if (fl) { - result = tcf_classify(skb, fl, &res, true); - if (result < 0) - continue; - flow = (struct atm_flow_data *)res.class; - if (!flow) - flow = lookup_flow(sch, res.classid); - goto done; - } - } - flow = NULL; -done: - ; - } - if (!flow) { - flow = &p->link; - } else { - if (flow->vcc) - ATM_SKB(skb)->atm_options = flow->vcc->atm_options; - /*@@@ looks good ... but it's not supposed to work :-) */ -#ifdef CONFIG_NET_CLS_ACT - switch (result) { - case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - case TC_ACT_TRAP: - __qdisc_drop(skb, to_free); - return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - case TC_ACT_SHOT: - __qdisc_drop(skb, to_free); - goto drop; - case TC_ACT_RECLASSIFY: - if (flow->excess) - flow = flow->excess; - else - ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP; - break; - } -#endif - } - - ret = qdisc_enqueue(skb, flow->q, to_free); - if (ret != NET_XMIT_SUCCESS) { -drop: __maybe_unused - if (net_xmit_drop_count(ret)) { - qdisc_qstats_drop(sch); - if (flow) - flow->qstats.drops++; - } - return ret; - } - /* - * Okay, this may seem weird. We pretend we've dropped the packet if - * it goes via ATM. The reason for this is that the outer qdisc - * expects to be able to q->dequeue the packet later on if we return - * success at this place. Also, sch->q.qdisc needs to reflect whether - * there is a packet egligible for dequeuing or not. Note that the - * statistics of the outer qdisc are necessarily wrong because of all - * this. There's currently no correct solution for this. - */ - if (flow == &p->link) { - sch->q.qlen++; - return NET_XMIT_SUCCESS; - } - tasklet_schedule(&p->task); - return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; -} - -/* - * Dequeue packets and send them over ATM. Note that we quite deliberately - * avoid checking net_device's flow control here, simply because sch_atm - * uses its own channels, which have nothing to do with any CLIP/LANE/or - * non-ATM interfaces. - */ - -static void sch_atm_dequeue(unsigned long data) -{ - struct Qdisc *sch = (struct Qdisc *)data; - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - struct sk_buff *skb; - - pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) { - if (flow == &p->link) - continue; - /* - * If traffic is properly shaped, this won't generate nasty - * little bursts. Otherwise, it may ... (but that's okay) - */ - while ((skb = flow->q->ops->peek(flow->q))) { - if (!atm_may_send(flow->vcc, skb->truesize)) - break; - - skb = qdisc_dequeue_peeked(flow->q); - if (unlikely(!skb)) - break; - - qdisc_bstats_update(sch, skb); - bstats_update(&flow->bstats, skb); - pr_debug("atm_tc_dequeue: sending on class %p\n", flow); - /* remove any LL header somebody else has attached */ - skb_pull(skb, skb_network_offset(skb)); - if (skb_headroom(skb) < flow->hdr_len) { - struct sk_buff *new; - - new = skb_realloc_headroom(skb, flow->hdr_len); - dev_kfree_skb(skb); - if (!new) - continue; - skb = new; - } - pr_debug("sch_atm_dequeue: ip %p, data %p\n", - skb_network_header(skb), skb->data); - ATM_SKB(skb)->vcc = flow->vcc; - memcpy(skb_push(skb, flow->hdr_len), flow->hdr, - flow->hdr_len); - refcount_add(skb->truesize, - &sk_atm(flow->vcc)->sk_wmem_alloc); - /* atm.atm_options are already set by atm_tc_enqueue */ - flow->vcc->send(flow->vcc, skb); - } - } -} - -static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct sk_buff *skb; - - pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p); - tasklet_schedule(&p->task); - skb = qdisc_dequeue_peeked(p->link.q); - if (skb) - sch->q.qlen--; - return skb; -} - -static struct sk_buff *atm_tc_peek(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - - pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p); - - return p->link.q->ops->peek(p->link.q); -} - -static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - int err; - - pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); - INIT_LIST_HEAD(&p->flows); - INIT_LIST_HEAD(&p->link.list); - list_add(&p->link.list, &p->flows); - p->link.q = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, sch->handle, extack); - if (!p->link.q) - p->link.q = &noop_qdisc; - pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - p->link.vcc = NULL; - p->link.sock = NULL; - p->link.common.classid = sch->handle; - p->link.ref = 1; - - err = tcf_block_get(&p->link.block, &p->link.filter_list, sch, - extack); - if (err) - return err; - - tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch); - return 0; -} - -static void atm_tc_reset(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) - qdisc_reset(flow->q); - sch->q.qlen = 0; -} - -static void atm_tc_destroy(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow, *tmp; - - pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) { - tcf_block_put(flow->block); - flow->block = NULL; - } - - list_for_each_entry_safe(flow, tmp, &p->flows, list) { - if (flow->ref > 1) - pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref); - atm_tc_put(sch, (unsigned long)flow); - } - tasklet_kill(&p->task); -} - -static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - struct nlattr *nest; - - pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", - sch, p, flow, skb, tcm); - if (list_empty(&flow->list)) - return -EINVAL; - tcm->tcm_handle = flow->common.classid; - tcm->tcm_info = flow->q->handle; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr)) - goto nla_put_failure; - if (flow->vcc) { - struct sockaddr_atmpvc pvc; - int state; - - memset(&pvc, 0, sizeof(pvc)); - pvc.sap_family = AF_ATMPVC; - pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; - pvc.sap_addr.vpi = flow->vcc->vpi; - pvc.sap_addr.vci = flow->vcc->vci; - if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc)) - goto nla_put_failure; - state = ATM_VF2VS(flow->vcc->flags); - if (nla_put_u32(skb, TCA_ATM_STATE, state)) - goto nla_put_failure; - } - if (flow->excess) { - if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->common.classid)) - goto nla_put_failure; - } else { - if (nla_put_u32(skb, TCA_ATM_EXCESS, 0)) - goto nla_put_failure; - } - return nla_nest_end(skb, nest); - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} -static int -atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) -{ - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &flow->bstats) < 0 || - gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) - return -1; - - return 0; -} - -static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - return 0; -} - -static const struct Qdisc_class_ops atm_class_ops = { - .graft = atm_tc_graft, - .leaf = atm_tc_leaf, - .find = atm_tc_find, - .change = atm_tc_change, - .delete = atm_tc_delete, - .walk = atm_tc_walk, - .tcf_block = atm_tc_tcf_block, - .bind_tcf = atm_tc_bind_filter, - .unbind_tcf = atm_tc_put, - .dump = atm_tc_dump_class, - .dump_stats = atm_tc_dump_class_stats, -}; - -static struct Qdisc_ops atm_qdisc_ops __read_mostly = { - .cl_ops = &atm_class_ops, - .id = "atm", - .priv_size = sizeof(struct atm_qdisc_data), - .enqueue = atm_tc_enqueue, - .dequeue = atm_tc_dequeue, - .peek = atm_tc_peek, - .init = atm_tc_init, - .reset = atm_tc_reset, - .destroy = atm_tc_destroy, - .dump = atm_tc_dump, - .owner = THIS_MODULE, -}; - -static int __init atm_init(void) -{ - return register_qdisc(&atm_qdisc_ops); -} - -static void __exit atm_exit(void) -{ - unregister_qdisc(&atm_qdisc_ops); -} - -module_init(atm_init) -module_exit(atm_exit) -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 0eb4d4a568f7..7ee0b57b3ed4 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1660,7 +1660,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct cake_sched_data *q = qdisc_priv(sch); int len = qdisc_pkt_len(skb); - int uninitialized_var(ret); + int ret; struct sk_buff *ack = NULL; ktime_t now = ktime_get(); struct cake_tin_data *b; @@ -2190,8 +2190,12 @@ retry: static void cake_reset(struct Qdisc *sch) { + struct cake_sched_data *q = qdisc_priv(sch); u32 c; + if (!q->tins) + return; + for (c = 0; c < CAKE_MAX_TINS; c++) cake_clear_tin(sch, c); } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c deleted file mode 100644 index e5972889cd81..000000000000 --- a/net/sched/sch_cbq.c +++ /dev/null @@ -1,1817 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/sch_cbq.c Class-Based Queueing discipline. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <net/netlink.h> -#include <net/pkt_sched.h> -#include <net/pkt_cls.h> - - -/* Class-Based Queueing (CBQ) algorithm. - ======================================= - - Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource - Management Models for Packet Networks", - IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 - - [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 - - [3] Sally Floyd, "Notes on Class-Based Queueing: Setting - Parameters", 1996 - - [4] Sally Floyd and Michael Speer, "Experimental Results - for Class-Based Queueing", 1998, not published. - - ----------------------------------------------------------------------- - - Algorithm skeleton was taken from NS simulator cbq.cc. - If someone wants to check this code against the LBL version, - he should take into account that ONLY the skeleton was borrowed, - the implementation is different. Particularly: - - --- The WRR algorithm is different. Our version looks more - reasonable (I hope) and works when quanta are allowed to be - less than MTU, which is always the case when real time classes - have small rates. Note, that the statement of [3] is - incomplete, delay may actually be estimated even if class - per-round allotment is less than MTU. Namely, if per-round - allotment is W*r_i, and r_1+...+r_k = r < 1 - - delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B - - In the worst case we have IntServ estimate with D = W*r+k*MTU - and C = MTU*r. The proof (if correct at all) is trivial. - - - --- It seems that cbq-2.0 is not very accurate. At least, I cannot - interpret some places, which look like wrong translations - from NS. Anyone is advised to find these differences - and explain to me, why I am wrong 8). - - --- Linux has no EOI event, so that we cannot estimate true class - idle time. Workaround is to consider the next dequeue event - as sign that previous packet is finished. This is wrong because of - internal device queueing, but on a permanently loaded link it is true. - Moreover, combined with clock integrator, this scheme looks - very close to an ideal solution. */ - -struct cbq_sched_data; - - -struct cbq_class { - struct Qdisc_class_common common; - struct cbq_class *next_alive; /* next class with backlog in this priority band */ - -/* Parameters */ - unsigned char priority; /* class priority */ - unsigned char priority2; /* priority to be used after overlimit */ - unsigned char ewma_log; /* time constant for idle time calculation */ - - u32 defmap; - - /* Link-sharing scheduler parameters */ - long maxidle; /* Class parameters: see below. */ - long offtime; - long minidle; - u32 avpkt; - struct qdisc_rate_table *R_tab; - - /* General scheduler (WRR) parameters */ - long allot; - long quantum; /* Allotment per WRR round */ - long weight; /* Relative allotment: see below */ - - struct Qdisc *qdisc; /* Ptr to CBQ discipline */ - struct cbq_class *split; /* Ptr to split node */ - struct cbq_class *share; /* Ptr to LS parent in the class tree */ - struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ - struct cbq_class *borrow; /* NULL if class is bandwidth limited; - parent otherwise */ - struct cbq_class *sibling; /* Sibling chain */ - struct cbq_class *children; /* Pointer to children chain */ - - struct Qdisc *q; /* Elementary queueing discipline */ - - -/* Variables */ - unsigned char cpriority; /* Effective priority */ - unsigned char delayed; - unsigned char level; /* level of the class in hierarchy: - 0 for leaf classes, and maximal - level of children + 1 for nodes. - */ - - psched_time_t last; /* Last end of service */ - psched_time_t undertime; - long avgidle; - long deficit; /* Saved deficit for WRR */ - psched_time_t penalized; - struct gnet_stats_basic_packed bstats; - struct gnet_stats_queue qstats; - struct net_rate_estimator __rcu *rate_est; - struct tc_cbq_xstats xstats; - - struct tcf_proto __rcu *filter_list; - struct tcf_block *block; - - int filters; - - struct cbq_class *defaults[TC_PRIO_MAX + 1]; -}; - -struct cbq_sched_data { - struct Qdisc_class_hash clhash; /* Hash table of all classes */ - int nclasses[TC_CBQ_MAXPRIO + 1]; - unsigned int quanta[TC_CBQ_MAXPRIO + 1]; - - struct cbq_class link; - - unsigned int activemask; - struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes - with backlog */ - -#ifdef CONFIG_NET_CLS_ACT - struct cbq_class *rx_class; -#endif - struct cbq_class *tx_class; - struct cbq_class *tx_borrowed; - int tx_len; - psched_time_t now; /* Cached timestamp */ - unsigned int pmask; - - struct hrtimer delay_timer; - struct qdisc_watchdog watchdog; /* Watchdog timer, - started when CBQ has - backlog, but cannot - transmit just now */ - psched_tdiff_t wd_expires; - int toplevel; - u32 hgenerator; -}; - - -#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len) - -static inline struct cbq_class * -cbq_class_lookup(struct cbq_sched_data *q, u32 classid) -{ - struct Qdisc_class_common *clc; - - clc = qdisc_class_find(&q->clhash, classid); - if (clc == NULL) - return NULL; - return container_of(clc, struct cbq_class, common); -} - -#ifdef CONFIG_NET_CLS_ACT - -static struct cbq_class * -cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) -{ - struct cbq_class *cl; - - for (cl = this->tparent; cl; cl = cl->tparent) { - struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT]; - - if (new != NULL && new != this) - return new; - } - return NULL; -} - -#endif - -/* Classify packet. The procedure is pretty complicated, but - * it allows us to combine link sharing and priority scheduling - * transparently. - * - * Namely, you can put link sharing rules (f.e. route based) at root of CBQ, - * so that it resolves to split nodes. Then packets are classified - * by logical priority, or a more specific classifier may be attached - * to the split node. - */ - -static struct cbq_class * -cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *head = &q->link; - struct cbq_class **defmap; - struct cbq_class *cl = NULL; - u32 prio = skb->priority; - struct tcf_proto *fl; - struct tcf_result res; - - /* - * Step 1. If skb->priority points to one of our classes, use it. - */ - if (TC_H_MAJ(prio ^ sch->handle) == 0 && - (cl = cbq_class_lookup(q, prio)) != NULL) - return cl; - - *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - for (;;) { - int result = 0; - defmap = head->defaults; - - fl = rcu_dereference_bh(head->filter_list); - /* - * Step 2+n. Apply classifier. - */ - result = tcf_classify(skb, fl, &res, true); - if (!fl || result < 0) - goto fallback; - - cl = (void *)res.class; - if (!cl) { - if (TC_H_MAJ(res.classid)) - cl = cbq_class_lookup(q, res.classid); - else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) - cl = defmap[TC_PRIO_BESTEFFORT]; - - if (cl == NULL) - goto fallback; - } - if (cl->level >= head->level) - goto fallback; -#ifdef CONFIG_NET_CLS_ACT - switch (result) { - case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - case TC_ACT_TRAP: - *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ - case TC_ACT_SHOT: - return NULL; - case TC_ACT_RECLASSIFY: - return cbq_reclassify(skb, cl); - } -#endif - if (cl->level == 0) - return cl; - - /* - * Step 3+n. If classifier selected a link sharing class, - * apply agency specific classifier. - * Repeat this procdure until we hit a leaf node. - */ - head = cl; - } - -fallback: - cl = head; - - /* - * Step 4. No success... - */ - if (TC_H_MAJ(prio) == 0 && - !(cl = head->defaults[prio & TC_PRIO_MAX]) && - !(cl = head->defaults[TC_PRIO_BESTEFFORT])) - return head; - - return cl; -} - -/* - * A packet has just been enqueued on the empty class. - * cbq_activate_class adds it to the tail of active class list - * of its priority band. - */ - -static inline void cbq_activate_class(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - int prio = cl->cpriority; - struct cbq_class *cl_tail; - - cl_tail = q->active[prio]; - q->active[prio] = cl; - - if (cl_tail != NULL) { - cl->next_alive = cl_tail->next_alive; - cl_tail->next_alive = cl; - } else { - cl->next_alive = cl; - q->activemask |= (1<<prio); - } -} - -/* - * Unlink class from active chain. - * Note that this same procedure is done directly in cbq_dequeue* - * during round-robin procedure. - */ - -static void cbq_deactivate_class(struct cbq_class *this) -{ - struct cbq_sched_data *q = qdisc_priv(this->qdisc); - int prio = this->cpriority; - struct cbq_class *cl; - struct cbq_class *cl_prev = q->active[prio]; - - do { - cl = cl_prev->next_alive; - if (cl == this) { - cl_prev->next_alive = cl->next_alive; - cl->next_alive = NULL; - - if (cl == q->active[prio]) { - q->active[prio] = cl_prev; - if (cl == q->active[prio]) { - q->active[prio] = NULL; - q->activemask &= ~(1<<prio); - return; - } - } - return; - } - } while ((cl_prev = cl) != q->active[prio]); -} - -static void -cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) -{ - int toplevel = q->toplevel; - - if (toplevel > cl->level) { - psched_time_t now = psched_get_time(); - - do { - if (cl->undertime < now) { - q->toplevel = cl->level; - return; - } - } while ((cl = cl->borrow) != NULL && toplevel > cl->level); - } -} - -static int -cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - int uninitialized_var(ret); - struct cbq_class *cl = cbq_classify(skb, sch, &ret); - -#ifdef CONFIG_NET_CLS_ACT - q->rx_class = cl; -#endif - if (cl == NULL) { - if (ret & __NET_XMIT_BYPASS) - qdisc_qstats_drop(sch); - __qdisc_drop(skb, to_free); - return ret; - } - - ret = qdisc_enqueue(skb, cl->q, to_free); - if (ret == NET_XMIT_SUCCESS) { - sch->q.qlen++; - cbq_mark_toplevel(q, cl); - if (!cl->next_alive) - cbq_activate_class(cl); - return ret; - } - - if (net_xmit_drop_count(ret)) { - qdisc_qstats_drop(sch); - cbq_mark_toplevel(q, cl); - cl->qstats.drops++; - } - return ret; -} - -/* Overlimit action: penalize leaf class by adding offtime */ -static void cbq_overlimit(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = cl->undertime - q->now; - - if (!cl->delayed) { - delay += cl->offtime; - - /* - * Class goes to sleep, so that it will have no - * chance to work avgidle. Let's forgive it 8) - * - * BTW cbq-2.0 has a crap in this - * place, apparently they forgot to shift it by cl->ewma_log. - */ - if (cl->avgidle < 0) - delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); - if (cl->avgidle < cl->minidle) - cl->avgidle = cl->minidle; - if (delay <= 0) - delay = 1; - cl->undertime = q->now + delay; - - cl->xstats.overactions++; - cl->delayed = 1; - } - if (q->wd_expires == 0 || q->wd_expires > delay) - q->wd_expires = delay; - - /* Dirty work! We must schedule wakeups based on - * real available rate, rather than leaf rate, - * which may be tiny (even zero). - */ - if (q->toplevel == TC_CBQ_MAXLEVEL) { - struct cbq_class *b; - psched_tdiff_t base_delay = q->wd_expires; - - for (b = cl->borrow; b; b = b->borrow) { - delay = b->undertime - q->now; - if (delay < base_delay) { - if (delay <= 0) - delay = 1; - base_delay = delay; - } - } - - q->wd_expires = base_delay; - } -} - -static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, - psched_time_t now) -{ - struct cbq_class *cl; - struct cbq_class *cl_prev = q->active[prio]; - psched_time_t sched = now; - - if (cl_prev == NULL) - return 0; - - do { - cl = cl_prev->next_alive; - if (now - cl->penalized > 0) { - cl_prev->next_alive = cl->next_alive; - cl->next_alive = NULL; - cl->cpriority = cl->priority; - cl->delayed = 0; - cbq_activate_class(cl); - - if (cl == q->active[prio]) { - q->active[prio] = cl_prev; - if (cl == q->active[prio]) { - q->active[prio] = NULL; - return 0; - } - } - - cl = cl_prev->next_alive; - } else if (sched - cl->penalized > 0) - sched = cl->penalized; - } while ((cl_prev = cl) != q->active[prio]); - - return sched - now; -} - -static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) -{ - struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data, - delay_timer); - struct Qdisc *sch = q->watchdog.qdisc; - psched_time_t now; - psched_tdiff_t delay = 0; - unsigned int pmask; - - now = psched_get_time(); - - pmask = q->pmask; - q->pmask = 0; - - while (pmask) { - int prio = ffz(~pmask); - psched_tdiff_t tmp; - - pmask &= ~(1<<prio); - - tmp = cbq_undelay_prio(q, prio, now); - if (tmp > 0) { - q->pmask |= 1<<prio; - if (tmp < delay || delay == 0) - delay = tmp; - } - } - - if (delay) { - ktime_t time; - - time = 0; - time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay)); - hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED); - } - - __netif_schedule(qdisc_root(sch)); - return HRTIMER_NORESTART; -} - -/* - * It is mission critical procedure. - * - * We "regenerate" toplevel cutoff, if transmitting class - * has backlog and it is not regulated. It is not part of - * original CBQ description, but looks more reasonable. - * Probably, it is wrong. This question needs further investigation. - */ - -static inline void -cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, - struct cbq_class *borrowed) -{ - if (cl && q->toplevel >= borrowed->level) { - if (cl->q->q.qlen > 1) { - do { - if (borrowed->undertime == PSCHED_PASTPERFECT) { - q->toplevel = borrowed->level; - return; - } - } while ((borrowed = borrowed->borrow) != NULL); - } -#if 0 - /* It is not necessary now. Uncommenting it - will save CPU cycles, but decrease fairness. - */ - q->toplevel = TC_CBQ_MAXLEVEL; -#endif - } -} - -static void -cbq_update(struct cbq_sched_data *q) -{ - struct cbq_class *this = q->tx_class; - struct cbq_class *cl = this; - int len = q->tx_len; - psched_time_t now; - - q->tx_class = NULL; - /* Time integrator. We calculate EOS time - * by adding expected packet transmission time. - */ - now = q->now + L2T(&q->link, len); - - for ( ; cl; cl = cl->share) { - long avgidle = cl->avgidle; - long idle; - - cl->bstats.packets++; - cl->bstats.bytes += len; - - /* - * (now - last) is total time between packet right edges. - * (last_pktlen/rate) is "virtual" busy time, so that - * - * idle = (now - last) - last_pktlen/rate - */ - - idle = now - cl->last; - if ((unsigned long)idle > 128*1024*1024) { - avgidle = cl->maxidle; - } else { - idle -= L2T(cl, len); - - /* true_avgidle := (1-W)*true_avgidle + W*idle, - * where W=2^{-ewma_log}. But cl->avgidle is scaled: - * cl->avgidle == true_avgidle/W, - * hence: - */ - avgidle += idle - (avgidle>>cl->ewma_log); - } - - if (avgidle <= 0) { - /* Overlimit or at-limit */ - - if (avgidle < cl->minidle) - avgidle = cl->minidle; - - cl->avgidle = avgidle; - - /* Calculate expected time, when this class - * will be allowed to send. - * It will occur, when: - * (1-W)*true_avgidle + W*delay = 0, i.e. - * idle = (1/W - 1)*(-true_avgidle) - * or - * idle = (1 - W)*(-cl->avgidle); - */ - idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); - - /* - * That is not all. - * To maintain the rate allocated to the class, - * we add to undertime virtual clock, - * necessary to complete transmitted packet. - * (len/phys_bandwidth has been already passed - * to the moment of cbq_update) - */ - - idle -= L2T(&q->link, len); - idle += L2T(cl, len); - - cl->undertime = now + idle; - } else { - /* Underlimit */ - - cl->undertime = PSCHED_PASTPERFECT; - if (avgidle > cl->maxidle) - cl->avgidle = cl->maxidle; - else - cl->avgidle = avgidle; - } - if ((s64)(now - cl->last) > 0) - cl->last = now; - } - - cbq_update_toplevel(q, this, q->tx_borrowed); -} - -static inline struct cbq_class * -cbq_under_limit(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - struct cbq_class *this_cl = cl; - - if (cl->tparent == NULL) - return cl; - - if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { - cl->delayed = 0; - return cl; - } - - do { - /* It is very suspicious place. Now overlimit - * action is generated for not bounded classes - * only if link is completely congested. - * Though it is in agree with ancestor-only paradigm, - * it looks very stupid. Particularly, - * it means that this chunk of code will either - * never be called or result in strong amplification - * of burstiness. Dangerous, silly, and, however, - * no another solution exists. - */ - cl = cl->borrow; - if (!cl) { - this_cl->qstats.overlimits++; - cbq_overlimit(this_cl); - return NULL; - } - if (cl->level > q->toplevel) - return NULL; - } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); - - cl->delayed = 0; - return cl; -} - -static inline struct sk_buff * -cbq_dequeue_prio(struct Qdisc *sch, int prio) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl_tail, *cl_prev, *cl; - struct sk_buff *skb; - int deficit; - - cl_tail = cl_prev = q->active[prio]; - cl = cl_prev->next_alive; - - do { - deficit = 0; - - /* Start round */ - do { - struct cbq_class *borrow = cl; - - if (cl->q->q.qlen && - (borrow = cbq_under_limit(cl)) == NULL) - goto skip_class; - - if (cl->deficit <= 0) { - /* Class exhausted its allotment per - * this round. Switch to the next one. - */ - deficit = 1; - cl->deficit += cl->quantum; - goto next_class; - } - - skb = cl->q->dequeue(cl->q); - - /* Class did not give us any skb :-( - * It could occur even if cl->q->q.qlen != 0 - * f.e. if cl->q == "tbf" - */ - if (skb == NULL) - goto skip_class; - - cl->deficit -= qdisc_pkt_len(skb); - q->tx_class = cl; - q->tx_borrowed = borrow; - if (borrow != cl) { -#ifndef CBQ_XSTATS_BORROWS_BYTES - borrow->xstats.borrows++; - cl->xstats.borrows++; -#else - borrow->xstats.borrows += qdisc_pkt_len(skb); - cl->xstats.borrows += qdisc_pkt_len(skb); -#endif - } - q->tx_len = qdisc_pkt_len(skb); - - if (cl->deficit <= 0) { - q->active[prio] = cl; - cl = cl->next_alive; - cl->deficit += cl->quantum; - } - return skb; - -skip_class: - if (cl->q->q.qlen == 0 || prio != cl->cpriority) { - /* Class is empty or penalized. - * Unlink it from active chain. - */ - cl_prev->next_alive = cl->next_alive; - cl->next_alive = NULL; - - /* Did cl_tail point to it? */ - if (cl == cl_tail) { - /* Repair it! */ - cl_tail = cl_prev; - - /* Was it the last class in this band? */ - if (cl == cl_tail) { - /* Kill the band! */ - q->active[prio] = NULL; - q->activemask &= ~(1<<prio); - if (cl->q->q.qlen) - cbq_activate_class(cl); - return NULL; - } - - q->active[prio] = cl_tail; - } - if (cl->q->q.qlen) - cbq_activate_class(cl); - - cl = cl_prev; - } - -next_class: - cl_prev = cl; - cl = cl->next_alive; - } while (cl_prev != cl_tail); - } while (deficit); - - q->active[prio] = cl_prev; - - return NULL; -} - -static inline struct sk_buff * -cbq_dequeue_1(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct sk_buff *skb; - unsigned int activemask; - - activemask = q->activemask & 0xFF; - while (activemask) { - int prio = ffz(~activemask); - activemask &= ~(1<<prio); - skb = cbq_dequeue_prio(sch, prio); - if (skb) - return skb; - } - return NULL; -} - -static struct sk_buff * -cbq_dequeue(struct Qdisc *sch) -{ - struct sk_buff *skb; - struct cbq_sched_data *q = qdisc_priv(sch); - psched_time_t now; - - now = psched_get_time(); - - if (q->tx_class) - cbq_update(q); - - q->now = now; - - for (;;) { - q->wd_expires = 0; - - skb = cbq_dequeue_1(sch); - if (skb) { - qdisc_bstats_update(sch, skb); - sch->q.qlen--; - return skb; - } - - /* All the classes are overlimit. - * - * It is possible, if: - * - * 1. Scheduler is empty. - * 2. Toplevel cutoff inhibited borrowing. - * 3. Root class is overlimit. - * - * Reset 2d and 3d conditions and retry. - * - * Note, that NS and cbq-2.0 are buggy, peeking - * an arbitrary class is appropriate for ancestor-only - * sharing, but not for toplevel algorithm. - * - * Our version is better, but slower, because it requires - * two passes, but it is unavoidable with top-level sharing. - */ - - if (q->toplevel == TC_CBQ_MAXLEVEL && - q->link.undertime == PSCHED_PASTPERFECT) - break; - - q->toplevel = TC_CBQ_MAXLEVEL; - q->link.undertime = PSCHED_PASTPERFECT; - } - - /* No packets in scheduler or nobody wants to give them to us :-( - * Sigh... start watchdog timer in the last case. - */ - - if (sch->q.qlen) { - qdisc_qstats_overlimit(sch); - if (q->wd_expires) - qdisc_watchdog_schedule(&q->watchdog, - now + q->wd_expires); - } - return NULL; -} - -/* CBQ class maintanance routines */ - -static void cbq_adjust_levels(struct cbq_class *this) -{ - if (this == NULL) - return; - - do { - int level = 0; - struct cbq_class *cl; - - cl = this->children; - if (cl) { - do { - if (cl->level > level) - level = cl->level; - } while ((cl = cl->sibling) != this->children); - } - this->level = level + 1; - } while ((this = this->tparent) != NULL); -} - -static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) -{ - struct cbq_class *cl; - unsigned int h; - - if (q->quanta[prio] == 0) - return; - - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - /* BUGGGG... Beware! This expression suffer of - * arithmetic overflows! - */ - if (cl->priority == prio) { - cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ - q->quanta[prio]; - } - if (cl->quantum <= 0 || - cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) { - pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n", - cl->common.classid, cl->quantum); - cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; - } - } - } -} - -static void cbq_sync_defmap(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - struct cbq_class *split = cl->split; - unsigned int h; - int i; - - if (split == NULL) - return; - - for (i = 0; i <= TC_PRIO_MAX; i++) { - if (split->defaults[i] == cl && !(cl->defmap & (1<<i))) - split->defaults[i] = NULL; - } - - for (i = 0; i <= TC_PRIO_MAX; i++) { - int level = split->level; - - if (split->defaults[i]) - continue; - - for (h = 0; h < q->clhash.hashsize; h++) { - struct cbq_class *c; - - hlist_for_each_entry(c, &q->clhash.hash[h], - common.hnode) { - if (c->split == split && c->level < level && - c->defmap & (1<<i)) { - split->defaults[i] = c; - level = c->level; - } - } - } - } -} - -static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) -{ - struct cbq_class *split = NULL; - - if (splitid == 0) { - split = cl->split; - if (!split) - return; - splitid = split->common.classid; - } - - if (split == NULL || split->common.classid != splitid) { - for (split = cl->tparent; split; split = split->tparent) - if (split->common.classid == splitid) - break; - } - - if (split == NULL) - return; - - if (cl->split != split) { - cl->defmap = 0; - cbq_sync_defmap(cl); - cl->split = split; - cl->defmap = def & mask; - } else - cl->defmap = (cl->defmap & ~mask) | (def & mask); - - cbq_sync_defmap(cl); -} - -static void cbq_unlink_class(struct cbq_class *this) -{ - struct cbq_class *cl, **clp; - struct cbq_sched_data *q = qdisc_priv(this->qdisc); - - qdisc_class_hash_remove(&q->clhash, &this->common); - - if (this->tparent) { - clp = &this->sibling; - cl = *clp; - do { - if (cl == this) { - *clp = cl->sibling; - break; - } - clp = &cl->sibling; - } while ((cl = *clp) != this->sibling); - - if (this->tparent->children == this) { - this->tparent->children = this->sibling; - if (this->sibling == this) - this->tparent->children = NULL; - } - } else { - WARN_ON(this->sibling != this); - } -} - -static void cbq_link_class(struct cbq_class *this) -{ - struct cbq_sched_data *q = qdisc_priv(this->qdisc); - struct cbq_class *parent = this->tparent; - - this->sibling = this; - qdisc_class_hash_insert(&q->clhash, &this->common); - - if (parent == NULL) - return; - - if (parent->children == NULL) { - parent->children = this; - } else { - this->sibling = parent->children->sibling; - parent->children->sibling = this; - } -} - -static void -cbq_reset(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl; - int prio; - unsigned int h; - - q->activemask = 0; - q->pmask = 0; - q->tx_class = NULL; - q->tx_borrowed = NULL; - qdisc_watchdog_cancel(&q->watchdog); - hrtimer_cancel(&q->delay_timer); - q->toplevel = TC_CBQ_MAXLEVEL; - q->now = psched_get_time(); - - for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) - q->active[prio] = NULL; - - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - qdisc_reset(cl->q); - - cl->next_alive = NULL; - cl->undertime = PSCHED_PASTPERFECT; - cl->avgidle = cl->maxidle; - cl->deficit = cl->quantum; - cl->cpriority = cl->priority; - } - } - sch->q.qlen = 0; -} - - -static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) -{ - if (lss->change & TCF_CBQ_LSS_FLAGS) { - cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; - cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; - } - if (lss->change & TCF_CBQ_LSS_EWMA) - cl->ewma_log = lss->ewma_log; - if (lss->change & TCF_CBQ_LSS_AVPKT) - cl->avpkt = lss->avpkt; - if (lss->change & TCF_CBQ_LSS_MINIDLE) - cl->minidle = -(long)lss->minidle; - if (lss->change & TCF_CBQ_LSS_MAXIDLE) { - cl->maxidle = lss->maxidle; - cl->avgidle = lss->maxidle; - } - if (lss->change & TCF_CBQ_LSS_OFFTIME) - cl->offtime = lss->offtime; - return 0; -} - -static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) -{ - q->nclasses[cl->priority]--; - q->quanta[cl->priority] -= cl->weight; - cbq_normalize_quanta(q, cl->priority); -} - -static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) -{ - q->nclasses[cl->priority]++; - q->quanta[cl->priority] += cl->weight; - cbq_normalize_quanta(q, cl->priority); -} - -static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - - if (wrr->allot) - cl->allot = wrr->allot; - if (wrr->weight) - cl->weight = wrr->weight; - if (wrr->priority) { - cl->priority = wrr->priority - 1; - cl->cpriority = cl->priority; - if (cl->priority >= cl->priority2) - cl->priority2 = TC_CBQ_MAXPRIO - 1; - } - - cbq_addprio(q, cl); - return 0; -} - -static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) -{ - cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); - return 0; -} - -static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { - [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) }, - [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) }, - [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) }, - [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) }, - [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) }, - [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, - [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, -}; - -static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], - struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - int err; - - if (!opt) { - NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, - cbq_policy, extack); - if (err < 0) - return err; - - if (tb[TCA_CBQ_WRROPT]) { - const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]); - - if (wrr->priority > TC_CBQ_MAXPRIO) { - NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO"); - err = -EINVAL; - } - } - return err; -} - -static int cbq_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct nlattr *tb[TCA_CBQ_MAX + 1]; - struct tc_ratespec *r; - int err; - - qdisc_watchdog_init(&q->watchdog, sch); - hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - q->delay_timer.function = cbq_undelay; - - err = cbq_opt_parse(tb, opt, extack); - if (err < 0) - return err; - - if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) { - NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete"); - return -EINVAL; - } - - r = nla_data(tb[TCA_CBQ_RATE]); - - q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack); - if (!q->link.R_tab) - return -EINVAL; - - err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack); - if (err) - goto put_rtab; - - err = qdisc_class_hash_init(&q->clhash); - if (err < 0) - goto put_block; - - q->link.sibling = &q->link; - q->link.common.classid = sch->handle; - q->link.qdisc = sch; - q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - sch->handle, NULL); - if (!q->link.q) - q->link.q = &noop_qdisc; - else - qdisc_hash_add(q->link.q, true); - - q->link.priority = TC_CBQ_MAXPRIO - 1; - q->link.priority2 = TC_CBQ_MAXPRIO - 1; - q->link.cpriority = TC_CBQ_MAXPRIO - 1; - q->link.allot = psched_mtu(qdisc_dev(sch)); - q->link.quantum = q->link.allot; - q->link.weight = q->link.R_tab->rate.rate; - - q->link.ewma_log = TC_CBQ_DEF_EWMA; - q->link.avpkt = q->link.allot/2; - q->link.minidle = -0x7FFFFFFF; - - q->toplevel = TC_CBQ_MAXLEVEL; - q->now = psched_get_time(); - - cbq_link_class(&q->link); - - if (tb[TCA_CBQ_LSSOPT]) - cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT])); - - cbq_addprio(q, &q->link); - return 0; - -put_block: - tcf_block_put(q->link.block); - -put_rtab: - qdisc_put_rtab(q->link.R_tab); - return err; -} - -static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - - if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate)) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_lssopt opt; - - opt.flags = 0; - if (cl->borrow == NULL) - opt.flags |= TCF_CBQ_LSS_BOUNDED; - if (cl->share == NULL) - opt.flags |= TCF_CBQ_LSS_ISOLATED; - opt.ewma_log = cl->ewma_log; - opt.level = cl->level; - opt.avpkt = cl->avpkt; - opt.maxidle = cl->maxidle; - opt.minidle = (u32)(-cl->minidle); - opt.offtime = cl->offtime; - opt.change = ~0; - if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt)) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_wrropt opt; - - memset(&opt, 0, sizeof(opt)); - opt.flags = 0; - opt.allot = cl->allot; - opt.priority = cl->priority + 1; - opt.cpriority = cl->cpriority + 1; - opt.weight = cl->weight; - if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt)) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_fopt opt; - - if (cl->split || cl->defmap) { - opt.split = cl->split ? cl->split->common.classid : 0; - opt.defmap = cl->defmap; - opt.defchange = ~0; - if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt)) - goto nla_put_failure; - } - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) -{ - if (cbq_dump_lss(skb, cl) < 0 || - cbq_dump_rate(skb, cl) < 0 || - cbq_dump_wrr(skb, cl) < 0 || - cbq_dump_fopt(skb, cl) < 0) - return -1; - return 0; -} - -static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct nlattr *nest; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - if (cbq_dump_attr(skb, &q->link) < 0) - goto nla_put_failure; - return nla_nest_end(skb, nest); - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static int -cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - - q->link.xstats.avgidle = q->link.avgidle; - return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); -} - -static int -cbq_dump_class(struct Qdisc *sch, unsigned long arg, - struct sk_buff *skb, struct tcmsg *tcm) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - struct nlattr *nest; - - if (cl->tparent) - tcm->tcm_parent = cl->tparent->common.classid; - else - tcm->tcm_parent = TC_H_ROOT; - tcm->tcm_handle = cl->common.classid; - tcm->tcm_info = cl->q->handle; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - if (cbq_dump_attr(skb, cl) < 0) - goto nla_put_failure; - return nla_nest_end(skb, nest); - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static int -cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)arg; - __u32 qlen; - - cl->xstats.avgidle = cl->avgidle; - cl->xstats.undertime = 0; - qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog); - - if (cl->undertime != PSCHED_PASTPERFECT) - cl->xstats.undertime = cl->undertime - q->now; - - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) - return -1; - - return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); -} - -static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old, struct netlink_ext_ack *extack) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - if (new == NULL) { - new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - cl->common.classid, extack); - if (new == NULL) - return -ENOBUFS; - } - - *old = qdisc_replace(sch, new, &cl->q); - return 0; -} - -static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - return cl->q; -} - -static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - cbq_deactivate_class(cl); -} - -static unsigned long cbq_find(struct Qdisc *sch, u32 classid) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - - return (unsigned long)cbq_class_lookup(q, classid); -} - -static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - - WARN_ON(cl->filters); - - tcf_block_put(cl->block); - qdisc_put(cl->q); - qdisc_put_rtab(cl->R_tab); - gen_kill_estimator(&cl->rate_est); - if (cl != &q->link) - kfree(cl); -} - -static void cbq_destroy(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct hlist_node *next; - struct cbq_class *cl; - unsigned int h; - -#ifdef CONFIG_NET_CLS_ACT - q->rx_class = NULL; -#endif - /* - * Filters must be destroyed first because we don't destroy the - * classes from root to leafs which means that filters can still - * be bound to classes which have been destroyed already. --TGR '04 - */ - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - tcf_block_put(cl->block); - cl->block = NULL; - } - } - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], - common.hnode) - cbq_destroy_class(sch, cl); - } - qdisc_class_hash_destroy(&q->clhash); -} - -static int -cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, - unsigned long *arg, struct netlink_ext_ack *extack) -{ - int err; - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)*arg; - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_CBQ_MAX + 1]; - struct cbq_class *parent; - struct qdisc_rate_table *rtab = NULL; - - err = cbq_opt_parse(tb, opt, extack); - if (err < 0) - return err; - - if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) { - NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params"); - return -EOPNOTSUPP; - } - - if (cl) { - /* Check parent */ - if (parentid) { - if (cl->tparent && - cl->tparent->common.classid != parentid) { - NL_SET_ERR_MSG(extack, "Invalid parent id"); - return -EINVAL; - } - if (!cl->tparent && parentid != TC_H_ROOT) { - NL_SET_ERR_MSG(extack, "Parent must be root"); - return -EINVAL; - } - } - - if (tb[TCA_CBQ_RATE]) { - rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), - tb[TCA_CBQ_RTAB], extack); - if (rtab == NULL) - return -EINVAL; - } - - if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, NULL, - &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); - qdisc_put_rtab(rtab); - return err; - } - } - - /* Change class parameters */ - sch_tree_lock(sch); - - if (cl->next_alive != NULL) - cbq_deactivate_class(cl); - - if (rtab) { - qdisc_put_rtab(cl->R_tab); - cl->R_tab = rtab; - } - - if (tb[TCA_CBQ_LSSOPT]) - cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); - - if (tb[TCA_CBQ_WRROPT]) { - cbq_rmprio(q, cl); - cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); - } - - if (tb[TCA_CBQ_FOPT]) - cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); - - if (cl->q->q.qlen) - cbq_activate_class(cl); - - sch_tree_unlock(sch); - - return 0; - } - - if (parentid == TC_H_ROOT) - return -EINVAL; - - if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) { - NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing"); - return -EINVAL; - } - - rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB], - extack); - if (rtab == NULL) - return -EINVAL; - - if (classid) { - err = -EINVAL; - if (TC_H_MAJ(classid ^ sch->handle) || - cbq_class_lookup(q, classid)) { - NL_SET_ERR_MSG(extack, "Specified class not found"); - goto failure; - } - } else { - int i; - classid = TC_H_MAKE(sch->handle, 0x8000); - - for (i = 0; i < 0x8000; i++) { - if (++q->hgenerator >= 0x8000) - q->hgenerator = 1; - if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) - break; - } - err = -ENOSR; - if (i >= 0x8000) { - NL_SET_ERR_MSG(extack, "Unable to generate classid"); - goto failure; - } - classid = classid|q->hgenerator; - } - - parent = &q->link; - if (parentid) { - parent = cbq_class_lookup(q, parentid); - err = -EINVAL; - if (!parent) { - NL_SET_ERR_MSG(extack, "Failed to find parentid"); - goto failure; - } - } - - err = -ENOBUFS; - cl = kzalloc(sizeof(*cl), GFP_KERNEL); - if (cl == NULL) - goto failure; - - err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); - if (err) { - kfree(cl); - goto failure; - } - - if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); - tcf_block_put(cl->block); - kfree(cl); - goto failure; - } - } - - cl->R_tab = rtab; - rtab = NULL; - cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, - NULL); - if (!cl->q) - cl->q = &noop_qdisc; - else - qdisc_hash_add(cl->q, true); - - cl->common.classid = classid; - cl->tparent = parent; - cl->qdisc = sch; - cl->allot = parent->allot; - cl->quantum = cl->allot; - cl->weight = cl->R_tab->rate.rate; - - sch_tree_lock(sch); - cbq_link_class(cl); - cl->borrow = cl->tparent; - if (cl->tparent != &q->link) - cl->share = cl->tparent; - cbq_adjust_levels(parent); - cl->minidle = -0x7FFFFFFF; - cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); - cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); - if (cl->ewma_log == 0) - cl->ewma_log = q->link.ewma_log; - if (cl->maxidle == 0) - cl->maxidle = q->link.maxidle; - if (cl->avpkt == 0) - cl->avpkt = q->link.avpkt; - if (tb[TCA_CBQ_FOPT]) - cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); - sch_tree_unlock(sch); - - qdisc_class_hash_grow(sch, &q->clhash); - - *arg = (unsigned long)cl; - return 0; - -failure: - qdisc_put_rtab(rtab); - return err; -} - -static int cbq_delete(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)arg; - - if (cl->filters || cl->children || cl == &q->link) - return -EBUSY; - - sch_tree_lock(sch); - - qdisc_purge_queue(cl->q); - - if (cl->next_alive) - cbq_deactivate_class(cl); - - if (q->tx_borrowed == cl) - q->tx_borrowed = q->tx_class; - if (q->tx_class == cl) { - q->tx_class = NULL; - q->tx_borrowed = NULL; - } -#ifdef CONFIG_NET_CLS_ACT - if (q->rx_class == cl) - q->rx_class = NULL; -#endif - - cbq_unlink_class(cl); - cbq_adjust_levels(cl->tparent); - cl->defmap = 0; - cbq_sync_defmap(cl); - - cbq_rmprio(q, cl); - sch_tree_unlock(sch); - - cbq_destroy_class(sch, cl); - return 0; -} - -static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg, - struct netlink_ext_ack *extack) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)arg; - - if (cl == NULL) - cl = &q->link; - - return cl->block; -} - -static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, - u32 classid) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *p = (struct cbq_class *)parent; - struct cbq_class *cl = cbq_class_lookup(q, classid); - - if (cl) { - if (p && p->level <= cl->level) - return 0; - cl->filters++; - return (unsigned long)cl; - } - return 0; -} - -static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - cl->filters--; -} - -static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl; - unsigned int h; - - if (arg->stop) - return; - - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, (unsigned long)cl, arg) < 0) { - arg->stop = 1; - return; - } - arg->count++; - } - } -} - -static const struct Qdisc_class_ops cbq_class_ops = { - .graft = cbq_graft, - .leaf = cbq_leaf, - .qlen_notify = cbq_qlen_notify, - .find = cbq_find, - .change = cbq_change_class, - .delete = cbq_delete, - .walk = cbq_walk, - .tcf_block = cbq_tcf_block, - .bind_tcf = cbq_bind_filter, - .unbind_tcf = cbq_unbind_filter, - .dump = cbq_dump_class, - .dump_stats = cbq_dump_class_stats, -}; - -static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { - .next = NULL, - .cl_ops = &cbq_class_ops, - .id = "cbq", - .priv_size = sizeof(struct cbq_sched_data), - .enqueue = cbq_enqueue, - .dequeue = cbq_dequeue, - .peek = qdisc_peek_dequeued, - .init = cbq_init, - .reset = cbq_reset, - .destroy = cbq_destroy, - .change = NULL, - .dump = cbq_dump, - .dump_stats = cbq_dump_stats, - .owner = THIS_MODULE, -}; - -static int __init cbq_module_init(void) -{ - return register_qdisc(&cbq_qdisc_ops); -} -static void __exit cbq_module_exit(void) -{ - unregister_qdisc(&cbq_qdisc_ops); -} -module_init(cbq_module_init) -module_exit(cbq_module_exit) -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c deleted file mode 100644 index 76ed1a05ded2..000000000000 --- a/net/sched/sch_dsmark.c +++ /dev/null @@ -1,523 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* net/sched/sch_dsmark.c - Differentiated Services field marker */ - -/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ - - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/bitops.h> -#include <net/pkt_sched.h> -#include <net/pkt_cls.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <asm/byteorder.h> - -/* - * classid class marking - * ------- ----- ------- - * n/a 0 n/a - * x:0 1 use entry [0] - * ... ... ... - * x:y y>0 y+1 use entry [y] - * ... ... ... - * x:indices-1 indices use entry [indices-1] - * ... ... ... - * x:y y+1 use entry [y & (indices-1)] - * ... ... ... - * 0xffff 0x10000 use entry [indices-1] - */ - - -#define NO_DEFAULT_INDEX (1 << 16) - -struct mask_value { - u8 mask; - u8 value; -}; - -struct dsmark_qdisc_data { - struct Qdisc *q; - struct tcf_proto __rcu *filter_list; - struct tcf_block *block; - struct mask_value *mv; - u16 indices; - u8 set_tc_index; - u32 default_index; /* index range is 0...0xffff */ -#define DSMARK_EMBEDDED_SZ 16 - struct mask_value embedded[DSMARK_EMBEDDED_SZ]; -}; - -static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) -{ - return index <= p->indices && index > 0; -} - -/* ------------------------- Class/flow operations ------------------------- */ - -static int dsmark_graft(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n", - __func__, sch, p, new, old); - - if (new == NULL) { - new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - sch->handle, NULL); - if (new == NULL) - new = &noop_qdisc; - } - - *old = qdisc_replace(sch, new, &p->q); - return 0; -} - -static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - return p->q; -} - -static unsigned long dsmark_find(struct Qdisc *sch, u32 classid) -{ - return TC_H_MIN(classid) + 1; -} - -static unsigned long dsmark_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) -{ - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", - __func__, sch, qdisc_priv(sch), classid); - - return dsmark_find(sch, classid); -} - -static void dsmark_unbind_filter(struct Qdisc *sch, unsigned long cl) -{ -} - -static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = { - [TCA_DSMARK_INDICES] = { .type = NLA_U16 }, - [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 }, - [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG }, - [TCA_DSMARK_MASK] = { .type = NLA_U8 }, - [TCA_DSMARK_VALUE] = { .type = NLA_U8 }, -}; - -static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, - struct nlattr **tca, unsigned long *arg, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_DSMARK_MAX + 1]; - int err = -EINVAL; - - pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", - __func__, sch, p, classid, parent, *arg); - - if (!dsmark_valid_index(p, *arg)) { - err = -ENOENT; - goto errout; - } - - if (!opt) - goto errout; - - err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt, - dsmark_policy, NULL); - if (err < 0) - goto errout; - - if (tb[TCA_DSMARK_VALUE]) - p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]); - - if (tb[TCA_DSMARK_MASK]) - p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]); - - err = 0; - -errout: - return err; -} - -static int dsmark_delete(struct Qdisc *sch, unsigned long arg) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - if (!dsmark_valid_index(p, arg)) - return -EINVAL; - - p->mv[arg - 1].mask = 0xff; - p->mv[arg - 1].value = 0; - - return 0; -} - -static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - int i; - - pr_debug("%s(sch %p,[qdisc %p],walker %p)\n", - __func__, sch, p, walker); - - if (walker->stop) - return; - - for (i = 0; i < p->indices; i++) { - if (p->mv[i].mask == 0xff && !p->mv[i].value) - goto ignore; - if (walker->count >= walker->skip) { - if (walker->fn(sch, i + 1, walker) < 0) { - walker->stop = 1; - break; - } - } -ignore: - walker->count++; - } -} - -static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - return p->block; -} - -/* --------------------------- Qdisc operations ---------------------------- */ - -static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) -{ - unsigned int len = qdisc_pkt_len(skb); - struct dsmark_qdisc_data *p = qdisc_priv(sch); - int err; - - pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); - - if (p->set_tc_index) { - int wlen = skb_network_offset(skb); - - switch (skb_protocol(skb, true)) { - case htons(ETH_P_IP): - wlen += sizeof(struct iphdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) - goto drop; - - skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) - & ~INET_ECN_MASK; - break; - - case htons(ETH_P_IPV6): - wlen += sizeof(struct ipv6hdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) - goto drop; - - skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) - & ~INET_ECN_MASK; - break; - default: - skb->tc_index = 0; - break; - } - } - - if (TC_H_MAJ(skb->priority) == sch->handle) - skb->tc_index = TC_H_MIN(skb->priority); - else { - struct tcf_result res; - struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tcf_classify(skb, fl, &res, false); - - pr_debug("result %d class 0x%04x\n", result, res.classid); - - switch (result) { -#ifdef CONFIG_NET_CLS_ACT - case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - case TC_ACT_TRAP: - __qdisc_drop(skb, to_free); - return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - - case TC_ACT_SHOT: - goto drop; -#endif - case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - break; - - default: - if (p->default_index != NO_DEFAULT_INDEX) - skb->tc_index = p->default_index; - break; - } - } - - err = qdisc_enqueue(skb, p->q, to_free); - if (err != NET_XMIT_SUCCESS) { - if (net_xmit_drop_count(err)) - qdisc_qstats_drop(sch); - return err; - } - - sch->qstats.backlog += len; - sch->q.qlen++; - - return NET_XMIT_SUCCESS; - -drop: - qdisc_drop(skb, sch, to_free); - return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; -} - -static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct sk_buff *skb; - u32 index; - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - skb = qdisc_dequeue_peeked(p->q); - if (skb == NULL) - return NULL; - - qdisc_bstats_update(sch, skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - - index = skb->tc_index & (p->indices - 1); - pr_debug("index %d->%d\n", skb->tc_index, index); - - switch (skb_protocol(skb, true)) { - case htons(ETH_P_IP): - ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, - p->mv[index].value); - break; - case htons(ETH_P_IPV6): - ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask, - p->mv[index].value); - break; - default: - /* - * Only complain if a change was actually attempted. - * This way, we can send non-IP traffic through dsmark - * and don't need yet another qdisc as a bypass. - */ - if (p->mv[index].mask != 0xff || p->mv[index].value) - pr_warn("%s: unsupported protocol %d\n", - __func__, ntohs(skb_protocol(skb, true))); - break; - } - - return skb; -} - -static struct sk_buff *dsmark_peek(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - return p->q->ops->peek(p->q); -} - -static int dsmark_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *tb[TCA_DSMARK_MAX + 1]; - int err = -EINVAL; - u32 default_index = NO_DEFAULT_INDEX; - u16 indices; - int i; - - pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); - - if (!opt) - goto errout; - - err = tcf_block_get(&p->block, &p->filter_list, sch, extack); - if (err) - return err; - - err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt, - dsmark_policy, NULL); - if (err < 0) - goto errout; - - err = -EINVAL; - if (!tb[TCA_DSMARK_INDICES]) - goto errout; - indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); - - if (hweight32(indices) != 1) - goto errout; - - if (tb[TCA_DSMARK_DEFAULT_INDEX]) - default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); - - if (indices <= DSMARK_EMBEDDED_SZ) - p->mv = p->embedded; - else - p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL); - if (!p->mv) { - err = -ENOMEM; - goto errout; - } - for (i = 0; i < indices; i++) { - p->mv[i].mask = 0xff; - p->mv[i].value = 0; - } - p->indices = indices; - p->default_index = default_index; - p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); - - p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, - NULL); - if (p->q == NULL) - p->q = &noop_qdisc; - else - qdisc_hash_add(p->q, true); - - pr_debug("%s: qdisc %p\n", __func__, p->q); - - err = 0; -errout: - return err; -} - -static void dsmark_reset(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - if (p->q) - qdisc_reset(p->q); - sch->qstats.backlog = 0; - sch->q.qlen = 0; -} - -static void dsmark_destroy(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - tcf_block_put(p->block); - qdisc_put(p->q); - if (p->mv != p->embedded) - kfree(p->mv); -} - -static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *opts = NULL; - - pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl); - - if (!dsmark_valid_index(p, cl)) - return -EINVAL; - - tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1); - tcm->tcm_info = p->q->handle; - - opts = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (opts == NULL) - goto nla_put_failure; - if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) || - nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value)) - goto nla_put_failure; - - return nla_nest_end(skb, opts); - -nla_put_failure: - nla_nest_cancel(skb, opts); - return -EMSGSIZE; -} - -static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *opts = NULL; - - opts = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (opts == NULL) - goto nla_put_failure; - if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices)) - goto nla_put_failure; - - if (p->default_index != NO_DEFAULT_INDEX && - nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index)) - goto nla_put_failure; - - if (p->set_tc_index && - nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX)) - goto nla_put_failure; - - return nla_nest_end(skb, opts); - -nla_put_failure: - nla_nest_cancel(skb, opts); - return -EMSGSIZE; -} - -static const struct Qdisc_class_ops dsmark_class_ops = { - .graft = dsmark_graft, - .leaf = dsmark_leaf, - .find = dsmark_find, - .change = dsmark_change, - .delete = dsmark_delete, - .walk = dsmark_walk, - .tcf_block = dsmark_tcf_block, - .bind_tcf = dsmark_bind_filter, - .unbind_tcf = dsmark_unbind_filter, - .dump = dsmark_dump_class, -}; - -static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { - .next = NULL, - .cl_ops = &dsmark_class_ops, - .id = "dsmark", - .priv_size = sizeof(struct dsmark_qdisc_data), - .enqueue = dsmark_enqueue, - .dequeue = dsmark_dequeue, - .peek = dsmark_peek, - .init = dsmark_init, - .reset = dsmark_reset, - .destroy = dsmark_destroy, - .change = NULL, - .dump = dsmark_dump, - .owner = THIS_MODULE, -}; - -static int __init dsmark_module_init(void) -{ - return register_qdisc(&dsmark_qdisc_ops); -} - -static void __exit dsmark_module_exit(void) -{ - unregister_qdisc(&dsmark_qdisc_ops); -} - -module_init(dsmark_module_init) -module_exit(dsmark_module_exit) - -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 86fb2f953bd5..30796bb54972 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -188,7 +188,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct fq_codel_sched_data *q = qdisc_priv(sch); unsigned int idx, prev_backlog, prev_qlen; struct fq_codel_flow *flow; - int uninitialized_var(ret); + int ret; unsigned int pkt_len; bool memory_limited; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ae5847de94c8..4250f3cf30e7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -403,7 +403,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) void __qdisc_run(struct Qdisc *q) { - int quota = dev_tx_weight; + int quota = READ_ONCE(dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { @@ -1106,18 +1106,21 @@ static void attach_default_qdiscs(struct net_device *dev) if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - dev->qdisc = txq->qdisc_sleeping; - qdisc_refcount_inc(dev->qdisc); + qdisc = txq->qdisc_sleeping; + rcu_assign_pointer(dev->qdisc, qdisc); + qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { - dev->qdisc = qdisc; + rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } + qdisc = rtnl_dereference(dev->qdisc); + #ifdef CONFIG_NET_SCHED - if (dev->qdisc != &noop_qdisc) - qdisc_hash_add(dev->qdisc, false); + if (qdisc != &noop_qdisc) + qdisc_hash_add(qdisc, false); #endif } @@ -1147,7 +1150,7 @@ void dev_activate(struct net_device *dev) * and noqueue_qdisc for virtual interfaces */ - if (dev->qdisc == &noop_qdisc) + if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) @@ -1316,7 +1319,7 @@ static int qdisc_change_tx_queue_len(struct net_device *dev, void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { - struct Qdisc *qdisc = dev->qdisc; + struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); @@ -1356,7 +1359,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, void dev_init_scheduler(struct net_device *dev) { - dev->qdisc = &noop_qdisc; + rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); @@ -1384,8 +1387,8 @@ void dev_shutdown(struct net_device *dev) netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - qdisc_put(dev->qdisc); - dev->qdisc = &noop_qdisc; + qdisc_put(rtnl_dereference(dev->qdisc)); + rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 433f2190960f..9ebae0d07a9c 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -903,6 +903,14 @@ hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, cl->cl_flags |= HFSC_USC; } +static void +hfsc_upgrade_rt(struct hfsc_class *cl) +{ + cl->cl_fsc = cl->cl_rsc; + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); + cl->cl_flags |= HFSC_FSC; +} + static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = { [TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) }, [TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) }, @@ -1064,6 +1072,12 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->cf_tree = RB_ROOT; sch_tree_lock(sch); + /* Check if the inner class is a misconfigured 'rt' */ + if (!(parent->cl_flags & HFSC_FSC) && parent != &q->root) { + NL_SET_ERR_MSG(extack, + "Forced curve change on parent 'rt' to 'sc'"); + hfsc_upgrade_rt(parent); + } qdisc_class_hash_insert(&q->clhash, &cl->cl_common); list_add_tail(&cl->siblings, &parent->children); if (parent->level == 0) @@ -1533,7 +1547,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct hfsc_class *cl; - int uninitialized_var(err); + int err; bool first; cl = hfsc_classify(skb, sch, &err); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 8184c87da8be..4cdf2dd04701 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -405,7 +405,10 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) while (cl->cmode == HTB_MAY_BORROW && p && mask) { m = mask; while (m) { - int prio = ffz(~m); + unsigned int prio = ffz(~m); + + if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio))) + break; m &= ~(1 << prio); if (p->inner.clprio[prio].feed.rb_node) @@ -579,7 +582,7 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - int uninitialized_var(ret); + int ret; unsigned int len = qdisc_pkt_len(skb); struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = htb_classify(skb, sch, &ret); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index bf56aa519797..28168799ca1b 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -79,6 +79,9 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + if (sch->parent != TC_H_INGRESS) + return -EOPNOTSUPP; + net_inc_ingress_queue(); mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); @@ -94,6 +97,9 @@ static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); + if (sch->parent != TC_H_INGRESS) + return; + tcf_block_put_ext(q->block, sch, &q->block_info); net_dec_ingress_queue(); } @@ -127,7 +133,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", .priv_size = sizeof(struct ingress_sched_data), - .static_flags = TCQ_F_CPUSTATS, + .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -212,6 +218,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, struct net_device *dev = qdisc_dev(sch); int err; + if (sch->parent != TC_H_CLSACT) + return -EOPNOTSUPP; + net_inc_ingress_queue(); net_inc_egress_queue(); @@ -239,6 +248,9 @@ static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); + if (sch->parent != TC_H_CLSACT) + return; + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); @@ -260,7 +272,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .cl_ops = &clsact_class_ops, .id = "clsact", .priv_size = sizeof(struct clsact_sched_data), - .static_flags = TCQ_F_CPUSTATS, + .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS, .init = clsact_init, .destroy = clsact_destroy, .dump = ingress_dump, diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 50e15add6068..56d3dc5e95c7 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -130,6 +130,97 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } +static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt, + struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct mqprio_sched *priv = qdisc_priv(sch); + struct nlattr *tb[TCA_MQPRIO_MAX + 1]; + struct nlattr *attr; + int i, rem, err; + + err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, + sizeof(*qopt)); + if (err < 0) + return err; + + if (!qopt->hw) { + NL_SET_ERR_MSG(extack, + "mqprio TCA_OPTIONS can only contain netlink attributes in hardware mode"); + return -EINVAL; + } + + if (tb[TCA_MQPRIO_MODE]) { + priv->flags |= TC_MQPRIO_F_MODE; + priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); + } + + if (tb[TCA_MQPRIO_SHAPER]) { + priv->flags |= TC_MQPRIO_F_SHAPER; + priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); + } + + if (tb[TCA_MQPRIO_MIN_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) { + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MIN_RATE64], + "min_rate accepted only when shaper is in bw_rlimit mode"); + return -EINVAL; + } + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Attribute type expected to be TCA_MQPRIO_MIN_RATE64"); + return -EINVAL; + } + + if (nla_len(attr) != sizeof(u64)) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Attribute TCA_MQPRIO_MIN_RATE64 expected to have 8 bytes length"); + return -EINVAL; + } + + if (i >= qopt->num_tc) + break; + priv->min_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MIN_RATE; + } + + if (tb[TCA_MQPRIO_MAX_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) { + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MAX_RATE64], + "max_rate accepted only when shaper is in bw_rlimit mode"); + return -EINVAL; + } + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Attribute type expected to be TCA_MQPRIO_MAX_RATE64"); + return -EINVAL; + } + + if (nla_len(attr) != sizeof(u64)) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Attribute TCA_MQPRIO_MAX_RATE64 expected to have 8 bytes length"); + return -EINVAL; + } + + if (i >= qopt->num_tc) + break; + priv->max_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MAX_RATE; + } + + return 0; +} + static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -139,9 +230,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; - struct nlattr *tb[TCA_MQPRIO_MAX + 1]; - struct nlattr *attr; - int rem; int len; BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); @@ -166,55 +254,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); if (len > 0) { - err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, - sizeof(*qopt)); - if (err < 0) + err = mqprio_parse_nlattr(sch, qopt, opt, extack); + if (err) return err; - - if (!qopt->hw) - return -EINVAL; - - if (tb[TCA_MQPRIO_MODE]) { - priv->flags |= TC_MQPRIO_F_MODE; - priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); - } - - if (tb[TCA_MQPRIO_SHAPER]) { - priv->flags |= TC_MQPRIO_F_SHAPER; - priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); - } - - if (tb[TCA_MQPRIO_MIN_RATE64]) { - if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) - return -EINVAL; - i = 0; - nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], - rem) { - if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) - return -EINVAL; - if (i >= qopt->num_tc) - break; - priv->min_rate[i] = *(u64 *)nla_data(attr); - i++; - } - priv->flags |= TC_MQPRIO_F_MIN_RATE; - } - - if (tb[TCA_MQPRIO_MAX_RATE64]) { - if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) - return -EINVAL; - i = 0; - nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], - rem) { - if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) - return -EINVAL; - if (i >= qopt->num_tc) - break; - priv->max_rate[i] = *(u64 *)nla_data(attr); - i++; - } - priv->flags |= TC_MQPRIO_F_MAX_RATE; - } } /* pre-allocate qdisc, attachment can't fail */ diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 1802f134aa40..265a02b6ad09 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -773,12 +773,10 @@ static void dist_free(struct disttable *d) * signed 16 bit values. */ -static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, - const struct nlattr *attr) +static int get_dist_table(struct disttable **tbl, const struct nlattr *attr) { size_t n = nla_len(attr)/sizeof(__s16); const __s16 *data = nla_data(attr); - spinlock_t *root_lock; struct disttable *d; int i; @@ -793,13 +791,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, for (i = 0; i < n; i++) d->table[i] = data[i]; - root_lock = qdisc_root_sleeping_lock(sch); - - spin_lock_bh(root_lock); - swap(*tbl, d); - spin_unlock_bh(root_lock); - - dist_free(d); + *tbl = d; return 0; } @@ -956,6 +948,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, { struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_NETEM_MAX + 1]; + struct disttable *delay_dist = NULL; + struct disttable *slot_dist = NULL; struct tc_netem_qopt *qopt; struct clgstate old_clg; int old_loss_model = CLG_RANDOM; @@ -969,6 +963,19 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, if (ret < 0) return ret; + if (tb[TCA_NETEM_DELAY_DIST]) { + ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]); + if (ret) + goto table_free; + } + + if (tb[TCA_NETEM_SLOT_DIST]) { + ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]); + if (ret) + goto table_free; + } + + sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; old_loss_model = q->loss_model; @@ -977,26 +984,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); if (ret) { q->loss_model = old_loss_model; - return ret; + q->clg = old_clg; + goto unlock; } } else { q->loss_model = CLG_RANDOM; } - if (tb[TCA_NETEM_DELAY_DIST]) { - ret = get_dist_table(sch, &q->delay_dist, - tb[TCA_NETEM_DELAY_DIST]); - if (ret) - goto get_table_failure; - } - - if (tb[TCA_NETEM_SLOT_DIST]) { - ret = get_dist_table(sch, &q->slot_dist, - tb[TCA_NETEM_SLOT_DIST]); - if (ret) - goto get_table_failure; - } - + if (delay_dist) + swap(q->delay_dist, delay_dist); + if (slot_dist) + swap(q->slot_dist, slot_dist); sch->limit = qopt->limit; q->latency = PSCHED_TICKS2NS(qopt->latency); @@ -1044,15 +1042,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); - return ret; +unlock: + sch_tree_unlock(sch); -get_table_failure: - /* recover clg and loss_model, in case of - * q->clg and q->loss_model were modified - * in get_loss_clg() - */ - q->clg = old_clg; - q->loss_model = old_loss_model; +table_free: + dist_free(delay_dist); + dist_free(slot_dist); return ret; } diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index cbc2ebca4548..339990bb5981 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -210,7 +210,7 @@ static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .priv_size = sizeof(struct plug_sched_data), .enqueue = plug_enqueue, .dequeue = plug_dequeue, - .peek = qdisc_peek_head, + .peek = qdisc_peek_dequeued, .init = plug_init, .change = plug_change, .reset = qdisc_reset_queue, diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 1eb339d224ae..6e9e3405f26b 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -375,8 +375,13 @@ static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, u32 lmax) { struct qfq_sched *q = qdisc_priv(sch); - struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight); + struct qfq_aggregate *new_agg; + /* 'lmax' can range from [QFQ_MIN_LMAX, pktlen + stab overhead] */ + if (lmax > (1UL << QFQ_MTU_SHIFT)) + return -EINVAL; + + new_agg = qfq_find_agg(q, lmax, weight); if (new_agg == NULL) { /* create new aggregate */ new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); if (new_agg == NULL) @@ -421,15 +426,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } else weight = 1; - if (tb[TCA_QFQ_LMAX]) { + if (tb[TCA_QFQ_LMAX]) lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); - if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { - pr_notice("qfq: invalid max length %u\n", lmax); - return -EINVAL; - } - } else + else lmax = psched_mtu(qdisc_dev(sch)); + if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { + pr_notice("qfq: invalid max length %u\n", lmax); + return -EINVAL; + } + inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; @@ -969,10 +975,13 @@ static void qfq_update_eligible(struct qfq_sched *q) } /* Dequeue head packet of the head class in the DRR queue of the aggregate. */ -static void agg_dequeue(struct qfq_aggregate *agg, - struct qfq_class *cl, unsigned int len) +static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, + struct qfq_class *cl, unsigned int len) { - qdisc_dequeue_peeked(cl->qdisc); + struct sk_buff *skb = qdisc_dequeue_peeked(cl->qdisc); + + if (!skb) + return NULL; cl->deficit -= (int) len; @@ -982,6 +991,8 @@ static void agg_dequeue(struct qfq_aggregate *agg, cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); } + + return skb; } static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, @@ -1127,11 +1138,18 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch) if (!skb) return NULL; - qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; + + skb = agg_dequeue(in_serv_agg, cl, len); + + if (!skb) { + sch->q.qlen++; + return NULL; + } + + qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); - agg_dequeue(in_serv_agg, cl, len); /* If lmax is lowered, through qfq_change_class, for a class * owning pending packets with larger size than the new value * of lmax, then the following condition may hold. diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 7741f102be4a..476853ff6989 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -59,6 +59,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; + unsigned int len; int ret; q->vars.qavg = red_calc_qavg(&q->parms, @@ -94,9 +95,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; } + len = qdisc_pkt_len(skb); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; } else if (net_xmit_drop_count(ret)) { q->stats.pdrop++; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 4074c50ac3d7..3aa6b4dcb1c8 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -135,15 +135,15 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) } } -static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) { u32 sfbhash; - sfbhash = sfb_hash(skb, 0); + sfbhash = cb->hashes[0]; if (sfbhash) increment_one_qlen(sfbhash, 0, q); - sfbhash = sfb_hash(skb, 1); + sfbhash = cb->hashes[1]; if (sfbhash) increment_one_qlen(sfbhash, 1, q); } @@ -281,8 +281,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct sfb_sched_data *q = qdisc_priv(sch); + unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; + struct sfb_skb_cb cb; int i; u32 p_min = ~0; u32 minqlen = ~0; @@ -399,11 +401,12 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } enqueue: + memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; - increment_qlen(skb, q); + increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; qdisc_qstats_drop(sch); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b92bafaf83f3..d7f910610de9 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -349,7 +349,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) unsigned int hash, dropped; sfq_index x, qlen; struct sfq_slot *slot; - int uninitialized_var(ret); + int ret; struct sk_buff *head; int delta; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 4c26f7fb32b3..e4c4d23a1b53 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -65,6 +65,7 @@ struct taprio_sched { u32 flags; enum tk_offsets tk_offset; int clockid; + bool offloaded; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ @@ -1268,6 +1269,8 @@ static int taprio_enable_offload(struct net_device *dev, goto done; } + q->offloaded = true; + done: taprio_offload_free(offload); @@ -1282,12 +1285,9 @@ static int taprio_disable_offload(struct net_device *dev, struct tc_taprio_qopt_offload *offload; int err; - if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) + if (!q->offloaded) return 0; - if (!ops->ndo_setup_tc) - return -EOPNOTSUPP; - offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, @@ -1303,6 +1303,8 @@ static int taprio_disable_offload(struct net_device *dev, goto out; } + q->offloaded = false; + out: taprio_offload_free(offload); @@ -1620,6 +1622,7 @@ static void taprio_reset(struct Qdisc *sch) int i; hrtimer_cancel(&q->advance_timer); + if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) qdisc_reset(q->qdiscs[i]); @@ -1642,6 +1645,7 @@ static void taprio_destroy(struct Qdisc *sch) * happens in qdisc_create(), after taprio_init() has been called. */ hrtimer_cancel(&q->advance_timer); + qdisc_synchronize(sch); taprio_disable_offload(dev, q, NULL); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 5f72f3f916a5..a7f60bb2dd51 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -297,6 +297,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; struct Qdisc *child = NULL; + struct Qdisc *old = NULL; struct psched_ratecfg rate; struct psched_ratecfg peak; u64 max_size; @@ -388,7 +389,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (child) { qdisc_tree_flush_backlog(q->qdisc); - qdisc_put(q->qdisc); + old = q->qdisc; q->qdisc = child; } q->limit = qopt->limit; @@ -408,6 +409,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); sch_tree_unlock(sch); + qdisc_put(old); err = 0; done: return err; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index fb6f62264e87..2cdcb72c8826 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -224,9 +224,8 @@ static struct sctp_association *sctp_association_init( if (!sctp_ulpq_init(&asoc->ulpq, asoc)) goto fail_init; - if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, - 0, gfp)) - goto fail_init; + if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp)) + goto stream_free; /* Initialize default path MTU. */ asoc->pathmtu = sp->pathmtu; @@ -1157,8 +1156,7 @@ int sctp_assoc_update(struct sctp_association *asoc, /* Add any peer addresses from the new association. */ list_for_each_entry(trans, &new->peer.transport_addr_list, transports) - if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr) && - !sctp_assoc_add_peer(asoc, &trans->ipaddr, + if (!sctp_assoc_add_peer(asoc, &trans->ipaddr, GFP_ATOMIC, trans->state)) return -ENOMEM; diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 3b2d0bd616dd..6b97b734a16f 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -869,12 +869,17 @@ int sctp_auth_set_key(struct sctp_endpoint *ep, } list_del_init(&shkey->key_list); - sctp_auth_shkey_release(shkey); list_add(&cur_key->key_list, sh_keys); - if (asoc && asoc->active_key_id == auth_key->sca_keynumber) - sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL); + if (asoc && asoc->active_key_id == auth_key->sca_keynumber && + sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) { + list_del_init(&cur_key->key_list); + sctp_auth_shkey_release(cur_key); + list_add(&shkey->key_list, sh_keys); + return -ENOMEM; + } + sctp_auth_shkey_release(shkey); return 0; } @@ -908,8 +913,13 @@ int sctp_auth_set_active_key(struct sctp_endpoint *ep, return -EINVAL; if (asoc) { + __u16 active_key_id = asoc->active_key_id; + asoc->active_key_id = key_id; - sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL); + if (sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) { + asoc->active_key_id = active_key_id; + return -ENOMEM; + } } else ep->active_key_id = key_id; diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index a825e74d01fc..614bc081ca50 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -73,6 +73,12 @@ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, } } + /* If somehow no addresses were found that can be used with this + * scope, it's an error. + */ + if (list_empty(&dest->address_list)) + error = -ENETUNREACH; + out: if (error) sctp_bind_addr_clean(dest); diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 5a918e74bb82..2d0318a7352c 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -349,11 +349,9 @@ static int sctp_sock_filter(struct sctp_endpoint *ep, struct sctp_transport *tsp struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *r = commp->r; - struct sctp_association *assoc = - list_entry(ep->asocs.next, struct sctp_association, asocs); /* find the ep only once through the transports by this condition */ - if (tsp->asoc != assoc) + if (!list_is_first(&tsp->asoc->asocs, &ep->asocs)) return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 982a87b3e11f..963b94517ec2 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -284,7 +284,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) assoc->init_retries, assoc->shutdown_retries, assoc->rtx_data_chunks, refcount_read(&sk->sk_wmem_alloc), - sk->sk_wmem_queued, + READ_ONCE(sk->sk_wmem_queued), sk->sk_sndbuf, sk->sk_rcvbuf); seq_printf(seq, "\n"); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 8d32229199b9..c964e7ca6f7e 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1240,7 +1240,10 @@ static int sctp_side_effects(enum sctp_event_type event_type, default: pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n", status, state, event_type, subtype.chunk); - BUG(); + error = status; + if (error >= 0) + error = -EINVAL; + WARN_ON_ONCE(1); break; } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 1d2f633c6c7c..67df4022853b 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4379,7 +4379,7 @@ enum sctp_disposition sctp_sf_eat_auth(struct net *net, SCTP_AUTH_NEW_KEY, GFP_ATOMIC); if (!ev) - return -ENOMEM; + return SCTP_DISPOSITION_NOMEM; sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c76b40322ac7..cbcbc92748ba 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -68,7 +68,7 @@ #include <net/sctp/stream_sched.h> /* Forward declarations for internal helper functions. */ -static bool sctp_writeable(struct sock *sk); +static bool sctp_writeable(const struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, size_t msg_len); @@ -97,7 +97,7 @@ struct percpu_counter sctp_sockets_allocated; static void sctp_enter_memory_pressure(struct sock *sk) { - sctp_memory_pressure = 1; + WRITE_ONCE(sctp_memory_pressure, 1); } @@ -138,7 +138,7 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk); - sk->sk_wmem_queued += chunk->skb->truesize + sizeof(struct sctp_chunk); + sk_wmem_queued_add(sk, chunk->skb->truesize + sizeof(struct sctp_chunk)); sk_mem_charge(sk, chunk->skb->truesize); } @@ -362,9 +362,9 @@ static void sctp_auto_asconf_init(struct sctp_sock *sp) struct net *net = sock_net(&sp->inet.sk); if (net->sctp.default_auto_asconf) { - spin_lock(&net->sctp.addr_wq_lock); + spin_lock_bh(&net->sctp.addr_wq_lock); list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist); - spin_unlock(&net->sctp.addr_wq_lock); + spin_unlock_bh(&net->sctp.addr_wq_lock); sp->do_auto_asconf = 1; } } @@ -1850,6 +1850,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) goto err; + if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) { + err = -EINVAL; + goto err; + } } if (sctp_state(asoc, CLOSED)) { @@ -2482,6 +2486,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, if (trans) { trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + sctp_transport_reset_hb_timer(trans); } else if (asoc) { asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); @@ -5163,13 +5168,17 @@ static void sctp_destroy_sock(struct sock *sk) } /* Triggered when there are no references on the socket anymore */ -static void sctp_destruct_sock(struct sock *sk) +static void sctp_destruct_common(struct sock *sk) { struct sctp_sock *sp = sctp_sk(sk); /* Free up the HMAC transform. */ crypto_free_shash(sp->hmac); +} +static void sctp_destruct_sock(struct sock *sk) +{ + sctp_destruct_common(sk); inet_sock_destruct(sk); } @@ -8989,7 +8998,7 @@ static void sctp_wfree(struct sk_buff *skb) struct sock *sk = asoc->base.sk; sk_mem_uncharge(sk, skb->truesize); - sk->sk_wmem_queued -= skb->truesize + sizeof(struct sctp_chunk); + sk_wmem_queued_add(sk, -(skb->truesize + sizeof(struct sctp_chunk))); asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk); WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc)); @@ -9144,9 +9153,9 @@ void sctp_write_space(struct sock *sk) * UDP-style sockets or TCP-style sockets, this code should work. * - Daisy */ -static bool sctp_writeable(struct sock *sk) +static bool sctp_writeable(const struct sock *sk) { - return sk->sk_sndbuf > sk->sk_wmem_queued; + return READ_ONCE(sk->sk_sndbuf) > READ_ONCE(sk->sk_wmem_queued); } /* Wait for an association to go into ESTABLISHED state. If timeout is 0, @@ -9304,7 +9313,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, sctp_sk(newsk)->reuse = sp->reuse; newsk->sk_shutdown = sk->sk_shutdown; - newsk->sk_destruct = sctp_destruct_sock; + newsk->sk_destruct = sk->sk_destruct; newsk->sk_family = sk->sk_family; newsk->sk_protocol = IPPROTO_SCTP; newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; @@ -9535,11 +9544,20 @@ struct proto sctp_prot = { #if IS_ENABLED(CONFIG_IPV6) -#include <net/transp_v6.h> -static void sctp_v6_destroy_sock(struct sock *sk) +static void sctp_v6_destruct_sock(struct sock *sk) { - sctp_destroy_sock(sk); - inet6_destroy_sock(sk); + sctp_destruct_common(sk); + inet6_sock_destruct(sk); +} + +static int sctp_v6_init_sock(struct sock *sk) +{ + int ret = sctp_init_sock(sk); + + if (!ret) + sk->sk_destruct = sctp_v6_destruct_sock; + + return ret; } struct proto sctpv6_prot = { @@ -9549,8 +9567,8 @@ struct proto sctpv6_prot = { .disconnect = sctp_disconnect, .accept = sctp_accept, .ioctl = sctp_ioctl, - .init = sctp_init_sock, - .destroy = sctp_v6_destroy_sock, + .init = sctp_v6_init_sock, + .destroy = sctp_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, diff --git a/net/sctp/stream.c b/net/sctp/stream.c index cd20638b6151..ca8fdc9abca5 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -52,6 +52,19 @@ static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) } } +static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_sched_ops *sched; + + if (!SCTP_SO(stream, sid)->ext) + return; + + sched = sctp_sched_ops_from_stream(stream); + sched->free_sid(stream, sid); + kfree(SCTP_SO(stream, sid)->ext); + SCTP_SO(stream, sid)->ext = NULL; +} + /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. @@ -70,16 +83,14 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream, * sctp_stream_update will swap ->out pointers. */ for (i = 0; i < outcnt; i++) { - kfree(SCTP_SO(new, i)->ext); + sctp_stream_free_ext(new, i); SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; SCTP_SO(stream, i)->ext = NULL; } } - for (i = outcnt; i < stream->outcnt; i++) { - kfree(SCTP_SO(stream, i)->ext); - SCTP_SO(stream, i)->ext = NULL; - } + for (i = outcnt; i < stream->outcnt; i++) + sctp_stream_free_ext(stream, i); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, @@ -137,7 +148,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) - goto out_err; + return ret; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; @@ -145,22 +156,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, handle_in: sctp_stream_interleave_init(stream); if (!incnt) - goto out; + return 0; - ret = sctp_stream_alloc_in(stream, incnt, gfp); - if (ret) - goto in_err; - - goto out; - -in_err: - sched->free(stream); - genradix_free(&stream->in); -out_err: - genradix_free(&stream->out); - stream->outcnt = 0; -out: - return ret; + return sctp_stream_alloc_in(stream, incnt, gfp); } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) @@ -187,9 +185,9 @@ void sctp_stream_free(struct sctp_stream *stream) struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; - sched->free(stream); + sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) - kfree(SCTP_SO(stream, i)->ext); + sctp_stream_free_ext(stream, i); genradix_free(&stream->out); genradix_free(&stream->in); } diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 40c40be23fcb..c982f99099de 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -1165,7 +1165,8 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = chunk->subh.ifwdtsn_hdr->skip; \ - (void *)pos < (void *)chunk->subh.ifwdtsn_hdr->skip + (end); pos++) + (void *)pos <= (void *)chunk->subh.ifwdtsn_hdr->skip + (end) - \ + sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 99e5f69fbb74..33c2630c2496 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -46,6 +46,10 @@ static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, return 0; } +static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) +{ +} + static void sctp_sched_fcfs_free(struct sctp_stream *stream) { } @@ -96,6 +100,7 @@ static struct sctp_sched_ops sctp_sched_fcfs = { .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, + .free_sid = sctp_sched_fcfs_free_sid, .free = sctp_sched_fcfs_free, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, @@ -163,7 +168,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc, if (!SCTP_SO(&asoc->stream, i)->ext) continue; - ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); + ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC); if (ret) goto err; } diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 80b5a2c4cbc7..7dd9f8b387cc 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -25,6 +25,18 @@ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream); +static struct sctp_stream_priorities *sctp_sched_prio_head_get(struct sctp_stream_priorities *p) +{ + p->users++; + return p; +} + +static void sctp_sched_prio_head_put(struct sctp_stream_priorities *p) +{ + if (p && --p->users == 0) + kfree(p); +} + static struct sctp_stream_priorities *sctp_sched_prio_new_head( struct sctp_stream *stream, int prio, gfp_t gfp) { @@ -38,6 +50,7 @@ static struct sctp_stream_priorities *sctp_sched_prio_new_head( INIT_LIST_HEAD(&p->active); p->next = NULL; p->prio = prio; + p->users = 1; return p; } @@ -53,7 +66,7 @@ static struct sctp_stream_priorities *sctp_sched_prio_get_head( */ list_for_each_entry(p, &stream->prio_list, prio_sched) { if (p->prio == prio) - return p; + return sctp_sched_prio_head_get(p); if (p->prio > prio) break; } @@ -70,7 +83,7 @@ static struct sctp_stream_priorities *sctp_sched_prio_get_head( */ break; if (p->prio == prio) - return p; + return sctp_sched_prio_head_get(p); } /* If not even there, allocate a new one. */ @@ -154,32 +167,21 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, struct sctp_stream_out_ext *soute = sout->ext; struct sctp_stream_priorities *prio_head, *old; bool reschedule = false; - int i; + + old = soute->prio_head; + if (old && old->prio == prio) + return 0; prio_head = sctp_sched_prio_get_head(stream, prio, gfp); if (!prio_head) return -ENOMEM; reschedule = sctp_sched_prio_unsched(soute); - old = soute->prio_head; soute->prio_head = prio_head; if (reschedule) sctp_sched_prio_sched(stream, soute); - if (!old) - /* Happens when we set the priority for the first time */ - return 0; - - for (i = 0; i < stream->outcnt; i++) { - soute = SCTP_SO(stream, i)->ext; - if (soute && soute->prio_head == old) - /* It's still in use, nothing else to do here. */ - return 0; - } - - /* No hits, we are good to free it. */ - kfree(old); - + sctp_sched_prio_head_put(old); return 0; } @@ -204,6 +206,12 @@ static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, return sctp_sched_prio_set(stream, sid, 0, gfp); } +static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid) +{ + sctp_sched_prio_head_put(SCTP_SO(stream, sid)->ext->prio_head); + SCTP_SO(stream, sid)->ext->prio_head = NULL; +} + static void sctp_sched_prio_free(struct sctp_stream *stream) { struct sctp_stream_priorities *prio, *n; @@ -323,6 +331,7 @@ static struct sctp_sched_ops sctp_sched_prio = { .get = sctp_sched_prio_get, .init = sctp_sched_prio_init, .init_sid = sctp_sched_prio_init_sid, + .free_sid = sctp_sched_prio_free_sid, .free = sctp_sched_prio_free, .enqueue = sctp_sched_prio_enqueue, .dequeue = sctp_sched_prio_dequeue, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index ff425aed62c7..cc444fe0d67c 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -90,6 +90,10 @@ static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, return 0; } +static void sctp_sched_rr_free_sid(struct sctp_stream *stream, __u16 sid) +{ +} + static void sctp_sched_rr_free(struct sctp_stream *stream) { sctp_sched_rr_unsched_all(stream); @@ -177,6 +181,7 @@ static struct sctp_sched_ops sctp_sched_rr = { .get = sctp_sched_rr_get, .init = sctp_sched_rr_init, .init_sid = sctp_sched_rr_init_sid, + .free_sid = sctp_sched_rr_free_sid, .free = sctp_sched_rr_free, .enqueue = sctp_sched_rr_enqueue, .dequeue = sctp_sched_rr_dequeue, diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 394491692a07..e070b0e2a30c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -136,7 +136,7 @@ static int __smc_release(struct smc_sock *smc) if (!smc->use_fallback) { rc = smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); + smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { if (sk->sk_state != SMC_CLOSED) { @@ -928,7 +928,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; - sock_set_flag(new_sk, SOCK_DEAD); + smc_sock_set_flag(new_sk, SOCK_DEAD); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; @@ -1093,7 +1093,6 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; - sk_refcnt_debug_inc(newsmcsk); if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; @@ -1543,16 +1542,14 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = -EPIPE; + int rc; smc = smc_sk(sk); lock_sock(sk); - if ((sk->sk_state != SMC_ACTIVE) && - (sk->sk_state != SMC_APPCLOSEWAIT1) && - (sk->sk_state != SMC_INIT)) - goto out; + /* SMC does not support connect with fastopen */ if (msg->msg_flags & MSG_FASTOPEN) { + /* not connected yet, fallback */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; @@ -1560,6 +1557,11 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) rc = -EINVAL; goto out; } + } else if ((sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_INIT)) { + rc = -EPIPE; + goto out; } if (smc->use_fallback) diff --git a/net/smc/smc.h b/net/smc/smc.h index 878313f8d6c1..8c46eaf014a7 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -265,4 +265,9 @@ static inline bool using_ipsec(struct smc_sock *smc) struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); void smc_close_non_accepted(struct sock *sk); +static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag) +{ + set_bit(flag, &sk->sk_flags); +} + #endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index d0b0f4c865b4..68e1155126cf 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -298,7 +298,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smc->sk.sk_shutdown |= RCV_SHUTDOWN; if (smc->clcsock && smc->clcsock->sk) smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(&smc->sk, SOCK_DONE); + smc_sock_set_flag(&smc->sk, SOCK_DONE); sock_hold(&smc->sk); /* sock_put in close_work */ if (!schedule_work(&conn->close_work)) sock_put(&smc->sk); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 543948d970c5..a704234c1a2b 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -164,7 +164,7 @@ static void smc_close_active_abort(struct smc_sock *smc) break; } - sock_set_flag(sk, SOCK_DEAD); + smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); } diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index da9ba6d1679b..3d8f551cec30 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -168,7 +168,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, } if (smc->conn.lgr && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && - !list_empty(&smc->conn.lgr->list)) { + !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; diff --git a/net/socket.c b/net/socket.c index 94358566c9d1..e3a50f107d64 100644 --- a/net/socket.c +++ b/net/socket.c @@ -641,6 +641,14 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) return ret; } +static int __sock_sendmsg(struct socket *sock, struct msghdr *msg) +{ + int err = security_socket_sendmsg(sock, msg, + msg_data_left(msg)); + + return err ?: sock_sendmsg_nosec(sock, msg); +} + /** * sock_sendmsg - send a message through @sock * @sock: socket @@ -651,10 +659,21 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) */ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { - int err = security_socket_sendmsg(sock, msg, - msg_data_left(msg)); + struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name; + struct sockaddr_storage address; + int save_len = msg->msg_namelen; + int ret; - return err ?: sock_sendmsg_nosec(sock, msg); + if (msg->msg_name) { + memcpy(&address, msg->msg_name, msg->msg_namelen); + msg->msg_name = &address; + } + + ret = __sock_sendmsg(sock, msg); + msg->msg_name = save_addr; + msg->msg_namelen = save_len; + + return ret; } EXPORT_SYMBOL(sock_sendmsg); @@ -986,7 +1005,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = sock_sendmsg(sock, &msg); + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } @@ -1661,7 +1680,7 @@ int __sys_listen(int fd, int backlog) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { - somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; + somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); if ((unsigned int)backlog > somaxconn) backlog = somaxconn; @@ -1938,7 +1957,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; - err = sock_sendmsg(sock, &msg); + err = __sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); @@ -2283,7 +2302,7 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } - err = sock_sendmsg(sock, msg_sys); + err = __sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. @@ -2723,7 +2742,7 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, * error to return on the next call or if the * app asks about it using getsockopt(SO_ERROR). */ - sock->sk->sk_err = -err; + WRITE_ONCE(sock->sk->sk_err, -err); } out_put: fput_light(sock->file, fput_needed); @@ -3567,7 +3586,11 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { - return sock->ops->bind(sock, addr, addrlen); + struct sockaddr_storage address; + + memcpy(&address, addr, addrlen); + + return sock->ops->bind(sock, (struct sockaddr *)&address, addrlen); } EXPORT_SYMBOL(kernel_bind); @@ -3637,7 +3660,11 @@ EXPORT_SYMBOL(kernel_accept); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { - return sock->ops->connect(sock, addr, addrlen, flags); + struct sockaddr_storage address; + + memcpy(&address, addr, addrlen); + + return sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 0d4a2bb09589..be2fc88f5bde 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -288,10 +288,10 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags) } if (snprintf(portbuf, sizeof(portbuf), - ".%u.%u", port >> 8, port & 0xff) > (int)sizeof(portbuf)) + ".%u.%u", port >> 8, port & 0xff) >= (int)sizeof(portbuf)) return NULL; - if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) > sizeof(addrbuf)) + if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) >= sizeof(addrbuf)) return NULL; return kstrdup(addrbuf, gfp_flags); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index cdb05b48de44..e8fa21ad06a0 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -494,7 +494,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ - if (!time_in_range(cred->cr_expire, expired, jiffies)) + if (time_in_range(cred->cr_expire, expired, jiffies)) continue; if (!rpcauth_unhash_cred(cred)) continue; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index b7a71578bd98..4d3cf146f50a 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -301,7 +301,7 @@ __gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; - if (auth && pos->auth->service != auth->service) + if (pos->auth->service != auth->service) continue; refcount_inc(&pos->count); return pos; @@ -683,6 +683,21 @@ out: return err; } +static struct gss_upcall_msg * +gss_find_downcall(struct rpc_pipe *pipe, kuid_t uid) +{ + struct gss_upcall_msg *pos; + list_for_each_entry(pos, &pipe->in_downcall, list) { + if (!uid_eq(pos->uid, uid)) + continue; + if (!rpc_msg_is_inflight(&pos->msg)) + continue; + refcount_inc(&pos->count); + return pos; + } + return NULL; +} + #define MSG_BUF_MAXSIZE 1024 static ssize_t @@ -729,7 +744,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) err = -ENOENT; /* Find a matching upcall */ spin_lock(&pipe->lock); - gss_msg = __gss_find_upcall(pipe, uid, NULL); + gss_msg = gss_find_downcall(pipe, uid); if (gss_msg == NULL) { spin_unlock(&pipe->lock); goto err_put_ctx; diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 2ff7b7083eba..e265b8d38aa1 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -250,8 +250,8 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL); if (!creds) { - kfree(oa->data); - return -ENOMEM; + err = -ENOMEM; + goto free_oa; } oa->data[0].option.data = CREDS_VALUE; @@ -265,29 +265,40 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, /* option buffer */ p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto free_creds; + } length = be32_to_cpup(p); p = xdr_inline_decode(xdr, length); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto free_creds; + } if (length == sizeof(CREDS_VALUE) && memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) { /* We have creds here. parse them */ err = gssx_dec_linux_creds(xdr, creds); if (err) - return err; + goto free_creds; oa->data[0].value.len = 1; /* presence */ } else { /* consume uninteresting buffer */ err = gssx_dec_buffer(xdr, &dummy); if (err) - return err; + goto free_creds; } } return 0; + +free_creds: + kfree(creds); +free_oa: + kfree(oa->data); + oa->data = NULL; + return err; } static int gssx_dec_status(struct xdr_stream *xdr, diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index c0016473a255..1a869a203d52 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1104,18 +1104,23 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, return res; inlen = svc_getnl(argv); - if (inlen > (argv->iov_len + rqstp->rq_arg.page_len)) + if (inlen > (argv->iov_len + rqstp->rq_arg.page_len)) { + kfree(in_handle->data); return SVC_DENIED; + } pages = DIV_ROUND_UP(inlen, PAGE_SIZE); in_token->pages = kcalloc(pages, sizeof(struct page *), GFP_KERNEL); - if (!in_token->pages) + if (!in_token->pages) { + kfree(in_handle->data); return SVC_DENIED; + } in_token->page_base = 0; in_token->page_len = inlen; for (i = 0; i < pages; i++) { in_token->pages[i] = alloc_page(GFP_KERNEL); if (!in_token->pages[i]) { + kfree(in_handle->data); gss_free_in_token_pages(in_token); return SVC_DENIED; } diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 195b40c5dae4..266f3e5f55de 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -64,6 +64,17 @@ static void xprt_free_allocation(struct rpc_rqst *req) kfree(req); } +static void xprt_bc_reinit_xdr_buf(struct xdr_buf *buf) +{ + buf->head[0].iov_len = PAGE_SIZE; + buf->tail[0].iov_len = 0; + buf->pages = NULL; + buf->page_len = 0; + buf->flags = 0; + buf->len = 0; + buf->buflen = PAGE_SIZE; +} + static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) { struct page *page; @@ -292,6 +303,9 @@ void xprt_free_bc_rqst(struct rpc_rqst *req) */ spin_lock_bh(&xprt->bc_pa_lock); if (xprt_need_to_requeue(xprt)) { + xprt_bc_reinit_xdr_buf(&req->rq_snd_buf); + xprt_bc_reinit_xdr_buf(&req->rq_rcv_buf); + req->rq_rcv_buf.len = PAGE_SIZE; list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); xprt->bc_alloc_count++; atomic_inc(&xprt->bc_slot_count); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 08e1ccc01e98..9071dc6928ac 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1354,7 +1354,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, break; default: err = -EAFNOSUPPORT; - goto out; + goto out_release; } if (err < 0) { dprintk("RPC: can't bind UDP socket (%d)\n", err); @@ -1896,7 +1896,7 @@ call_encode(struct rpc_task *task) break; case -EKEYEXPIRED: if (!task->tk_cred_retry) { - rpc_exit(task, task->tk_status); + rpc_call_rpcerror(task, task->tk_status); } else { task->tk_action = call_refresh; task->tk_cred_retry--; @@ -2003,9 +2003,6 @@ call_bind_status(struct rpc_task *task) status = -EOPNOTSUPP; break; } - if (task->tk_rebind_retry == 0) - break; - task->tk_rebind_retry--; rpc_delay(task, 3*HZ); goto retry_timeout; case -ENOBUFS: @@ -2687,6 +2684,7 @@ out_msg_denied: case rpc_autherr_rejectedverf: case rpcsec_gsserr_credproblem: case rpcsec_gsserr_ctxproblem: + rpcauth_invalcred(task); if (!task->tk_cred_retry) break; task->tk_cred_retry--; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 32ffa801a5b9..a5c6a3d05741 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -827,7 +827,6 @@ rpc_init_task_statistics(struct rpc_task *task) /* Initialize retry counters */ task->tk_garb_retry = 2; task->tk_cred_retry = 2; - task->tk_rebind_retry = 2; /* starting timestamp */ task->tk_start = ktime_get(); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 5c04ba7d456b..5b47e3363239 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -428,14 +428,23 @@ static int unix_gid_hash(kuid_t uid) return hash_long(from_kuid(&init_user_ns, uid), GID_HASHBITS); } -static void unix_gid_put(struct kref *kref) +static void unix_gid_free(struct rcu_head *rcu) { - struct cache_head *item = container_of(kref, struct cache_head, ref); - struct unix_gid *ug = container_of(item, struct unix_gid, h); + struct unix_gid *ug = container_of(rcu, struct unix_gid, rcu); + struct cache_head *item = &ug->h; + if (test_bit(CACHE_VALID, &item->flags) && !test_bit(CACHE_NEGATIVE, &item->flags)) put_group_info(ug->gi); - kfree_rcu(ug, rcu); + kfree(ug); +} + +static void unix_gid_put(struct kref *kref) +{ + struct cache_head *item = container_of(kref, struct cache_head, ref); + struct unix_gid *ug = container_of(item, struct unix_gid, h); + + call_rcu(&ug->rcu, unix_gid_free); } static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d52abde51f1b..aeb0b3e48ad5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -728,12 +728,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) dprintk("svc: socket %p TCP (listen) state change %d\n", sk, sk->sk_state); - if (svsk) { - /* Refer to svc_setup_socket() for details. */ - rmb(); - svsk->sk_odata(sk); - } - /* * This callback may called twice when a new connection * is established as a child socket inherits everything @@ -742,15 +736,20 @@ static void svc_tcp_listen_data_ready(struct sock *sk) * when one of child sockets become ESTABLISHED. * 2) data_ready method of the child socket may be called * when it receives data before the socket is accepted. - * In case of 2, we should ignore it silently. + * In case of 2, we should ignore it silently and DO NOT + * dereference svsk. */ - if (sk->sk_state == TCP_LISTEN) { - if (svsk) { - set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } else - printk("svc: socket %p: no user data\n", sk); - } + if (sk->sk_state != TCP_LISTEN) + return; + + if (svsk) { + /* Refer to svc_setup_socket() for details. */ + rmb(); + svsk->sk_odata(sk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + } else + printk("svc: socket %p: no user data\n", sk); } /* diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 78c075a68c04..a11e80d17830 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -253,8 +253,9 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) return xprt_switch_find_current_entry(head, xpi->xpi_cursor); } -bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, - const struct sockaddr *sap) +static +bool __rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap) { struct list_head *head; struct rpc_xprt *pos; @@ -273,6 +274,18 @@ bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, return false; } +bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap) +{ + bool res; + + rcu_read_lock(); + res = __rpc_xprt_switch_has_addr(xps, sap); + rcu_read_unlock(); + + return res; +} + static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, const struct rpc_xprt *cur) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 0f4d39fdb48f..cfae1a871578 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1034,9 +1034,9 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, return req; out4: - kfree(req->rl_sendbuf); + rpcrdma_regbuf_free(req->rl_sendbuf); out3: - kfree(req->rl_rdmabuf); + rpcrdma_regbuf_free(req->rl_rdmabuf); out2: kfree(req); out1: diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 81f0e03b71b6..3095442b0382 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -496,8 +496,8 @@ xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, int flags, struct rpc_rqst *req) { struct xdr_buf *buf = &req->rq_private_buf; - size_t want, uninitialized_var(read); - ssize_t uninitialized_var(ret); + size_t want, read; + ssize_t ret; xs_read_header(transport, buf); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 577f71dd63fb..a0bc919e4e47 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1025,6 +1025,12 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) #ifdef CONFIG_TIPC_MEDIA_UDP if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) { + if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) { + rtnl_unlock(); + NL_SET_ERR_MSG(info->extack, "UDP option is unsupported"); + return -EINVAL; + } + err = tipc_udp_nl_bearer_add(b, attrs[TIPC_NLA_BEARER_UDP_OPTS]); if (err) { @@ -1195,7 +1201,7 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info) struct tipc_nl_msg msg; struct tipc_media *media; struct sk_buff *rep; - struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + struct nlattr *attrs[TIPC_NLA_MEDIA_MAX + 1]; if (!info->attrs[TIPC_NLA_MEDIA]) return -EINVAL; @@ -1244,7 +1250,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) int err; char *name; struct tipc_media *m; - struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + struct nlattr *attrs[TIPC_NLA_MEDIA_MAX + 1]; if (!info->attrs[TIPC_NLA_MEDIA]) return -EINVAL; diff --git a/net/tipc/core.c b/net/tipc/core.c index 90cf7e0bbaf0..58ee5ee70781 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -112,6 +112,15 @@ static void __net_exit tipc_exit_net(struct net *net) cond_resched(); } +static void __net_exit tipc_pernet_pre_exit(struct net *net) +{ + tipc_node_pre_cleanup_net(net); +} + +static struct pernet_operations tipc_pernet_pre_exit_ops = { + .pre_exit = tipc_pernet_pre_exit, +}; + static struct pernet_operations tipc_net_ops = { .init = tipc_init_net, .exit = tipc_exit_net, @@ -150,6 +159,10 @@ static int __init tipc_init(void) if (err) goto out_pernet_topsrv; + err = register_pernet_subsys(&tipc_pernet_pre_exit_ops); + if (err) + goto out_register_pernet_subsys; + err = tipc_bearer_setup(); if (err) goto out_bearer; @@ -170,6 +183,8 @@ out_netlink_compat: out_netlink: tipc_bearer_cleanup(); out_bearer: + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); +out_register_pernet_subsys: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: tipc_socket_stop(); @@ -187,6 +202,7 @@ static void __exit tipc_exit(void) tipc_netlink_compat_stop(); tipc_netlink_stop(); tipc_bearer_cleanup(); + unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); diff --git a/net/tipc/core.h b/net/tipc/core.h index c6bda91f8581..59f97ef12e60 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -59,6 +59,7 @@ #include <net/netns/generic.h> #include <linux/rhashtable.h> #include <net/genetlink.h> +#include <net/netns/hash.h> #ifdef pr_fmt #undef pr_fmt @@ -202,6 +203,11 @@ static inline int in_range(u16 val, u16 min, u16 max) return !less(val, min) && !more(val, max); } +static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand) +{ + return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand; +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index c138d68e8a69..9c64567f8a74 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -94,6 +94,7 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); + msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } @@ -146,8 +147,8 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, { struct net *net = d->net; struct tipc_net *tn = tipc_net(net); - bool trial = time_before(jiffies, tn->addr_trial_end); u32 self = tipc_own_addr(net); + bool trial = time_before(jiffies, tn->addr_trial_end) && !self; if (mtyp == DSC_TRIAL_FAIL_MSG) { if (!trial) @@ -193,6 +194,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, { struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); + u32 pnet_hash = msg_peer_net_hash(hdr); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; u32 sugg = msg_sugg_node_addr(hdr); @@ -208,7 +210,10 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, u32 self; int err; - skb_linearize(skb); + if (skb_linearize(skb)) { + kfree_skb(skb); + return; + } hdr = buf_msg(skb); if (caps & TIPC_NODE_ID128) @@ -241,7 +246,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, return; if (!tipc_in_scope(legacy, b->domain, src)) return; - tipc_node_check_dest(net, src, peer_id, b, caps, signature, + tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); diff --git a/net/tipc/link.c b/net/tipc/link.c index 8f2ee71c63c6..b653d16ab21f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1971,7 +1971,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (tipc_own_addr(l->net) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); - skb_linearize(skb); + if (skb_linearize(skb)) + goto exit; + hdr = buf_msg(skb); data = msg_data(hdr); diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index e7155a774300..0b9ad3b5ff18 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -130,7 +130,7 @@ static void map_set(u64 *up_map, int i, unsigned int v) static int map_get(u64 up_map, int i) { - return (up_map & (1 << i)) >> i; + return (up_map & (1ULL << i)) >> i; } static struct tipc_peer *peer_prev(struct tipc_peer *peer) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 0daa6f04ca81..6d692546acdc 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -358,6 +358,11 @@ static inline u32 msg_connected(struct tipc_msg *m) return msg_type(m) == TIPC_CONN_MSG; } +static inline u32 msg_direct(struct tipc_msg *m) +{ + return msg_type(m) == TIPC_DIRECT_MSG; +} + static inline u32 msg_errcode(struct tipc_msg *m) { return msg_bits(m, 1, 25, 0xf); @@ -1026,6 +1031,20 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } +/* Word 13 + */ +static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 13, n); +} + +static inline u32 msg_peer_net_hash(struct tipc_msg *m) +{ + return msg_word(m, 13); +} + +/* Word 14 + */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 661bc2551a0a..6ac84e7c8b63 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -146,7 +146,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index e9bbf4a00881..5c58aff4d4fa 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -87,7 +87,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_LINK_NAME] = { .type = NLA_STRING, + [TIPC_NLA_LINK_NAME] = { .type = NLA_NUL_STRING, .len = TIPC_MAX_LINK_NAME }, [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, @@ -118,7 +118,7 @@ const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_BEARER_NAME] = { .type = NLA_STRING, + [TIPC_NLA_BEARER_NAME] = { .type = NLA_NUL_STRING, .len = TIPC_MAX_BEARER_NAME }, [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 561ea834f732..5c61b8ee7fc0 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -101,6 +101,7 @@ static int tipc_add_tlv(struct sk_buff *skb, u16 type, void *data, u16 len) return -EMSGSIZE; skb_put(skb, TLV_SPACE(len)); + memset(tlv, 0, TLV_SPACE(len)); tlv->tlv_type = htons(type); tlv->tlv_len = htons(TLV_LENGTH(len)); if (len && data) @@ -857,7 +858,7 @@ static int tipc_nl_compat_name_table_dump_header(struct tipc_nl_compat_msg *msg) }; ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req); - if (TLV_GET_DATA_LEN(msg->req) < sizeof(struct tipc_name_table_query)) + if (TLV_GET_DATA_LEN(msg->req) < (int)sizeof(struct tipc_name_table_query)) return -EINVAL; depth = ntohl(ntq->depth); diff --git a/net/tipc/node.c b/net/tipc/node.c index c8f6177dd5a2..535a4a778640 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -126,6 +126,8 @@ struct tipc_node { struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; + struct net *peer_net; + u32 peer_hash_mix; }; /* Node FSM states and events: @@ -184,7 +186,7 @@ static struct tipc_link *node_active_link(struct tipc_node *n, int sel) return n->links[bearer_id].link; } -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; @@ -194,6 +196,14 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) if (unlikely(!n)) return mtu; + /* Allow MAX_MSG_SIZE when building connection oriented message + * if they are in the same core network + */ + if (n->peer_net && connected) { + tipc_node_put(n); + return mtu; + } + bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; @@ -360,8 +370,37 @@ static void tipc_node_write_unlock(struct tipc_node *n) } } +static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) +{ + int net_id = tipc_netid(n->net); + struct tipc_net *tn_peer; + struct net *tmp; + u32 hash_chk; + + if (n->peer_net) + return; + + for_each_net_rcu(tmp) { + tn_peer = tipc_net(tmp); + if (!tn_peer) + continue; + /* Integrity checking whether node exists in namespace or not */ + if (tn_peer->net_id != net_id) + continue; + if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) + continue; + hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); + if (hash_mixes ^ hash_chk) + continue; + n->peer_net = tmp; + n->peer_hash_mix = hash_mixes; + break; + } +} + static struct tipc_node *tipc_node_create(struct net *net, u32 addr, - u8 *peer_id, u16 capabilities) + u8 *peer_id, u16 capabilities, + u32 signature, u32 hash_mixes) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; @@ -372,6 +411,8 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr); if (n) { + if (n->peer_hash_mix ^ hash_mixes) + tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ @@ -389,6 +430,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -399,6 +441,10 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, n->addr = addr; memcpy(&n->peer_id, peer_id, 16); n->net = net; + n->peer_net = NULL; + n->peer_hash_mix = 0; + /* Assign kernel local namespace if exists */ + tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); @@ -979,7 +1025,7 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { @@ -989,8 +1035,9 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool addr_match = false; bool sign_match = false; bool link_up = false; + bool link_is_reset = false; bool accept_addr = false; - bool reset = true; + bool reset = false; char *if_name; unsigned long intv; u16 session; @@ -998,7 +1045,8 @@ void tipc_node_check_dest(struct net *net, u32 addr, *dupl_addr = false; *respond = false; - n = tipc_node_create(net, addr, peer_id, capabilities); + n = tipc_node_create(net, addr, peer_id, capabilities, signature, + hash_mixes); if (!n) return; @@ -1009,14 +1057,17 @@ void tipc_node_check_dest(struct net *net, u32 addr, /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); + link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { - /* All is fine. Do nothing. */ - reset = false; + /* All is fine. Ignore requests. */ + /* Peer node is not a container/local namespace */ + if (!n->peer_hash_mix) + n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; @@ -1038,6 +1089,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, */ accept_addr = true; *respond = true; + reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. Two possibilities: * - Delayed re-discovery; this link endpoint has already @@ -1069,6 +1121,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, n->signature = signature; accept_addr = true; *respond = true; + reset = true; } if (!accept_addr) @@ -1097,6 +1150,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); + link_is_reset = tipc_link_is_reset(l); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); @@ -1109,7 +1163,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); - if (reset && l && !tipc_link_is_reset(l)) + if (reset && !link_is_reset) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } @@ -1342,7 +1396,8 @@ static void node_lost_contact(struct tipc_node *n, /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; - + n->peer_net = NULL; + n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, @@ -1424,6 +1479,57 @@ msg_full: return -EMSGSIZE; } +static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) +{ + struct tipc_msg *hdr = buf_msg(skb_peek(list)); + struct sk_buff_head inputq; + + switch (msg_user(hdr)) { + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + if (msg_connected(hdr) || msg_named(hdr) || + msg_direct(hdr)) { + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + } + if (msg_mcast(hdr)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + return; + } + return; + case MSG_FRAGMENTER: + if (tipc_msg_assemble(list)) { + tipc_loopback_trace(peer_net, list); + skb_queue_head_init(&inputq); + tipc_sk_mcast_rcv(peer_net, list, &inputq); + __skb_queue_purge(list); + skb_queue_purge(&inputq); + } + return; + case GROUP_PROTOCOL: + case CONN_MANAGER: + tipc_loopback_trace(peer_net, list); + spin_lock_init(&list->lock); + tipc_sk_rcv(peer_net, list); + return; + case LINK_PROTOCOL: + case NAME_DISTRIBUTOR: + case TUNNEL_PROTOCOL: + case BCAST_PROTOCOL: + return; + default: + return; + }; +} + /** * tipc_node_xmit() is the general link level function for message sending * @net: the applicable net namespace @@ -1439,6 +1545,8 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; + bool node_up = false; + struct net *peer_net; int bearer_id; int rc; @@ -1455,6 +1563,22 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, return -EHOSTUNREACH; } + rcu_read_lock(); + tipc_node_read_lock(n); + node_up = node_is_up(n); + peer_net = n->peer_net; + tipc_node_read_unlock(n); + if (node_up && peer_net && check_net(peer_net)) { + /* xmit inner linux container */ + tipc_lxc_xmit(peer_net, list); + if (likely(skb_queue_empty(list))) { + rcu_read_unlock(); + tipc_node_put(n); + return 0; + } + } + rcu_read_unlock(); + tipc_node_read_lock(n); bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { @@ -2591,3 +2715,33 @@ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) return i; } + +void tipc_node_pre_cleanup_net(struct net *exit_net) +{ + struct tipc_node *n; + struct tipc_net *tn; + struct net *tmp; + + rcu_read_lock(); + for_each_net_rcu(tmp) { + if (tmp == exit_net) + continue; + tn = tipc_net(tmp); + if (!tn) + continue; + spin_lock_bh(&tn->node_list_lock); + list_for_each_entry_rcu(n, &tn->node_list, list) { + if (!n->peer_net) + continue; + if (n->peer_net != exit_net) + continue; + tipc_node_write_lock(n); + n->peer_net = NULL; + n->peer_hash_mix = 0; + tipc_node_write_unlock_fast(n); + break; + } + spin_unlock_bh(&tn->node_list_lock); + } + rcu_read_unlock(); +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 291d0ecd4101..30563c4f35d5 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -75,7 +75,7 @@ u32 tipc_node_get_addr(struct tipc_node *node); u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, - u16 capabilities, u32 signature, + u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); @@ -92,7 +92,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr); void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); -int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected); bool tipc_node_is_up(struct net *net, u32 addr); u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); @@ -107,4 +107,5 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb); +void tipc_node_pre_cleanup_net(struct net *exit_net); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 58c4d61d603f..cf4f90de566e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -866,7 +866,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, /* Build message as chain of buffers */ __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1407,7 +1407,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) } __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, true); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1547,7 +1547,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); tipc_set_sk_state(sk, TIPC_ESTABLISHED); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); - tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); + tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); __skb_queue_purge(&sk->sk_write_queue); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 444e1792d02c..88e8e8d69b60 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -177,7 +177,7 @@ static void tipc_conn_close(struct tipc_conn *con) conn_put(con); } -static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) +static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *sock) { struct tipc_conn *con; int ret; @@ -203,10 +203,12 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) } con->conid = ret; s->idr_in_use++; - spin_unlock_bh(&s->idr_lock); set_bit(CF_CONNECTED, &con->flags); con->server = s; + con->sock = sock; + conn_get(con); + spin_unlock_bh(&s->idr_lock); return con; } @@ -450,17 +452,24 @@ static void tipc_conn_data_ready(struct sock *sk) static void tipc_topsrv_accept(struct work_struct *work) { struct tipc_topsrv *srv = container_of(work, struct tipc_topsrv, awork); - struct socket *lsock = srv->listener; - struct socket *newsock; + struct socket *newsock, *lsock; struct tipc_conn *con; struct sock *newsk; int ret; + spin_lock_bh(&srv->idr_lock); + if (!srv->listener) { + spin_unlock_bh(&srv->idr_lock); + return; + } + lsock = srv->listener; + spin_unlock_bh(&srv->idr_lock); + while (1) { ret = kernel_accept(lsock, &newsock, O_NONBLOCK); if (ret < 0) return; - con = tipc_conn_alloc(srv); + con = tipc_conn_alloc(srv, newsock); if (IS_ERR(con)) { ret = PTR_ERR(con); sock_release(newsock); @@ -472,11 +481,11 @@ static void tipc_topsrv_accept(struct work_struct *work) newsk->sk_data_ready = tipc_conn_data_ready; newsk->sk_write_space = tipc_conn_write_space; newsk->sk_user_data = con; - con->sock = newsock; write_unlock_bh(&newsk->sk_callback_lock); /* Wake up receive process in case of 'SYN+' message */ newsk->sk_data_ready(newsk); + conn_put(con); } } @@ -489,7 +498,7 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk) read_lock_bh(&sk->sk_callback_lock); srv = sk->sk_user_data; - if (srv->listener) + if (srv) queue_work(srv->rcv_wq, &srv->awork); read_unlock_bh(&sk->sk_callback_lock); } @@ -568,19 +577,19 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, sub.seq.upper = upper; sub.timeout = TIPC_WAIT_FOREVER; sub.filter = filter; - *(u32 *)&sub.usr_handle = port; + *(u64 *)&sub.usr_handle = (u64)port; - con = tipc_conn_alloc(tipc_topsrv(net)); + con = tipc_conn_alloc(tipc_topsrv(net), NULL); if (IS_ERR(con)) return false; *conid = con->conid; - con->sock = NULL; rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); - if (rc >= 0) - return true; + if (rc) + conn_put(con); + conn_put(con); - return false; + return !rc; } void tipc_topsrv_kern_unsubscr(struct net *net, int conid) @@ -699,8 +708,9 @@ static void tipc_topsrv_stop(struct net *net) __module_get(lsock->sk->sk_prot_creator->owner); srv->listener = NULL; spin_unlock_bh(&srv->idr_lock); - sock_release(lsock); + tipc_topsrv_work_stop(srv); + sock_release(lsock); idr_destroy(&srv->conn_idr); kfree(srv); } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 7aba4ee77aba..cb51a2f46b11 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -371,13 +371,11 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, rc = -EINVAL; goto out; } - lock_sock(sk); memcpy(crypto_info_aes_gcm_128->iv, ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); - release_sock(sk); if (copy_to_user(optval, crypto_info_aes_gcm_128, sizeof(*crypto_info_aes_gcm_128))) @@ -395,13 +393,11 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, rc = -EINVAL; goto out; } - lock_sock(sk); memcpy(crypto_info_aes_gcm_256->iv, ctx->tx.iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE, TLS_CIPHER_AES_GCM_256_IV_SIZE); memcpy(crypto_info_aes_gcm_256->rec_seq, ctx->tx.rec_seq, TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE); - release_sock(sk); if (copy_to_user(optval, crypto_info_aes_gcm_256, sizeof(*crypto_info_aes_gcm_256))) @@ -421,6 +417,8 @@ static int do_tls_getsockopt(struct sock *sk, int optname, { int rc = 0; + lock_sock(sk); + switch (optname) { case TLS_TX: rc = do_tls_getsockopt_tx(sk, optval, optlen); @@ -429,6 +427,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, rc = -ENOPROTOOPT; break; } + + release_sock(sk); + return rc; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index af3be9a29d6d..910da98d6bfb 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -677,7 +677,7 @@ static int tls_push_record(struct sock *sk, int flags, struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec, *tmp = NULL; - u32 i, split_point, uninitialized_var(orig_end); + u32 i, split_point, orig_end; struct sk_msg *msg_pl, *msg_en; struct aead_request *req; bool split; @@ -807,7 +807,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, psock = sk_psock_get(sk); if (!psock || !policy) { err = tls_push_record(sk, flags, record_type); - if (err && sk->sk_err == EBADMSG) { + if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); err = -sk->sk_err; @@ -836,7 +836,7 @@ more_data: switch (psock->eval) { case __SK_PASS: err = tls_push_record(sk, flags, record_type); - if (err && sk->sk_err == EBADMSG) { + if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); err = -sk->sk_err; @@ -945,7 +945,9 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - mutex_lock(&tls_ctx->tx_lock); + ret = mutex_lock_interruptible(&tls_ctx->tx_lock); + if (ret) + return ret; lock_sock(sk); if (unlikely(msg->msg_controllen)) { @@ -1209,6 +1211,8 @@ alloc_payload: } sk_msg_page_add(msg_pl, page, copy, offset); + msg_pl->sg.copybreak = 0; + msg_pl->sg.curr = msg_pl->sg.end; sk_mem_charge(sk, copy); offset += copy; @@ -1279,7 +1283,9 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) return -EOPNOTSUPP; - mutex_lock(&tls_ctx->tx_lock); + ret = mutex_lock_interruptible(&tls_ctx->tx_lock); + if (ret) + return ret; lock_sock(sk); ret = tls_sw_do_sendpage(sk, page, offset, size, flags); release_sock(sk); @@ -1742,6 +1748,7 @@ int tls_sw_recvmsg(struct sock *sk, struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; struct sk_psock *psock; + int num_async, pending; unsigned char control = 0; ssize_t decrypted = 0; struct strp_msg *rxm; @@ -1754,8 +1761,6 @@ int tls_sw_recvmsg(struct sock *sk, bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; bool bpf_strp_enabled; - int num_async = 0; - int pending; flags |= nonblock; @@ -1772,17 +1777,18 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) { tls_err_abort(sk, err); goto end; - } else { - copied = err; } - if (len <= copied) - goto recv_end; + copied = err; + if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA)) + goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); len = len - copied; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + decrypted = 0; + num_async = 0; while (len && (decrypted + copied < target || ctx->recv_pkt)) { bool retain_skb = false; bool zc = false; @@ -2263,11 +2269,19 @@ static void tx_work_handler(struct work_struct *work) if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) return; - mutex_lock(&tls_ctx->tx_lock); - lock_sock(sk); - tls_tx_records(sk, -1); - release_sock(sk); - mutex_unlock(&tls_ctx->tx_lock); + + if (mutex_trylock(&tls_ctx->tx_lock)) { + lock_sock(sk); + tls_tx_records(sk, -1); + release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); + } else if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + /* Someone is holding the tx_lock, they will likely run Tx + * and cancel the work on their way out of the lock section. + * Schedule a long delay just in case. + */ + schedule_delayed_work(&ctx->tx_work.work, msecs_to_jiffies(10)); + } } void tls_sw_write_space(struct sock *sk, struct tls_context *ctx) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f33e90bd0683..53335989a6f0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -531,7 +531,7 @@ static void unix_release_sock(struct sock *sk, int embrion) /* Clear state */ unix_state_lock(sk); sock_orphan(sk); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); path = u->path; u->path.dentry = NULL; u->path.mnt = NULL; @@ -549,7 +549,7 @@ static void unix_release_sock(struct sock *sk, int embrion) if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); /* No more writes */ - skpair->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) skpair->sk_err = ECONNRESET; unix_state_unlock(skpair); @@ -589,7 +589,7 @@ static void unix_release_sock(struct sock *sk, int embrion) * What the above comment does talk about? --ANK(980817) */ - if (unix_tot_inflight) + if (READ_ONCE(unix_tot_inflight)) unix_gc(); /* Garbage collect fds */ } @@ -701,7 +701,7 @@ static int unix_set_peek_off(struct sock *sk, int val) if (mutex_lock_interruptible(&u->iolock)) return -EINTR; - sk->sk_peek_off = val; + WRITE_ONCE(sk->sk_peek_off, val); mutex_unlock(&u->iolock); return 0; @@ -809,11 +809,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; - u = unix_sk(sk); + u = unix_sk(sk); + u->inflight = 0; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ @@ -1118,13 +1118,11 @@ static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) unix_state_lock(sk1); return; } - if (sk1 < sk2) { - unix_state_lock(sk1); - unix_state_lock_nested(sk2); - } else { - unix_state_lock(sk2); - unix_state_lock_nested(sk1); - } + if (sk1 > sk2) + swap(sk1, sk2); + + unix_state_lock(sk1); + unix_state_lock_nested(sk2, U_LOCK_SECOND); } static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) @@ -1227,7 +1225,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo) sched = !sock_flag(other, SOCK_DEAD) && !(other->sk_shutdown & RCV_SHUTDOWN) && - unix_recvq_full(other); + unix_recvq_full_lockless(other); unix_state_unlock(other); @@ -1343,7 +1341,7 @@ restart: goto out_unlock; } - unix_state_lock_nested(sk); + unix_state_lock_nested(sk, U_LOCK_SECOND); if (sk->sk_state != st) { unix_state_unlock(sk); @@ -1979,6 +1977,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, if (false) { alloc_skb: + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); mutex_unlock(&unix_sk(other)->iolock); newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, @@ -2018,6 +2017,7 @@ alloc_skb: init_scm = false; } + spin_lock(&other->sk_receive_queue.lock); skb = skb_peek_tail(&other->sk_receive_queue); if (tail && tail == skb) { skb = newskb; @@ -2048,14 +2048,11 @@ alloc_skb: refcount_add(size, &sk->sk_wmem_alloc); if (newskb) { - err = unix_scm_to_skb(&scm, skb, false); - if (err) - goto err_state_unlock; - spin_lock(&other->sk_receive_queue.lock); + unix_scm_to_skb(&scm, skb, false); __skb_queue_tail(&other->sk_receive_queue, newskb); - spin_unlock(&other->sk_receive_queue.lock); } + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); mutex_unlock(&unix_sk(other)->iolock); @@ -2546,7 +2543,7 @@ static int unix_shutdown(struct socket *sock, int mode) ++mode; unix_state_lock(sk); - sk->sk_shutdown |= mode; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); other = unix_peer(sk); if (other) sock_hold(other); @@ -2563,7 +2560,7 @@ static int unix_shutdown(struct socket *sock, int mode) if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; unix_state_lock(other); - other->sk_shutdown |= peer_mode; + WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); unix_state_unlock(other); other->sk_state_change(other); if (peer_mode == SHUTDOWN_MASK) @@ -2682,16 +2679,18 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa { struct sock *sk = sock->sk; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; + shutdown = READ_ONCE(sk->sk_shutdown); /* exceptional events? */ if (sk->sk_err) mask |= EPOLLERR; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ @@ -2719,18 +2718,20 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk, *other; unsigned int writable; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; + shutdown = READ_ONCE(sk->sk_shutdown); /* exceptional events? */ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ diff --git a/net/unix/diag.c b/net/unix/diag.c index 9ff64f9df1f3..2975e7a061d0 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -83,7 +83,7 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb) * queue lock. With the other's queue locked it's * OK to lock the state. */ - unix_state_lock_nested(req); + unix_state_lock_nested(req, U_LOCK_DIAG); peer = unix_sk(req)->peer; buf[i++] = (peer ? sock_i_ino(peer) : 0); unix_state_unlock(req); @@ -113,14 +113,16 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql); } -static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb) +static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb, + struct user_namespace *user_ns) { - uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk)); + uid_t uid = from_kuid_munged(user_ns, sock_i_uid(sk)); return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid); } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, - u32 portid, u32 seq, u32 flags, int sk_ino) + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags, int sk_ino) { struct nlmsghdr *nlh; struct unix_diag_msg *rep; @@ -166,7 +168,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_UID) && - sk_diag_dump_uid(sk, skb)) + sk_diag_dump_uid(sk, skb, user_ns)) goto out_nlmsg_trim; nlmsg_end(skb, nlh); @@ -178,7 +180,8 @@ out_nlmsg_trim: } static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, - u32 portid, u32 seq, u32 flags) + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags) { int sk_ino; @@ -189,7 +192,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_r if (!sk_ino) return 0; - return sk_diag_fill(sk, skb, req, portid, seq, flags, sk_ino); + return sk_diag_fill(sk, skb, req, user_ns, portid, seq, flags, sk_ino); } static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) @@ -217,7 +220,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) goto next; if (!(req->udiag_states & (1 << sk->sk_state))) goto next; - if (sk_diag_dump(sk, skb, req, + if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) @@ -285,7 +288,8 @@ again: if (!rep) goto out; - err = sk_diag_fill(sk, rep, req, NETLINK_CB(in_skb).portid, + err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, req->udiag_ino); if (err < 0) { nlmsg_free(rep); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d45d5366115a..133ba5be4b58 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -166,17 +166,18 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *), static void dec_inflight(struct unix_sock *usk) { - atomic_long_dec(&usk->inflight); + usk->inflight--; } static void inc_inflight(struct unix_sock *usk) { - atomic_long_inc(&usk->inflight); + usk->inflight++; } static void inc_inflight_move_tail(struct unix_sock *u) { - atomic_long_inc(&u->inflight); + u->inflight++; + /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over @@ -198,12 +199,13 @@ void wait_for_unix_gc(void) if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && !READ_ONCE(gc_in_progress)) unix_gc(); - wait_event(unix_gc_wait, gc_in_progress == false); + wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress)); } /* The external entry point: unix_gc() */ void unix_gc(void) { + struct sk_buff *next_skb, *skb; struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; @@ -233,20 +235,34 @@ void unix_gc(void) * receive queues. Other, non candidate sockets _can_ be * added to queue, so we must make sure only to touch * candidates. + * + * Embryos, though never candidates themselves, affect which + * candidates are reachable by the garbage collector. Before + * being added to a listener's queue, an embryo may already + * receive data carrying SCM_RIGHTS, potentially making the + * passed socket a candidate that is not yet reachable by the + * collector. It becomes reachable once the embryo is + * enqueued. Therefore, we must ensure that no SCM-laden + * embryo appears in a (candidate) listener's queue between + * consecutive scan_children() calls. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { + struct sock *sk = &u->sk; long total_refs; - long inflight_refs; - total_refs = file_count(u->sk.sk_socket->file); - inflight_refs = atomic_long_read(&u->inflight); + total_refs = file_count(sk->sk_socket->file); - BUG_ON(inflight_refs < 1); - BUG_ON(total_refs < inflight_refs); - if (total_refs == inflight_refs) { + BUG_ON(!u->inflight); + BUG_ON(total_refs < u->inflight); + if (total_refs == u->inflight) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); + + if (sk->sk_state == TCP_LISTEN) { + unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); + unix_state_unlock(sk); + } } } @@ -270,7 +286,7 @@ void unix_gc(void) /* Move cursor to after the current position. */ list_move(&cursor, &u->link); - if (atomic_long_read(&u->inflight) > 0) { + if (u->inflight) { list_move_tail(&u->link, ¬_cycle_list); __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL); @@ -297,11 +313,30 @@ void unix_gc(void) spin_unlock(&unix_gc_lock); + /* We need io_uring to clean its registered files, ignore all io_uring + * originated skbs. It's fine as io_uring doesn't keep references to + * other io_uring instances and so killing all other files in the cycle + * will put all io_uring references forcing it to go through normal + * release.path eventually putting registered files. + */ + skb_queue_walk_safe(&hitlist, skb, next_skb) { + if (skb->scm_io_uring) { + __skb_unlink(skb, &hitlist); + skb_queue_tail(&skb->sk->sk_receive_queue, skb); + } + } + /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); spin_lock(&unix_gc_lock); + /* There could be io_uring registered files, just push them back to + * the inflight list + */ + list_for_each_entry_safe(u, next, &gc_candidates, link) + list_move_tail(&u->link, &gc_inflight_list); + /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); diff --git a/net/unix/scm.c b/net/unix/scm.c index ce700b22ecce..785e8c4669e2 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -33,10 +33,8 @@ struct sock *unix_get_socket(struct file *filp) /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) u_sock = s; - } else { - /* Could be an io_uring instance */ - u_sock = io_uring_get_socket(filp); } + return u_sock; } EXPORT_SYMBOL(unix_get_socket); @@ -53,16 +51,17 @@ void unix_inflight(struct user_struct *user, struct file *fp) if (s) { struct unix_sock *u = unix_sk(s); - if (atomic_long_inc_return(&u->inflight) == 1) { + if (!u->inflight) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); } else { BUG_ON(list_empty(&u->link)); } + u->inflight++; /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } - user->unix_inflight++; + WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); spin_unlock(&unix_gc_lock); } @@ -75,15 +74,16 @@ void unix_notinflight(struct user_struct *user, struct file *fp) if (s) { struct unix_sock *u = unix_sk(s); - BUG_ON(!atomic_long_read(&u->inflight)); + BUG_ON(!u->inflight); BUG_ON(list_empty(&u->link)); - if (atomic_long_dec_and_test(&u->inflight)) + u->inflight--; + if (!u->inflight) list_del_init(&u->link); /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); } - user->unix_inflight--; + WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); spin_unlock(&unix_gc_lock); } @@ -97,7 +97,7 @@ static inline bool too_many_unix_fds(struct task_struct *p) { struct user_struct *user = current_user(); - if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE))) + if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); return false; } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index d60d7caacbf5..4cd65a1a07f9 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1110,6 +1110,7 @@ static void vsock_connect_timeout(struct work_struct *work) if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { sk->sk_state = TCP_CLOSE; + sk->sk_socket->state = SS_UNCONNECTED; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); vsock_transport_cancel_pkt(vsk); @@ -1207,7 +1208,14 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, * timeout fires. */ sock_hold(sk); - schedule_delayed_work(&vsk->connect_work, timeout); + + /* If the timeout function is already scheduled, + * reschedule it, then ungrab the socket refcount to + * keep it balanced. + */ + if (mod_delayed_work(system_wq, &vsk->connect_work, + timeout)) + sock_put(sk); /* Skip ahead to preserve error code set above. */ goto out_wait; @@ -1224,7 +1232,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, vsock_transport_cancel_pkt(vsk); vsock_remove_connected(vsk); goto out_wait; - } else if (timeout == 0) { + } else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) { err = -ETIMEDOUT; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index dde16a033a09..434c5608a75d 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -276,6 +276,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt; size_t bytes, total = 0; u32 free_space; + u32 fwd_cnt_delta; + bool low_rx_bytes; int err = -EFAULT; spin_lock_bh(&vvs->rx_lock); @@ -307,7 +309,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } } - free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt); + fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; + free_space = vvs->buf_alloc - fwd_cnt_delta; + low_rx_bytes = (vvs->rx_bytes < + sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX)); spin_unlock_bh(&vvs->rx_lock); @@ -317,9 +322,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * too high causes extra messages. Too low causes transmitter * stalls. As stalls are in theory more expensive than extra * messages, we set the limit to a high value. TODO: experiment - * with different values. + * with different values. Also send credit update message when + * number of bytes in rx queue is not enough to wake up reader. */ - if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { + if (fwd_cnt_delta && + (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes)) { virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, NULL); @@ -372,7 +379,7 @@ static s64 virtio_transport_has_space(struct vsock_sock *vsk) struct virtio_vsock_sock *vvs = vsk->trans; s64 bytes; - bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); if (bytes < 0) bytes = 0; @@ -1146,7 +1153,7 @@ EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) { - kfree(pkt->buf); + kvfree(pkt->buf); kfree(pkt); } EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index aaabcd84268a..85488e19dffc 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1725,7 +1725,11 @@ static int vmci_transport_dgram_enqueue( if (!dg) return -ENOMEM; - memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len); + err = memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len); + if (err) { + kfree(dg); + return err; + } dg->dst = vmci_make_handle(remote_addr->svm_cid, remote_addr->svm_port); diff --git a/net/wireless/certs/wens.hex b/net/wireless/certs/wens.hex new file mode 100644 index 000000000000..0d50369bede9 --- /dev/null +++ b/net/wireless/certs/wens.hex @@ -0,0 +1,87 @@ +/* Chen-Yu Tsai's regdb certificate */ +0x30, 0x82, 0x02, 0xa7, 0x30, 0x82, 0x01, 0x8f, +0x02, 0x14, 0x61, 0xc0, 0x38, 0x65, 0x1a, 0xab, +0xdc, 0xf9, 0x4b, 0xd0, 0xac, 0x7f, 0xf0, 0x6c, +0x72, 0x48, 0xdb, 0x18, 0xc6, 0x00, 0x30, 0x0d, +0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, +0x01, 0x01, 0x0b, 0x05, 0x00, 0x30, 0x0f, 0x31, +0x0d, 0x30, 0x0b, 0x06, 0x03, 0x55, 0x04, 0x03, +0x0c, 0x04, 0x77, 0x65, 0x6e, 0x73, 0x30, 0x20, +0x17, 0x0d, 0x32, 0x33, 0x31, 0x32, 0x30, 0x31, +0x30, 0x37, 0x34, 0x31, 0x31, 0x34, 0x5a, 0x18, +0x0f, 0x32, 0x31, 0x32, 0x33, 0x31, 0x31, 0x30, +0x37, 0x30, 0x37, 0x34, 0x31, 0x31, 0x34, 0x5a, +0x30, 0x0f, 0x31, 0x0d, 0x30, 0x0b, 0x06, 0x03, +0x55, 0x04, 0x03, 0x0c, 0x04, 0x77, 0x65, 0x6e, +0x73, 0x30, 0x82, 0x01, 0x22, 0x30, 0x0d, 0x06, +0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, +0x01, 0x01, 0x05, 0x00, 0x03, 0x82, 0x01, 0x0f, +0x00, 0x30, 0x82, 0x01, 0x0a, 0x02, 0x82, 0x01, +0x01, 0x00, 0xa9, 0x7a, 0x2c, 0x78, 0x4d, 0xa7, +0x19, 0x2d, 0x32, 0x52, 0xa0, 0x2e, 0x6c, 0xef, +0x88, 0x7f, 0x15, 0xc5, 0xb6, 0x69, 0x54, 0x16, +0x43, 0x14, 0x79, 0x53, 0xb7, 0xae, 0x88, 0xfe, +0xc0, 0xb7, 0x5d, 0x47, 0x8e, 0x1a, 0xe1, 0xef, +0xb3, 0x90, 0x86, 0xda, 0xd3, 0x64, 0x81, 0x1f, +0xce, 0x5d, 0x9e, 0x4b, 0x6e, 0x58, 0x02, 0x3e, +0xb2, 0x6f, 0x5e, 0x42, 0x47, 0x41, 0xf4, 0x2c, +0xb8, 0xa8, 0xd4, 0xaa, 0xc0, 0x0e, 0xe6, 0x48, +0xf0, 0xa8, 0xce, 0xcb, 0x08, 0xae, 0x37, 0xaf, +0xf6, 0x40, 0x39, 0xcb, 0x55, 0x6f, 0x5b, 0x4f, +0x85, 0x34, 0xe6, 0x69, 0x10, 0x50, 0x72, 0x5e, +0x4e, 0x9d, 0x4c, 0xba, 0x38, 0x36, 0x0d, 0xce, +0x73, 0x38, 0xd7, 0x27, 0x02, 0x2a, 0x79, 0x03, +0xe1, 0xac, 0xcf, 0xb0, 0x27, 0x85, 0x86, 0x93, +0x17, 0xab, 0xec, 0x42, 0x77, 0x37, 0x65, 0x8a, +0x44, 0xcb, 0xd6, 0x42, 0x93, 0x92, 0x13, 0xe3, +0x39, 0x45, 0xc5, 0x6e, 0x00, 0x4a, 0x7f, 0xcb, +0x42, 0x17, 0x2b, 0x25, 0x8c, 0xb8, 0x17, 0x3b, +0x15, 0x36, 0x59, 0xde, 0x42, 0xce, 0x21, 0xe6, +0xb6, 0xc7, 0x6e, 0x5e, 0x26, 0x1f, 0xf7, 0x8a, +0x57, 0x9e, 0xa5, 0x96, 0x72, 0xb7, 0x02, 0x32, +0xeb, 0x07, 0x2b, 0x73, 0xe2, 0x4f, 0x66, 0x58, +0x9a, 0xeb, 0x0f, 0x07, 0xb6, 0xab, 0x50, 0x8b, +0xc3, 0x8f, 0x17, 0xfa, 0x0a, 0x99, 0xc2, 0x16, +0x25, 0xbf, 0x2d, 0x6b, 0x1a, 0xaa, 0xe6, 0x3e, +0x5f, 0xeb, 0x6d, 0x9b, 0x5d, 0x4d, 0x42, 0x83, +0x2d, 0x39, 0xb8, 0xc9, 0xac, 0xdb, 0x3a, 0x91, +0x50, 0xdf, 0xbb, 0xb1, 0x76, 0x6d, 0x15, 0x73, +0xfd, 0xc6, 0xe6, 0x6b, 0x71, 0x9e, 0x67, 0x36, +0x22, 0x83, 0x79, 0xb1, 0xd6, 0xb8, 0x84, 0x52, +0xaf, 0x96, 0x5b, 0xc3, 0x63, 0x02, 0x4e, 0x78, +0x70, 0x57, 0x02, 0x03, 0x01, 0x00, 0x01, 0x30, +0x0d, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, +0x0d, 0x01, 0x01, 0x0b, 0x05, 0x00, 0x03, 0x82, +0x01, 0x01, 0x00, 0x24, 0x28, 0xee, 0x22, 0x74, +0x7f, 0x7c, 0xfa, 0x6c, 0x1f, 0xb3, 0x18, 0xd1, +0xc2, 0x3d, 0x7d, 0x29, 0x42, 0x88, 0xad, 0x82, +0xa5, 0xb1, 0x8a, 0x05, 0xd0, 0xec, 0x5c, 0x91, +0x20, 0xf6, 0x82, 0xfd, 0xd5, 0x67, 0x60, 0x5f, +0x31, 0xf5, 0xbd, 0x88, 0x91, 0x70, 0xbd, 0xb8, +0xb9, 0x8c, 0x88, 0xfe, 0x53, 0xc9, 0x54, 0x9b, +0x43, 0xc4, 0x7a, 0x43, 0x74, 0x6b, 0xdd, 0xb0, +0xb1, 0x3b, 0x33, 0x45, 0x46, 0x78, 0xa3, 0x1c, +0xef, 0x54, 0x68, 0xf7, 0x85, 0x9c, 0xe4, 0x51, +0x6f, 0x06, 0xaf, 0x81, 0xdb, 0x2a, 0x7b, 0x7b, +0x6f, 0xa8, 0x9c, 0x67, 0xd8, 0xcb, 0xc9, 0x91, +0x40, 0x00, 0xae, 0xd9, 0xa1, 0x9f, 0xdd, 0xa6, +0x43, 0x0e, 0x28, 0x7b, 0xaa, 0x1b, 0xe9, 0x84, +0xdb, 0x76, 0x64, 0x42, 0x70, 0xc9, 0xc0, 0xeb, +0xae, 0x84, 0x11, 0x16, 0x68, 0x4e, 0x84, 0x9e, +0x7e, 0x92, 0x36, 0xee, 0x1c, 0x3b, 0x08, 0x63, +0xeb, 0x79, 0x84, 0x15, 0x08, 0x9d, 0xaf, 0xc8, +0x9a, 0xc7, 0x34, 0xd3, 0x94, 0x4b, 0xd1, 0x28, +0x97, 0xbe, 0xd1, 0x45, 0x75, 0xdc, 0x35, 0x62, +0xac, 0x1d, 0x1f, 0xb7, 0xb7, 0x15, 0x87, 0xc8, +0x98, 0xc0, 0x24, 0x31, 0x56, 0x8d, 0xed, 0xdb, +0x06, 0xc6, 0x46, 0xbf, 0x4b, 0x6d, 0xa6, 0xd5, +0xab, 0xcc, 0x60, 0xfc, 0xe5, 0x37, 0xb6, 0x53, +0x7d, 0x58, 0x95, 0xa9, 0x56, 0xc7, 0xf7, 0xee, +0xc3, 0xa0, 0x76, 0xf7, 0x65, 0x4d, 0x53, 0xfa, +0xff, 0x5f, 0x76, 0x33, 0x5a, 0x08, 0xfa, 0x86, +0x92, 0x5a, 0x13, 0xfa, 0x1a, 0xfc, 0xf2, 0x1b, +0x8c, 0x7f, 0x42, 0x6d, 0xb7, 0x7e, 0xb7, 0xb4, +0xf0, 0xc7, 0x83, 0xbb, 0xa2, 0x81, 0x03, 0x2d, +0xd4, 0x2a, 0x63, 0x3f, 0xf7, 0x31, 0x2e, 0x40, +0x33, 0x5c, 0x46, 0xbc, 0x9b, 0xc1, 0x05, 0xa5, +0x45, 0x4e, 0xc3, diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index 76b845f68ac8..d80b06d66959 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -65,9 +65,10 @@ static ssize_t ht40allow_map_read(struct file *file, { struct wiphy *wiphy = file->private_data; char *buf; - unsigned int offset = 0, buf_size = PAGE_SIZE, i, r; + unsigned int offset = 0, buf_size = PAGE_SIZE, i; enum nl80211_band band; struct ieee80211_supported_band *sband; + ssize_t r; buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8459f5b6002e..c698fc458f5f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3350,6 +3350,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * if_idx++; } + if_start = 0; wp_idx++; } out: @@ -3526,6 +3527,8 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (ntype != NL80211_IFTYPE_MESH_POINT) return -EINVAL; + if (otype != NL80211_IFTYPE_MESH_POINT) + return -EINVAL; if (netif_running(dev)) return -EBUSY; @@ -6914,7 +6917,7 @@ static int nl80211_update_mesh_config(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - struct mesh_config cfg; + struct mesh_config cfg = {}; u32 mask; int err; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 74caece77963..1f5ea82b58bf 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1060,6 +1060,8 @@ static void regdb_fw_cb(const struct firmware *fw, void *context) static int query_regdb_file(const char *alpha2) { + int err; + ASSERT_RTNL(); if (regdb) @@ -1069,9 +1071,13 @@ static int query_regdb_file(const char *alpha2) if (!alpha2) return -ENOMEM; - return request_firmware_nowait(THIS_MODULE, true, "regulatory.db", - ®_pdev->dev, GFP_KERNEL, - (void *)alpha2, regdb_fw_cb); + err = request_firmware_nowait(THIS_MODULE, true, "regulatory.db", + ®_pdev->dev, GFP_KERNEL, + (void *)alpha2, regdb_fw_cb); + if (err) + kfree(alpha2); + + return err; } int reg_reload_regdb(void) @@ -3964,8 +3970,10 @@ static int __init regulatory_init_db(void) return -EINVAL; err = load_builtin_regdb_keys(); - if (err) + if (err) { + platform_device_unregister(reg_pdev); return err; + } /* We always try to get an update for the static regdomain */ err = regulatory_hint_core(cfg80211_world_regdom->alpha2); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 6bb9437af28b..a1c53d4b6711 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -104,18 +104,12 @@ static inline void bss_ref_get(struct cfg80211_registered_device *rdev, lockdep_assert_held(&rdev->bss_lock); bss->refcount++; - if (bss->pub.hidden_beacon_bss) { - bss = container_of(bss->pub.hidden_beacon_bss, - struct cfg80211_internal_bss, - pub); - bss->refcount++; - } - if (bss->pub.transmitted_bss) { - bss = container_of(bss->pub.transmitted_bss, - struct cfg80211_internal_bss, - pub); - bss->refcount++; - } + + if (bss->pub.hidden_beacon_bss) + bss_from_pub(bss->pub.hidden_beacon_bss)->refcount++; + + if (bss->pub.transmitted_bss) + bss_from_pub(bss->pub.transmitted_bss)->refcount++; } static inline void bss_ref_put(struct cfg80211_registered_device *rdev, @@ -229,114 +223,152 @@ bool cfg80211_is_element_inherited(const struct element *elem, } EXPORT_SYMBOL(cfg80211_is_element_inherited); -static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, - const u8 *subelement, size_t subie_len, - u8 *new_ie, gfp_t gfp) +static size_t cfg80211_copy_elem_with_frags(const struct element *elem, + const u8 *ie, size_t ie_len, + u8 **pos, u8 *buf, size_t buf_len) { - u8 *pos, *tmp; - const u8 *tmp_old, *tmp_new; - const struct element *non_inherit_elem; - u8 *sub_copy; + if (WARN_ON((u8 *)elem < ie || elem->data > ie + ie_len || + elem->data + elem->datalen > ie + ie_len)) + return 0; - /* copy subelement as we need to change its content to - * mark an ie after it is processed. - */ - sub_copy = kmemdup(subelement, subie_len, gfp); - if (!sub_copy) + if (elem->datalen + 2 > buf + buf_len - *pos) return 0; - pos = &new_ie[0]; + memcpy(*pos, elem, elem->datalen + 2); + *pos += elem->datalen + 2; - /* set new ssid */ - tmp_new = cfg80211_find_ie(WLAN_EID_SSID, sub_copy, subie_len); - if (tmp_new) { - memcpy(pos, tmp_new, tmp_new[1] + 2); - pos += (tmp_new[1] + 2); + /* Finish if it is not fragmented */ + if (elem->datalen != 255) + return *pos - buf; + + ie_len = ie + ie_len - elem->data - elem->datalen; + ie = (const u8 *)elem->data + elem->datalen; + + for_each_element(elem, ie, ie_len) { + if (elem->id != WLAN_EID_FRAGMENT) + break; + + if (elem->datalen + 2 > buf + buf_len - *pos) + return 0; + + memcpy(*pos, elem, elem->datalen + 2); + *pos += elem->datalen + 2; + + if (elem->datalen != 255) + break; } - /* get non inheritance list if exists */ - non_inherit_elem = - cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - sub_copy, subie_len); + return *pos - buf; +} - /* go through IEs in ie (skip SSID) and subelement, - * merge them into new_ie +static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, + const u8 *subie, size_t subie_len, + u8 *new_ie, size_t new_ie_len) +{ + const struct element *non_inherit_elem, *parent, *sub; + u8 *pos = new_ie; + u8 id, ext_id; + unsigned int match_len; + + non_inherit_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + subie, subie_len); + + /* We copy the elements one by one from the parent to the generated + * elements. + * If they are not inherited (included in subie or in the non + * inheritance element), then we copy all occurrences the first time + * we see this element type. */ - tmp_old = cfg80211_find_ie(WLAN_EID_SSID, ie, ielen); - tmp_old = (tmp_old) ? tmp_old + tmp_old[1] + 2 : ie; - - while (tmp_old + tmp_old[1] + 2 - ie <= ielen) { - if (tmp_old[0] == 0) { - tmp_old++; + for_each_element(parent, ie, ielen) { + if (parent->id == WLAN_EID_FRAGMENT) continue; + + if (parent->id == WLAN_EID_EXTENSION) { + if (parent->datalen < 1) + continue; + + id = WLAN_EID_EXTENSION; + ext_id = parent->data[0]; + match_len = 1; + } else { + id = parent->id; + match_len = 0; } - if (tmp_old[0] == WLAN_EID_EXTENSION) - tmp = (u8 *)cfg80211_find_ext_ie(tmp_old[2], sub_copy, - subie_len); - else - tmp = (u8 *)cfg80211_find_ie(tmp_old[0], sub_copy, - subie_len); + /* Find first occurrence in subie */ + sub = cfg80211_find_elem_match(id, subie, subie_len, + &ext_id, match_len, 0); - if (!tmp) { - const struct element *old_elem = (void *)tmp_old; + /* Copy from parent if not in subie and inherited */ + if (!sub && + cfg80211_is_element_inherited(parent, non_inherit_elem)) { + if (!cfg80211_copy_elem_with_frags(parent, + ie, ielen, + &pos, new_ie, + new_ie_len)) + return 0; - /* ie in old ie but not in subelement */ - if (cfg80211_is_element_inherited(old_elem, - non_inherit_elem)) { - memcpy(pos, tmp_old, tmp_old[1] + 2); - pos += tmp_old[1] + 2; - } - } else { - /* ie in transmitting ie also in subelement, - * copy from subelement and flag the ie in subelement - * as copied (by setting eid field to WLAN_EID_SSID, - * which is skipped anyway). - * For vendor ie, compare OUI + type + subType to - * determine if they are the same ie. - */ - if (tmp_old[0] == WLAN_EID_VENDOR_SPECIFIC) { - if (!memcmp(tmp_old + 2, tmp + 2, 5)) { - /* same vendor ie, copy from - * subelement - */ - memcpy(pos, tmp, tmp[1] + 2); - pos += tmp[1] + 2; - tmp[0] = WLAN_EID_SSID; - } else { - memcpy(pos, tmp_old, tmp_old[1] + 2); - pos += tmp_old[1] + 2; - } - } else { - /* copy ie from subelement into new ie */ - memcpy(pos, tmp, tmp[1] + 2); - pos += tmp[1] + 2; - tmp[0] = WLAN_EID_SSID; - } + continue; } - if (tmp_old + tmp_old[1] + 2 - ie == ielen) - break; + /* Already copied if an earlier element had the same type */ + if (cfg80211_find_elem_match(id, ie, (u8 *)parent - ie, + &ext_id, match_len, 0)) + continue; - tmp_old += tmp_old[1] + 2; + /* Not inheriting, copy all similar elements from subie */ + while (sub) { + if (!cfg80211_copy_elem_with_frags(sub, + subie, subie_len, + &pos, new_ie, + new_ie_len)) + return 0; + + sub = cfg80211_find_elem_match(id, + sub->data + sub->datalen, + subie_len + subie - + (sub->data + + sub->datalen), + &ext_id, match_len, 0); + } } - /* go through subelement again to check if there is any ie not - * copied to new ie, skip ssid, capability, bssid-index ie + /* The above misses elements that are included in subie but not in the + * parent, so do a pass over subie and append those. + * Skip the non-tx BSSID caps and non-inheritance element. */ - tmp_new = sub_copy; - while (tmp_new + tmp_new[1] + 2 - sub_copy <= subie_len) { - if (!(tmp_new[0] == WLAN_EID_NON_TX_BSSID_CAP || - tmp_new[0] == WLAN_EID_SSID)) { - memcpy(pos, tmp_new, tmp_new[1] + 2); - pos += tmp_new[1] + 2; + for_each_element(sub, subie, subie_len) { + if (sub->id == WLAN_EID_NON_TX_BSSID_CAP) + continue; + + if (sub->id == WLAN_EID_FRAGMENT) + continue; + + if (sub->id == WLAN_EID_EXTENSION) { + if (sub->datalen < 1) + continue; + + id = WLAN_EID_EXTENSION; + ext_id = sub->data[0]; + match_len = 1; + + if (ext_id == WLAN_EID_EXT_NON_INHERITANCE) + continue; + } else { + id = sub->id; + match_len = 0; } - if (tmp_new + tmp_new[1] + 2 - sub_copy == subie_len) - break; - tmp_new += tmp_new[1] + 2; + + /* Processed if one was included in the parent */ + if (cfg80211_find_elem_match(id, ie, ielen, + &ext_id, match_len, 0)) + continue; + + if (!cfg80211_copy_elem_with_frags(sub, subie, subie_len, + &pos, new_ie, new_ie_len)) + return 0; } - kfree(sub_copy); return pos - new_ie; } @@ -390,6 +422,15 @@ cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, rcu_read_unlock(); + /* + * This is a bit weird - it's not on the list, but already on another + * one! The only way that could happen is if there's some BSSID/SSID + * shared by multiple APs in their multi-BSSID profiles, potentially + * with hidden SSID mixed in ... ignore it. + */ + if (!list_empty(&nontrans_bss->nontrans_list)) + return -EINVAL; + /* add to the list */ list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list); return 0; @@ -1094,6 +1135,23 @@ struct cfg80211_non_tx_bss { u8 bssid_index; }; +static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, + const struct cfg80211_bss_ies *new_ies, + const struct cfg80211_bss_ies *old_ies) +{ + struct cfg80211_internal_bss *bss; + + /* Assign beacon IEs to all sub entries */ + list_for_each_entry(bss, &known->hidden_list, hidden_list) { + const struct cfg80211_bss_ies *ies; + + ies = rcu_access_pointer(bss->pub.beacon_ies); + WARN_ON(ies != old_ies); + + rcu_assign_pointer(bss->pub.beacon_ies, new_ies); + } +} + static bool cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *known, @@ -1117,7 +1175,6 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } else if (rcu_access_pointer(new->pub.beacon_ies)) { const struct cfg80211_bss_ies *old; - struct cfg80211_internal_bss *bss; if (known->pub.hidden_beacon_bss && !list_empty(&known->hidden_list)) { @@ -1145,16 +1202,9 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, if (old == rcu_access_pointer(known->pub.ies)) rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies); - /* Assign beacon IEs to all sub entries */ - list_for_each_entry(bss, &known->hidden_list, hidden_list) { - const struct cfg80211_bss_ies *ies; - - ies = rcu_access_pointer(bss->pub.beacon_ies); - WARN_ON(ies != old); - - rcu_assign_pointer(bss->pub.beacon_ies, - new->pub.beacon_ies); - } + cfg80211_update_hidden_bsses(known, + rcu_access_pointer(new->pub.beacon_ies), + old); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); @@ -1231,6 +1281,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, new->refcount = 1; INIT_LIST_HEAD(&new->hidden_list); INIT_LIST_HEAD(&new->pub.nontrans_list); + /* we'll set this later if it was non-NULL */ + new->pub.transmitted_bss = NULL; if (rcu_access_pointer(tmp->pub.proberesp_ies)) { hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_ZLEN); @@ -1242,8 +1294,12 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, list_add(&new->hidden_list, &hidden->hidden_list); hidden->refcount++; + + ies = (void *)rcu_access_pointer(new->pub.beacon_ies); rcu_assign_pointer(new->pub.beacon_ies, hidden->pub.beacon_ies); + if (ies) + kfree_rcu(ies, rcu_head); } } else { /* @@ -1460,10 +1516,15 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, spin_lock_bh(&rdev->bss_lock); if (cfg80211_add_nontrans_list(non_tx_data->tx_bss, &res->pub)) { - if (__cfg80211_unlink_bss(rdev, res)) + if (__cfg80211_unlink_bss(rdev, res)) { rdev->bss_generation++; + res = NULL; + } } spin_unlock_bh(&rdev->bss_lock); + + if (!res) + return NULL; } trace_cfg80211_return_bss(&res->pub); @@ -1582,6 +1643,8 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, ie, ielen) { if (elem->datalen < 4) continue; + if (elem->data[0] < 1 || (int)elem->data[0] > 8) + continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 profile_len; @@ -1635,7 +1698,7 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, new_ie_len = cfg80211_gen_new_ie(ie, ielen, profile, profile_len, new_ie, - gfp); + IEEE80211_MAX_DATA_LEN); if (!new_ie_len) continue; @@ -1717,7 +1780,7 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, size_t new_ie_len; struct cfg80211_bss_ies *new_ies; const struct cfg80211_bss_ies *old; - u8 cpy_len; + size_t cpy_len; lockdep_assert_held(&wiphy_to_rdev(wiphy)->bss_lock); @@ -1784,6 +1847,8 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, } else { old = rcu_access_pointer(nontrans_bss->beacon_ies); rcu_assign_pointer(nontrans_bss->beacon_ies, new_ies); + cfg80211_update_hidden_bsses(bss_from_pub(nontrans_bss), + new_ies, old); rcu_assign_pointer(nontrans_bss->ies, new_ies); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 63f89687a018..a260cd60a7b9 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -269,6 +269,15 @@ void cfg80211_conn_work(struct work_struct *work) rtnl_unlock(); } +static void cfg80211_step_auth_next(struct cfg80211_conn *conn, + struct cfg80211_bss *bss) +{ + memcpy(conn->bssid, bss->bssid, ETH_ALEN); + conn->params.bssid = conn->bssid; + conn->params.channel = bss->channel; + conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; +} + /* Returned bss is reference counted and must be cleaned up appropriately. */ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) { @@ -286,10 +295,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) if (!bss) return NULL; - memcpy(wdev->conn->bssid, bss->bssid, ETH_ALEN); - wdev->conn->params.bssid = wdev->conn->bssid; - wdev->conn->params.channel = bss->channel; - wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; + cfg80211_step_auth_next(wdev->conn, bss); schedule_work(&rdev->conn_work); return bss; @@ -568,7 +574,12 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, wdev->conn->params.ssid_len = wdev->ssid_len; /* see if we have the bss already */ - bss = cfg80211_get_conn_bss(wdev); + bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, + wdev->conn->params.bssid, + wdev->conn->params.ssid, + wdev->conn->params.ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (prev_bssid) { memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN); @@ -579,6 +590,7 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, if (bss) { enum nl80211_timeout_reason treason; + cfg80211_step_auth_next(wdev->conn, bss); err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { @@ -1233,6 +1245,13 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, } else { if (WARN_ON(connkeys)) return -EINVAL; + + /* connect can point to wdev->wext.connect which + * can hold key data from a previous connection + */ + connect->key = NULL; + connect->key_len = 0; + connect->key_idx = 0; } wdev->connect_keys = connkeys; diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 76a80a41615b..a57f54bc0e1a 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -796,6 +796,12 @@ static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd, } } + /* Sanity-check to ensure we never end up _allocating_ zero + * bytes of data for extra. + */ + if (extra_size <= 0) + return -EFAULT; + /* kzalloc() ensures NULL-termination for essid_compat. */ extra = kzalloc(extra_size, GFP_KERNEL); if (!extra) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index c94aa587e0c9..851096110b4d 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -470,12 +470,12 @@ static int x25_getsockopt(struct socket *sock, int level, int optname, if (get_user(len, optlen)) goto out; - len = min_t(unsigned int, len, sizeof(int)); - rc = -EINVAL; if (len < 0) goto out; + len = min_t(unsigned int, len, sizeof(int)); + rc = -EFAULT; if (put_user(len, optlen)) goto out; @@ -492,6 +492,12 @@ static int x25_listen(struct socket *sock, int backlog) int rc = -EOPNOTSUPP; lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + rc = -EINVAL; + release_sock(sk); + return rc; + } + if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 25bf72ee6cad..226397add422 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -117,7 +117,7 @@ int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, if (!pskb_may_pull(skb, 1)) { x25_neigh_put(nb); - return 0; + goto drop; } switch (skb->data[0]) { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 2bc0d6e3e124..d04a2345bc3f 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -613,6 +613,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; + int bound_dev_if; u32 flags, qid; int err = 0; @@ -626,6 +627,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) XDP_USE_NEED_WAKEUP)) return -EINVAL; + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) + return -EINVAL; + rtnl_lock(); mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index fbc4552d17b8..6e5e307f985e 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -3,6 +3,8 @@ # Makefile for the XFRM subsystem. # +xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o + obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ xfrm_sysctl.o xfrm_replay.o xfrm_device.o diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index e120df0a6da1..4d8d7cf3d199 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -274,8 +274,7 @@ static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), - ipipv6_hdr(skb)); + ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface_core.c index 4cfa79e04e3d..82c0c0575074 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface_core.c @@ -219,8 +219,8 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) skb->dev = dev; if (err) { - dev->stats.rx_errors++; - dev->stats.rx_dropped++; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_dropped); return 0; } @@ -260,7 +260,6 @@ static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); - struct net_device_stats *stats = &xi->dev->stats; struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; @@ -286,7 +285,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) tdev = dst->dev; if (tdev == dev) { - stats->collisions++; + DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; @@ -329,13 +328,13 @@ xmit: tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { - stats->tx_errors++; - stats->tx_aborted_errors++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); @@ -345,7 +344,6 @@ tx_err_dst_release: static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - struct net_device_stats *stats = &xi->dev->stats; struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; @@ -354,23 +352,23 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) switch (skb->protocol) { case htons(ETH_P_IPV6): - xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + xfrm_decode_session(skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, dst); } break; case htons(ETH_P_IP): - xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + xfrm_decode_session(skb, &fl, AF_INET); if (!dst) { struct rtable *rt; @@ -378,7 +376,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); @@ -397,8 +395,8 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_err: - stats->tx_errors++; - stats->tx_dropped++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 4d422447aadc..4fca4b6cec8b 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -212,6 +212,7 @@ static void ipcomp_free_scratches(void) vfree(*per_cpu_ptr(scratches, i)); free_percpu(scratches); + ipcomp_scratches = NULL; } static void * __percpu *ipcomp_alloc_scratches(void) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 28a8cdef8e51..9484f27e905a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1369,8 +1369,6 @@ EXPORT_SYMBOL(xfrm_policy_hash_rebuild); * of an absolute inpredictability of ordering of rules. This will not pass. */ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { - static u32 idx_generator; - for (;;) { struct hlist_head *list; struct xfrm_policy *p; @@ -1378,8 +1376,8 @@ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) int found; if (!index) { - idx = (idx_generator | dir); - idx_generator += 8; + idx = (net->xfrm.idx_generator | dir); + net->xfrm.idx_generator += 8; } else { idx = index; index = 0; @@ -3223,7 +3221,7 @@ xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) static inline int xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, - unsigned short family) + unsigned short family, u32 if_id) { if (xfrm_state_kern(x)) return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); @@ -3234,7 +3232,8 @@ xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) || !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && !(x->props.mode != XFRM_MODE_TRANSPORT && - xfrm_state_addr_cmp(tmpl, x, family)); + xfrm_state_addr_cmp(tmpl, x, family)) && + (if_id == 0 || if_id == x->if_id); } /* @@ -3246,7 +3245,7 @@ xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, */ static inline int xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start, - unsigned short family) + unsigned short family, u32 if_id) { int idx = start; @@ -3256,7 +3255,7 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star } else start = -1; for (; idx < sp->len; idx++) { - if (xfrm_state_ok(tmpl, sp->xvec[idx], family)) + if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { if (start == -1) @@ -3619,6 +3618,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); + xfrm_pol_put(pols[0]); return 0; } pols[1]->curlft.use_time = ktime_get_real_seconds(); @@ -3665,7 +3665,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, * are implied between each two transformations. */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { - k = xfrm_policy_ok(tpp[i], sp, k, family); + k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); if (k < 0) { if (k < -1) /* "-2 - errored_index" returned */ diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index bee1a8143d75..e8be18bff096 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2511,9 +2511,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) if (inner_mode == NULL) goto error; - if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) - goto error; - x->inner_mode = *inner_mode; if (x->props.family == AF_INET) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index bd44a800e7db..f76033d1898a 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -522,7 +522,7 @@ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs, struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; - if (re) { + if (re && x->replay_esn && x->preplay_esn) { struct xfrm_replay_state_esn *replay_esn; replay_esn = nla_data(re); memcpy(x->replay_esn, replay_esn, @@ -1037,6 +1037,15 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) sizeof(*filter), GFP_KERNEL); if (filter == NULL) return -ENOMEM; + + /* see addr_match(), (prefix length >> 5) << 2 + * will be used to compare xfrm_address_t + */ + if (filter->splen > (sizeof(xfrm_address_t) << 3) || + filter->dplen > (sizeof(xfrm_address_t) << 3)) { + kfree(filter); + return -EINVAL; + } } if (attrs[XFRMA_PROTO]) @@ -1695,6 +1704,9 @@ static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb) if (xp->xfrm_nr == 0) return 0; + if (xp->xfrm_nr > XFRM_MAX_DEPTH) + return -ENOBUFS; + for (i = 0; i < xp->xfrm_nr; i++) { struct xfrm_user_tmpl *up = &vec[i]; struct xfrm_tmpl *kp = &xp->xfrm_vec[i]; @@ -2574,7 +2586,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_ALG_COMP] = { .len = sizeof(struct xfrm_algo) }, [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, [XFRMA_TMPL] = { .len = sizeof(struct xfrm_user_tmpl) }, - [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) }, + [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_user_sec_ctx) }, [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, |