diff options
Diffstat (limited to 'net/netfilter')
48 files changed, 1023 insertions, 468 deletions
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 8acc4e173167..ecb3d67b41bf 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -61,9 +61,9 @@ mtype_destroy(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); - ip_set_free(map->members); if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); + ip_set_free(map->members); ip_set_free(map); set->data = NULL; @@ -76,7 +76,7 @@ mtype_flush(struct ip_set *set) if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); - memset(map->members, 0, map->memsize); + bitmap_zero(map->members, map->elements); set->elements = 0; set->ext_size = 0; } diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index e3884b0cca91..79d1a8e554dc 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -37,7 +37,7 @@ MODULE_ALIAS("ip_set_bitmap:ip"); /* Type structure */ struct bitmap_ip { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -220,7 +220,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, u32 first_ip, u32 last_ip, u32 elements, u32 hosts, u8 netmask) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -310,11 +310,11 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ip; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { - kfree(map); + ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index cfe7b556775f..0aac330ef78b 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -43,7 +43,7 @@ enum { /* Type structure */ struct bitmap_ipmac { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -300,7 +300,7 @@ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, u32 first_ip, u32 last_ip, u32 elements) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -361,10 +361,10 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { - kfree(map); + ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index d8c140553379..044310ccf0a4 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -31,7 +31,7 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -205,7 +205,7 @@ static bool init_map_port(struct ip_set *set, struct bitmap_port *map, u16 first_port, u16 last_port) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(map->elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_port = first_port; @@ -245,10 +245,10 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], return -ENOMEM; map->elements = elements; - map->memsize = bitmap_bytes(0, map->elements); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_port; if (!init_map_port(set, map, first_port, last_port)) { - kfree(map); + ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e103c875383a..cb1930fd803d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -296,7 +296,8 @@ ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) return -IPSET_ERR_PROTOCOL; @@ -314,7 +315,8 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) return -IPSET_ERR_PROTOCOL; @@ -380,6 +382,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; + if (align < ip_set_extensions[id].align) + align = ip_set_extensions[id].align; len = ALIGN(len, ip_set_extensions[id].align); set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; @@ -555,6 +559,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index) return set; } +static inline void +ip_set_lock(struct ip_set *set) +{ + if (!set->variant->region_lock) + spin_lock_bh(&set->lock); +} + +static inline void +ip_set_unlock(struct ip_set *set) +{ + if (!set->variant->region_lock) + spin_unlock_bh(&set->lock); +} + int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) @@ -576,9 +594,9 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); - spin_lock_bh(&set->lock); + ip_set_lock(set); set->variant->kadt(set, skb, par, IPSET_ADD, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); ret = 1; } else { /* --return-nomatch: invert matched element */ @@ -607,9 +625,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); return ret; } @@ -629,9 +647,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); return ret; } @@ -934,7 +952,8 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && - nla_parse_nested_deprecated(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy, NULL)) { + nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], + set->type->create_policy, NULL)) { ret = -IPSET_ERR_PROTOCOL; goto put_out; } @@ -1095,9 +1114,9 @@ ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); - spin_lock_bh(&set->lock); + ip_set_lock(set); set->variant->flush(set); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); } static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, @@ -1281,30 +1300,43 @@ dump_attrs(struct nlmsghdr *nlh) } } +static const struct nla_policy +ip_set_dump_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_FLAGS] = { .type = NLA_U32 }, +}; + static int -dump_init(struct netlink_callback *cb, struct ip_set_net *inst) +ip_set_dump_start(struct netlink_callback *cb) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; + struct sk_buff *skb = cb->skb; + struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type; - ip_set_id_t index; + int ret; - /* Second pass, so parser can't fail */ - nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, - nlh->nlmsg_len - min_len, ip_set_setname_policy, - NULL); + ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, + nlh->nlmsg_len - min_len, + ip_set_dump_policy, NULL); + if (ret) + goto error; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { + ip_set_id_t index; struct ip_set *set; set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); - if (!set) - return -ENOENT; - + if (!set) { + ret = -ENOENT; + goto error; + } dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; } else { @@ -1320,10 +1352,17 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) cb->args[IPSET_CB_DUMP] = dump_type; return 0; + +error: + /* We have to create and send the error message manually :-( */ + if (nlh->nlmsg_flags & NLM_F_ACK) { + netlink_ack(cb->skb, nlh, ret, NULL); + } + return ret; } static int -ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) +ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb) { ip_set_id_t index = IPSET_INVALID_ID, max; struct ip_set *set = NULL; @@ -1334,18 +1373,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) bool is_destroyed; int ret = 0; - if (!cb->args[IPSET_CB_DUMP]) { - ret = dump_init(cb, inst); - if (ret < 0) { - nlh = nlmsg_hdr(cb->skb); - /* We have to create and send the error message - * manually :-( - */ - if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(cb->skb, nlh, ret, NULL); - return ret; - } - } + if (!cb->args[IPSET_CB_DUMP]) + return -EINVAL; if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max) goto out; @@ -1481,7 +1510,8 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, { struct netlink_dump_control c = { - .dump = ip_set_dump_start, + .start = ip_set_dump_start, + .dump = ip_set_dump_do, .done = ip_set_dump_done, }; return netlink_dump_start(ctnl, skb, nlh, &c); @@ -1509,9 +1539,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); retried = true; } while (ret == -EAGAIN && set->variant->resize && @@ -1541,9 +1571,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, - nlh->nlmsg_len - min_len, - ip_set_adt_policy, NULL); + ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, ip_set_adt_policy, + NULL); if (ret) { nlmsg_free(skb2); @@ -1592,7 +1622,9 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, use_lineno = !!attr[IPSET_ATTR_LINENO]; if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, + attr[IPSET_ATTR_DATA], + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, use_lineno); @@ -1603,7 +1635,8 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, memset(tb, 0, sizeof(tb)); if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || - nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) + nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, use_lineno); @@ -1675,6 +1708,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; + u32 lineno; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || @@ -1686,11 +1720,12 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (!set) return -ENOENT; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); - ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); + ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0); rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) @@ -1992,7 +2027,7 @@ static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_LIST] = { .call = ip_set_dump, .attr_count = IPSET_ATTR_CMD_MAX, - .policy = ip_set_setname_policy, + .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, @@ -2100,8 +2135,9 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } req_version->version = IPSET_PROTOCOL; - ret = copy_to_user(user, req_version, - sizeof(struct ip_set_req_version)); + if (copy_to_user(user, req_version, + sizeof(struct ip_set_req_version))) + ret = -EFAULT; goto done; } case IP_SET_OP_GET_BYNAME: { @@ -2160,7 +2196,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } /* end of switch(op) */ copy: - ret = copy_to_user(user, data, copylen); + if (copy_to_user(user, data, copylen)) + ret = -EFAULT; done: vfree(data); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 175f8fedcfaf..f970ff99f61b 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -8,13 +8,21 @@ #include <linux/rcupdate.h> #include <linux/jhash.h> #include <linux/types.h> +#include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/ipset/ip_set_timeout.h> -#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) -#define ipset_dereference_protected(p, set) \ - __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock)) - -#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) +#define __ipset_dereference(p) \ + rcu_dereference_protected(p, 1) +#define ipset_dereference_nfnl(p) \ + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) +#define ipset_dereference_set(p, set) \ + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ + lockdep_is_held(&(set)->lock)) +#define ipset_dereference_bh_nfnl(p) \ + rcu_dereference_bh_check(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* Hashing which uses arrays to resolve clashing. The hash table is resized * (doubled) when searching becomes too long. @@ -73,11 +81,35 @@ struct hbucket { __aligned(__alignof__(u64)); }; +/* Region size for locking == 2^HTABLE_REGION_BITS */ +#define HTABLE_REGION_BITS 10 +#define ahash_numof_locks(htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? 1 \ + : jhash_size((htable_bits) - HTABLE_REGION_BITS)) +#define ahash_sizeof_regions(htable_bits) \ + (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) +#define ahash_region(n, htable_bits) \ + ((n) % ahash_numof_locks(htable_bits)) +#define ahash_bucket_start(h, htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? 0 \ + : (h) * jhash_size(HTABLE_REGION_BITS)) +#define ahash_bucket_end(h, htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits) \ + : ((h) + 1) * jhash_size(HTABLE_REGION_BITS)) + +struct htable_gc { + struct delayed_work dwork; + struct ip_set *set; /* Set the gc belongs to */ + u32 region; /* Last gc run position */ +}; + /* The hash table: the table size stored here in order to make resizing easy */ struct htable { atomic_t ref; /* References for resizing */ - atomic_t uref; /* References for dumping */ + atomic_t uref; /* References for dumping and gc */ u8 htable_bits; /* size of hash table == 2^htable_bits */ + u32 maxelem; /* Maxelem per region */ + struct ip_set_region *hregion; /* Region locks and ext sizes */ struct hbucket __rcu *bucket[0]; /* hashtable buckets */ }; @@ -163,6 +195,10 @@ htable_bits(u32 hashsize) #define NLEN 0 #endif /* IP_SET_HASH_WITH_NETS */ +#define SET_ELEM_EXPIRED(set, d) \ + (SET_WITH_TIMEOUT(set) && \ + ip_set_timeout_expired(ext_timeout(d, set))) + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -206,10 +242,12 @@ htable_bits(u32 hashsize) #undef mtype_test_cidrs #undef mtype_test #undef mtype_uref -#undef mtype_expire #undef mtype_resize +#undef mtype_ext_size +#undef mtype_resize_ad #undef mtype_head #undef mtype_list +#undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init #undef mtype_variant @@ -248,10 +286,12 @@ htable_bits(u32 hashsize) #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) #define mtype_test IPSET_TOKEN(MTYPE, _test) #define mtype_uref IPSET_TOKEN(MTYPE, _uref) -#define mtype_expire IPSET_TOKEN(MTYPE, _expire) #define mtype_resize IPSET_TOKEN(MTYPE, _resize) +#define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size) +#define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_list IPSET_TOKEN(MTYPE, _list) +#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) @@ -276,8 +316,7 @@ htable_bits(u32 hashsize) /* The generic hash structure */ struct htype { struct htable __rcu *table; /* the hash table */ - struct timer_list gc; /* garbage collection when timeout enabled */ - struct ip_set *set; /* attached to this ip_set */ + struct htable_gc gc; /* gc workqueue */ u32 maxelem; /* max elements in the hash */ u32 initval; /* random jhash init value */ #ifdef IP_SET_HASH_WITH_MARKMASK @@ -289,21 +328,33 @@ struct htype { #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif + struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ #ifdef IP_SET_HASH_WITH_NETS struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */ #endif }; +/* ADD|DEL entries saved during resize */ +struct mtype_resize_ad { + struct list_head list; + enum ipset_adt ad; /* ADD|DEL element */ + struct mtype_elem d; /* Element value */ + struct ip_set_ext ext; /* Extensions for ADD */ + struct ip_set_ext mext; /* Target extensions for ADD */ + u32 flags; /* Flags for ADD */ +}; + #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different * sized networks. cidr == real cidr + 1 to support /0. */ static void -mtype_add_cidr(struct htype *h, u8 cidr, u8 n) +mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { int i, j; + spin_lock_bh(&set->lock); /* Add in increasing prefix order, so larger cidr first */ for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) { if (j != -1) { @@ -312,7 +363,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n) j = i; } else if (h->nets[i].cidr[n] == cidr) { h->nets[CIDR_POS(cidr)].nets[n]++; - return; + goto unlock; } } if (j != -1) { @@ -321,24 +372,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n) } h->nets[i].cidr[n] = cidr; h->nets[CIDR_POS(cidr)].nets[n] = 1; +unlock: + spin_unlock_bh(&set->lock); } static void -mtype_del_cidr(struct htype *h, u8 cidr, u8 n) +mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { u8 i, j, net_end = NLEN - 1; + spin_lock_bh(&set->lock); for (i = 0; i < NLEN; i++) { if (h->nets[i].cidr[n] != cidr) continue; h->nets[CIDR_POS(cidr)].nets[n]--; if (h->nets[CIDR_POS(cidr)].nets[n] > 0) - return; + goto unlock; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; h->nets[j].cidr[n] = 0; - return; + goto unlock; } +unlock: + spin_unlock_bh(&set->lock); } #endif @@ -346,7 +402,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 n) static size_t mtype_ahash_memsize(const struct htype *h, const struct htable *t) { - return sizeof(*h) + sizeof(*t); + return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits); } /* Get the ith element from the array block n */ @@ -370,24 +426,29 @@ mtype_flush(struct ip_set *set) struct htype *h = set->data; struct htable *t; struct hbucket *n; - u32 i; - - t = ipset_dereference_protected(h->table, set); - for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); - if (!n) - continue; - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set, n); - /* FIXME: use slab cache */ - rcu_assign_pointer(hbucket(t, i), NULL); - kfree_rcu(n, rcu); + u32 r, i; + + t = ipset_dereference_nfnl(h->table); + for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { + spin_lock_bh(&t->hregion[r].lock); + for (i = ahash_bucket_start(r, t->htable_bits); + i < ahash_bucket_end(r, t->htable_bits); i++) { + n = __ipset_dereference(hbucket(t, i)); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); + } + t->hregion[r].ext_size = 0; + t->hregion[r].elements = 0; + spin_unlock_bh(&t->hregion[r].lock); } #ifdef IP_SET_HASH_WITH_NETS memset(h->nets, 0, sizeof(h->nets)); #endif - set->elements = 0; - set->ext_size = 0; } /* Destroy the hashtable part of the set */ @@ -398,7 +459,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); + n = __ipset_dereference(hbucket(t, i)); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) @@ -407,6 +468,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) kfree(n); } + ip_set_free(t->hregion); ip_set_free(t); } @@ -415,28 +477,21 @@ static void mtype_destroy(struct ip_set *set) { struct htype *h = set->data; + struct list_head *l, *lt; if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&h->gc); + cancel_delayed_work_sync(&h->gc.dwork); - mtype_ahash_destroy(set, - __ipset_dereference_protected(h->table, 1), true); + mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); + list_for_each_safe(l, lt, &h->ad) { + list_del(l); + kfree(l); + } kfree(h); set->data = NULL; } -static void -mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) -{ - struct htype *h = set->data; - - timer_setup(&h->gc, gc, 0); - mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); - pr_debug("gc initialized, run in every %u\n", - IPSET_GC_PERIOD(set->timeout)); -} - static bool mtype_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -455,11 +510,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) a->extensions == b->extensions; } -/* Delete expired elements from the hashtable */ static void -mtype_expire(struct ip_set *set, struct htype *h) +mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) { - struct htable *t; struct hbucket *n, *tmp; struct mtype_elem *data; u32 i, j, d; @@ -467,10 +520,12 @@ mtype_expire(struct ip_set *set, struct htype *h) #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif + u8 htable_bits = t->htable_bits; - t = ipset_dereference_protected(h->table, set); - for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); + spin_lock_bh(&t->hregion[r].lock); + for (i = ahash_bucket_start(r, htable_bits); + i < ahash_bucket_end(r, htable_bits); i++) { + n = __ipset_dereference(hbucket(t, i)); if (!n) continue; for (j = 0, d = 0; j < n->pos; j++) { @@ -486,58 +541,100 @@ mtype_expire(struct ip_set *set, struct htype *h) smp_mb__after_atomic(); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) - mtype_del_cidr(h, + mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, k)), k); #endif + t->hregion[r].elements--; ip_set_ext_destroy(set, data); - set->elements--; d++; } if (d >= AHASH_INIT_SIZE) { if (d >= n->size) { + t->hregion[r].ext_size -= + ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, i), NULL); kfree_rcu(n, rcu); continue; } tmp = kzalloc(sizeof(*tmp) + - (n->size - AHASH_INIT_SIZE) * dsize, - GFP_ATOMIC); + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); if (!tmp) - /* Still try to delete expired elements */ + /* Still try to delete expired elements. */ continue; tmp->size = n->size - AHASH_INIT_SIZE; for (j = 0, d = 0; j < n->pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); - memcpy(tmp->value + d * dsize, data, dsize); + memcpy(tmp->value + d * dsize, + data, dsize); set_bit(d, tmp->used); d++; } tmp->pos = d; - set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); + t->hregion[r].ext_size -= + ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, i), tmp); kfree_rcu(n, rcu); } } + spin_unlock_bh(&t->hregion[r].lock); } static void -mtype_gc(struct timer_list *t) +mtype_gc(struct work_struct *work) { - struct htype *h = from_timer(h, t, gc); - struct ip_set *set = h->set; + struct htable_gc *gc; + struct ip_set *set; + struct htype *h; + struct htable *t; + u32 r, numof_locks; + unsigned int next_run; + + gc = container_of(work, struct htable_gc, dwork.work); + set = gc->set; + h = set->data; - pr_debug("called\n"); spin_lock_bh(&set->lock); - mtype_expire(set, h); + t = ipset_dereference_set(h->table, set); + atomic_inc(&t->uref); + numof_locks = ahash_numof_locks(t->htable_bits); + r = gc->region++; + if (r >= numof_locks) { + r = gc->region = 0; + } + next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks; + if (next_run < HZ/10) + next_run = HZ/10; spin_unlock_bh(&set->lock); - h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; - add_timer(&h->gc); + mtype_gc_do(set, h, t, r); + + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by expire: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + + queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run); + +} + +static void +mtype_gc_init(struct htable_gc *gc) +{ + INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc); + queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags); +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags); + /* Resize a hash: create a new hash table with doubling the hashsize * and inserting the elements to it. Repeat until we succeed or * fail due to memory pressures. @@ -548,7 +645,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t extsize, dsize = set->dsize; + size_t dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -556,7 +653,9 @@ mtype_resize(struct ip_set *set, bool retried) struct mtype_elem *data; struct mtype_elem *d; struct hbucket *n, *m; - u32 i, j, key; + struct list_head *l, *lt; + struct mtype_resize_ad *x; + u32 i, j, r, nr, key; int ret; #ifdef IP_SET_HASH_WITH_NETS @@ -564,10 +663,8 @@ mtype_resize(struct ip_set *set, bool retried) if (!tmp) return -ENOMEM; #endif - rcu_read_lock_bh(); - orig = rcu_dereference_bh_nfnl(h->table); + orig = ipset_dereference_bh_nfnl(h->table); htable_bits = orig->htable_bits; - rcu_read_unlock_bh(); retry: ret = 0; @@ -584,88 +681,124 @@ retry: ret = -ENOMEM; goto out; } + t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); + if (!t->hregion) { + ip_set_free(t); + ret = -ENOMEM; + goto out; + } t->htable_bits = htable_bits; + t->maxelem = h->maxelem / ahash_numof_locks(htable_bits); + for (i = 0; i < ahash_numof_locks(htable_bits); i++) + spin_lock_init(&t->hregion[i].lock); - spin_lock_bh(&set->lock); - orig = __ipset_dereference_protected(h->table, 1); - /* There can't be another parallel resizing, but dumping is possible */ + /* There can't be another parallel resizing, + * but dumping, gc, kernel side add/del are possible + */ + orig = ipset_dereference_bh_nfnl(h->table); atomic_set(&orig->ref, 1); atomic_inc(&orig->uref); - extsize = 0; pr_debug("attempt to resize set %s from %u to %u, t %p\n", set->name, orig->htable_bits, htable_bits, orig); - for (i = 0; i < jhash_size(orig->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(orig, i), 1); - if (!n) - continue; - for (j = 0; j < n->pos; j++) { - if (!test_bit(j, n->used)) + for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) { + /* Expire may replace a hbucket with another one */ + rcu_read_lock_bh(); + for (i = ahash_bucket_start(r, orig->htable_bits); + i < ahash_bucket_end(r, orig->htable_bits); i++) { + n = __ipset_dereference(hbucket(orig, i)); + if (!n) continue; - data = ahash_data(n, j, dsize); + for (j = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + if (SET_ELEM_EXPIRED(set, data)) + continue; #ifdef IP_SET_HASH_WITH_NETS - /* We have readers running parallel with us, - * so the live data cannot be modified. - */ - flags = 0; - memcpy(tmp, data, dsize); - data = tmp; - mtype_data_reset_flags(data, &flags); + /* We have readers running parallel with us, + * so the live data cannot be modified. + */ + flags = 0; + memcpy(tmp, data, dsize); + data = tmp; + mtype_data_reset_flags(data, &flags); #endif - key = HKEY(data, h->initval, htable_bits); - m = __ipset_dereference_protected(hbucket(t, key), 1); - if (!m) { - m = kzalloc(sizeof(*m) + + key = HKEY(data, h->initval, htable_bits); + m = __ipset_dereference(hbucket(t, key)); + nr = ahash_region(key, htable_bits); + if (!m) { + m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, GFP_ATOMIC); - if (!m) { - ret = -ENOMEM; - goto cleanup; - } - m->size = AHASH_INIT_SIZE; - extsize += ext_size(AHASH_INIT_SIZE, dsize); - RCU_INIT_POINTER(hbucket(t, key), m); - } else if (m->pos >= m->size) { - struct hbucket *ht; - - if (m->size >= AHASH_MAX(h)) { - ret = -EAGAIN; - } else { - ht = kzalloc(sizeof(*ht) + + if (!m) { + ret = -ENOMEM; + goto cleanup; + } + m->size = AHASH_INIT_SIZE; + t->hregion[nr].ext_size += + ext_size(AHASH_INIT_SIZE, + dsize); + RCU_INIT_POINTER(hbucket(t, key), m); + } else if (m->pos >= m->size) { + struct hbucket *ht; + + if (m->size >= AHASH_MAX(h)) { + ret = -EAGAIN; + } else { + ht = kzalloc(sizeof(*ht) + (m->size + AHASH_INIT_SIZE) * dsize, GFP_ATOMIC); - if (!ht) - ret = -ENOMEM; + if (!ht) + ret = -ENOMEM; + } + if (ret < 0) + goto cleanup; + memcpy(ht, m, sizeof(struct hbucket) + + m->size * dsize); + ht->size = m->size + AHASH_INIT_SIZE; + t->hregion[nr].ext_size += + ext_size(AHASH_INIT_SIZE, + dsize); + kfree(m); + m = ht; + RCU_INIT_POINTER(hbucket(t, key), ht); } - if (ret < 0) - goto cleanup; - memcpy(ht, m, sizeof(struct hbucket) + - m->size * dsize); - ht->size = m->size + AHASH_INIT_SIZE; - extsize += ext_size(AHASH_INIT_SIZE, dsize); - kfree(m); - m = ht; - RCU_INIT_POINTER(hbucket(t, key), ht); - } - d = ahash_data(m, m->pos, dsize); - memcpy(d, data, dsize); - set_bit(m->pos++, m->used); + d = ahash_data(m, m->pos, dsize); + memcpy(d, data, dsize); + set_bit(m->pos++, m->used); + t->hregion[nr].elements++; #ifdef IP_SET_HASH_WITH_NETS - mtype_data_reset_flags(d, &flags); + mtype_data_reset_flags(d, &flags); #endif + } } + rcu_read_unlock_bh(); } - rcu_assign_pointer(h->table, t); - set->ext_size = extsize; - spin_unlock_bh(&set->lock); + /* There can't be any other writer. */ + rcu_assign_pointer(h->table, t); /* Give time to other readers of the set */ synchronize_rcu(); pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); - /* If there's nobody else dumping the table, destroy it */ + /* Add/delete elements processed by the SET target during resize. + * Kernel-side add cannot trigger a resize and userspace actions + * are serialized by the mutex. + */ + list_for_each_safe(l, lt, &h->ad) { + x = list_entry(l, struct mtype_resize_ad, list); + if (x->ad == IPSET_ADD) { + mtype_add(set, &x->d, &x->ext, &x->mext, x->flags); + } else { + mtype_del(set, &x->d, NULL, NULL, 0); + } + list_del(l); + kfree(l); + } + /* If there's nobody else using the table, destroy it */ if (atomic_dec_and_test(&orig->uref)) { pr_debug("Table destroy by resize %p\n", orig); mtype_ahash_destroy(set, orig, false); @@ -678,15 +811,44 @@ out: return ret; cleanup: + rcu_read_unlock_bh(); atomic_set(&orig->ref, 0); atomic_dec(&orig->uref); - spin_unlock_bh(&set->lock); mtype_ahash_destroy(set, t, false); if (ret == -EAGAIN) goto retry; goto out; } +/* Get the current number of elements and ext_size in the set */ +static void +mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) +{ + struct htype *h = set->data; + const struct htable *t; + u32 i, j, r; + struct hbucket *n; + struct mtype_elem *data; + + t = rcu_dereference_bh(h->table); + for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { + for (i = ahash_bucket_start(r, t->htable_bits); + i < ahash_bucket_end(r, t->htable_bits); i++) { + n = rcu_dereference_bh(hbucket(t, i)); + if (!n) + continue; + for (j = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, set->dsize); + if (!SET_ELEM_EXPIRED(set, data)) + (*elements)++; + } + } + *ext_size += t->hregion[r].ext_size; + } +} + /* Add an element to a hash and update the internal counters when succeeded, * otherwise report the proper error code. */ @@ -699,32 +861,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n, *old = ERR_PTR(-ENOENT); - int i, j = -1; + int i, j = -1, ret; bool flag_exist = flags & IPSET_FLAG_EXIST; bool deleted = false, forceadd = false, reuse = false; - u32 key, multi = 0; + u32 r, key, multi = 0, elements, maxelem; - if (set->elements >= h->maxelem) { - if (SET_WITH_TIMEOUT(set)) - /* FIXME: when set is full, we slow down here */ - mtype_expire(set, h); - if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set)) + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + r = ahash_region(key, t->htable_bits); + atomic_inc(&t->uref); + elements = t->hregion[r].elements; + maxelem = t->maxelem; + if (elements >= maxelem) { + u32 e; + if (SET_WITH_TIMEOUT(set)) { + rcu_read_unlock_bh(); + mtype_gc_do(set, h, t, r); + rcu_read_lock_bh(); + } + maxelem = h->maxelem; + elements = 0; + for (e = 0; e < ahash_numof_locks(t->htable_bits); e++) + elements += t->hregion[e].elements; + if (elements >= maxelem && SET_WITH_FORCEADD(set)) forceadd = true; } + rcu_read_unlock_bh(); - t = ipset_dereference_protected(h->table, set); - key = HKEY(value, h->initval, t->htable_bits); - n = __ipset_dereference_protected(hbucket(t, key), 1); + spin_lock_bh(&t->hregion[r].lock); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) { - if (forceadd || set->elements >= h->maxelem) + if (forceadd || elements >= maxelem) goto set_full; old = NULL; n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, GFP_ATOMIC); - if (!n) - return -ENOMEM; + if (!n) { + ret = -ENOMEM; + goto unlock; + } n->size = AHASH_INIT_SIZE; - set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); + t->hregion[r].ext_size += + ext_size(AHASH_INIT_SIZE, set->dsize); goto copy_elem; } for (i = 0; i < n->pos; i++) { @@ -738,38 +917,37 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, } data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi)) { - if (flag_exist || - (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set)))) { + if (flag_exist || SET_ELEM_EXPIRED(set, data)) { /* Just the extensions could be overwritten */ j = i; goto overwrite_extensions; } - return -IPSET_ERR_EXIST; + ret = -IPSET_ERR_EXIST; + goto unlock; } /* Reuse first timed out entry */ - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set)) && - j == -1) { + if (SET_ELEM_EXPIRED(set, data) && j == -1) { j = i; reuse = true; } } if (reuse || forceadd) { + if (j == -1) + j = 0; data = ahash_data(n, j, set->dsize); if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_del_cidr(h, + mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, i)), i); #endif ip_set_ext_destroy(set, data); - set->elements--; + t->hregion[r].elements--; } goto copy_data; } - if (set->elements >= h->maxelem) + if (elements >= maxelem) goto set_full; /* Create a new slot */ if (n->pos >= n->size) { @@ -777,28 +955,32 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); - return -EAGAIN; + ret = -EAGAIN; + goto resize; } old = n; n = kzalloc(sizeof(*n) + (old->size + AHASH_INIT_SIZE) * set->dsize, GFP_ATOMIC); - if (!n) - return -ENOMEM; + if (!n) { + ret = -ENOMEM; + goto unlock; + } memcpy(n, old, sizeof(struct hbucket) + old->size * set->dsize); n->size = old->size + AHASH_INIT_SIZE; - set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); + t->hregion[r].ext_size += + ext_size(AHASH_INIT_SIZE, set->dsize); } copy_elem: j = n->pos++; data = ahash_data(n, j, set->dsize); copy_data: - set->elements++; + t->hregion[r].elements++; #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); + mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); #endif memcpy(data, d, sizeof(struct mtype_elem)); overwrite_extensions: @@ -821,13 +1003,41 @@ overwrite_extensions: if (old) kfree_rcu(old, rcu); } + ret = 0; +resize: + spin_unlock_bh(&t->hregion[r].lock); + if (atomic_read(&t->ref) && ext->target) { + /* Resize is in process and kernel side add, save values */ + struct mtype_resize_ad *x; + + x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC); + if (!x) + /* Don't bother */ + goto out; + x->ad = IPSET_ADD; + memcpy(&x->d, value, sizeof(struct mtype_elem)); + memcpy(&x->ext, ext, sizeof(struct ip_set_ext)); + memcpy(&x->mext, mext, sizeof(struct ip_set_ext)); + x->flags = flags; + spin_lock_bh(&set->lock); + list_add_tail(&x->list, &h->ad); + spin_unlock_bh(&set->lock); + } + goto out; - return 0; set_full: if (net_ratelimit()) pr_warn("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); - return -IPSET_ERR_HASH_FULL; + set->name, maxelem); + ret = -IPSET_ERR_HASH_FULL; +unlock: + spin_unlock_bh(&t->hregion[r].lock); +out: + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by add: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + return ret; } /* Delete an element from the hash and free up space if possible. @@ -841,13 +1051,23 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n; - int i, j, k, ret = -IPSET_ERR_EXIST; + struct mtype_resize_ad *x = NULL; + int i, j, k, r, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; size_t dsize = set->dsize; - t = ipset_dereference_protected(h->table, set); + /* Userspace add and resize is excluded by the mutex. + * Kernespace add does not trigger resize. + */ + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - n = __ipset_dereference_protected(hbucket(t, key), 1); + r = ahash_region(key, t->htable_bits); + atomic_inc(&t->uref); + rcu_read_unlock_bh(); + + spin_lock_bh(&t->hregion[r].lock); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) goto out; for (i = 0, k = 0; i < n->pos; i++) { @@ -858,8 +1078,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, data = ahash_data(n, i, dsize); if (!mtype_data_equal(data, d, &multi)) continue; - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set))) + if (SET_ELEM_EXPIRED(set, data)) goto out; ret = 0; @@ -867,20 +1086,33 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, smp_mb__after_atomic(); if (i + 1 == n->pos) n->pos--; - set->elements--; + t->hregion[r].elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) - mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), - j); + mtype_del_cidr(set, h, + NCIDR_PUT(DCIDR_GET(d->cidr, j)), j); #endif ip_set_ext_destroy(set, data); + if (atomic_read(&t->ref) && ext->target) { + /* Resize is in process and kernel side del, + * save values + */ + x = kzalloc(sizeof(struct mtype_resize_ad), + GFP_ATOMIC); + if (x) { + x->ad = IPSET_DEL; + memcpy(&x->d, value, + sizeof(struct mtype_elem)); + x->flags = flags; + } + } for (; i < n->pos; i++) { if (!test_bit(i, n->used)) k++; } if (n->pos == 0 && k == 0) { - set->ext_size -= ext_size(n->size, dsize); + t->hregion[r].ext_size -= ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, key), NULL); kfree_rcu(n, rcu); } else if (k >= AHASH_INIT_SIZE) { @@ -899,7 +1131,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, k++; } tmp->pos = k; - set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); + t->hregion[r].ext_size -= + ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, key), tmp); kfree_rcu(n, rcu); } @@ -907,6 +1140,16 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, } out: + spin_unlock_bh(&t->hregion[r].lock); + if (x) { + spin_lock_bh(&set->lock); + list_add(&x->list, &h->ad); + spin_unlock_bh(&set->lock); + } + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by del: %p\n", t); + mtype_ahash_destroy(set, t, false); + } return ret; } @@ -992,6 +1235,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, int i, ret = 0; u32 key, multi = 0; + rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test an IP address and not a network address, @@ -1023,6 +1267,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, goto out; } out: + rcu_read_unlock_bh(); return ret; } @@ -1034,23 +1279,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) const struct htable *t; struct nlattr *nested; size_t memsize; + u32 elements = 0; + size_t ext_size = 0; u8 htable_bits; - /* If any members have expired, set->elements will be wrong - * mytype_expire function will update it with the right count. - * we do not hold set->lock here, so grab it first. - * set->elements can still be incorrect in the case of a huge set, - * because elements might time out during the listing. - */ - if (SET_WITH_TIMEOUT(set)) { - spin_lock_bh(&set->lock); - mtype_expire(set, h); - spin_unlock_bh(&set->lock); - } - rcu_read_lock_bh(); - t = rcu_dereference_bh_nfnl(h->table); - memsize = mtype_ahash_memsize(h, t) + set->ext_size; + t = rcu_dereference_bh(h->table); + mtype_ext_size(set, &elements, &ext_size); + memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size; htable_bits = t->htable_bits; rcu_read_unlock_bh(); @@ -1072,7 +1308,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) #endif if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || - nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) + nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -1092,15 +1328,15 @@ mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start) if (start) { rcu_read_lock_bh(); - t = rcu_dereference_bh_nfnl(h->table); + t = ipset_dereference_bh_nfnl(h->table); atomic_inc(&t->uref); cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; rcu_read_unlock_bh(); } else if (cb->args[IPSET_CB_PRIVATE]) { t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { - /* Resizing didn't destroy the hash table */ - pr_debug("Table destroy by dump: %p\n", t); + pr_debug("Table destroy after resize " + " by dump: %p\n", t); mtype_ahash_destroy(set, t, false); } cb->args[IPSET_CB_PRIVATE] = 0; @@ -1142,8 +1378,7 @@ mtype_list(const struct ip_set *set, if (!test_bit(i, n->used)) continue; e = ahash_data(n, i, set->dsize); - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + if (SET_ELEM_EXPIRED(set, e)) continue; pr_debug("list hash %lu hbucket %p i %u, data %p\n", cb->args[IPSET_CB_ARG0], n, i, e); @@ -1209,6 +1444,7 @@ static const struct ip_set_type_variant mtype_variant = { .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, + .region_lock = true, }; #ifdef IP_SET_EMIT_CREATE @@ -1227,6 +1463,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, size_t hsize; struct htype *h; struct htable *t; + u32 i; pr_debug("Create set %s with family %s\n", set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); @@ -1295,6 +1532,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, kfree(h); return -ENOMEM; } + t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits)); + if (!t->hregion) { + ip_set_free(t); + kfree(h); + return -ENOMEM; + } + h->gc.set = set; + for (i = 0; i < ahash_numof_locks(hbits); i++) + spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; #ifdef IP_SET_HASH_WITH_NETMASK h->netmask = netmask; @@ -1305,9 +1551,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, get_random_bytes(&h->initval, sizeof(h->initval)); t->htable_bits = hbits; + t->maxelem = h->maxelem / ahash_numof_locks(hbits); RCU_INIT_POINTER(h->table, t); - h->set = set; + INIT_LIST_HEAD(&h->ad); set->data = h; #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { @@ -1330,12 +1577,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) #endif - IPSET_TOKEN(HTYPE, 4_gc_init)(set, - IPSET_TOKEN(HTYPE, 4_gc)); + IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc); #ifndef IP_SET_PROTO_UNDEF else - IPSET_TOKEN(HTYPE, 6_gc_init)(set, - IPSET_TOKEN(HTYPE, 6_gc)); + IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc); #endif } pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 24d8f4df4230..4ce563eb927d 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -209,7 +209,7 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - if (opt->flags & IPSET_DIM_ONE_SRC) + if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 06c91e49bf25..471e69928e8e 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -369,6 +369,7 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 832e4f5491cb..975ba1892ae4 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -476,6 +476,7 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 8ada318bf09d..eb9a8d97e65a 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -60,7 +60,7 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, /* Don't lookup sub-counters at all */ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) - opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; + opt->cmdflags |= IPSET_FLAG_SKIP_COUNTER_UPDATE; list_for_each_entry_rcu(e, &map->members, list) { ret = ip_set_test(e->id, skb, par, opt); if (ret <= 0) diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index bfd4365a8d73..fce55eacf583 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -193,21 +193,29 @@ struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app * mutex_lock(&__ip_vs_app_mutex); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) { + err = -ENOENT; + goto out_unlock; + } + list_for_each_entry(a, &ipvs->app_list, a_list) { if (!strcmp(app->name, a->name)) { err = -EEXIST; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } } a = kmemdup(app, sizeof(*app), GFP_KERNEL); if (!a) { err = -ENOMEM; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } INIT_LIST_HEAD(&a->incs_list); list_add(&a->a_list, &ipvs->app_list); - /* increase the module use count */ - ip_vs_use_count_inc(); out_unlock: mutex_unlock(&__ip_vs_app_mutex); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index d5103a9eb302..c85752f8bd1a 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1955,14 +1955,14 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { - bool uses_ct = false, resched = false; + bool old_ct = false, resched = false; if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && unlikely(!atomic_read(&cp->dest->weight))) { resched = true; - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); } else if (is_new_conn_expected(cp, conn_reuse_mode)) { - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); if (!atomic_read(&cp->n_control)) { resched = true; } else { @@ -1970,15 +1970,17 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int * that uses conntrack while it is still * referenced by controlled connection(s). */ - resched = !uses_ct; + resched = !old_ct; } } if (resched) { + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; if (!atomic_read(&cp->n_control)) ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); - if (uses_ct) + if (old_ct) return NF_DROP; cp = NULL; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 741d91aa4a8d..2e3b59d79ffb 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -93,7 +93,6 @@ static bool __ip_vs_addr_is_local_v6(struct net *net, static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; - static int old_secure_tcp = 0; int availmem; int nomem; int to_change = -1; @@ -174,35 +173,35 @@ static void update_defense_level(struct netns_ipvs *ipvs) spin_lock(&ipvs->securetcp_lock); switch (ipvs->sysctl_secure_tcp) { case 0: - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; break; case 1: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; ipvs->sysctl_secure_tcp = 2; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; } break; case 2: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; ipvs->sysctl_secure_tcp = 1; } break; case 3: - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; break; } - old_secure_tcp = ipvs->sysctl_secure_tcp; + ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp > 1); @@ -1217,7 +1216,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service *svc = NULL; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; /* Lookup the scheduler by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { @@ -2377,9 +2377,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (copy_from_user(arg, user, len) != 0) return -EFAULT; - /* increase the module use count */ - ip_vs_use_count_inc(); - /* Handle daemons since they have another lock */ if (cmd == IP_VS_SO_SET_STARTDAEMON || cmd == IP_VS_SO_SET_STOPDAEMON) { @@ -2392,13 +2389,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, sizeof(cfg.mcast_ifn)) <= 0) - goto out_dec; + return ret; cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { ret = stop_sync_thread(ipvs, dm->state); } - goto out_dec; + return ret; } mutex_lock(&__ip_vs_mutex); @@ -2493,10 +2490,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) out_unlock: mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - return ret; } diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 8e104dff7abc..166c669f0763 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -68,7 +68,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) struct ip_vs_pe *tmp; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 2f9d5cd5daee..d4903723be7e 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -179,7 +179,8 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) } /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_sched_mutex); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index a4a78c4b06de..0c1bc654245c 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1717,6 +1717,8 @@ static int sync_thread_backup(void *data) { struct ip_vs_sync_thread_data *tinfo = data; struct netns_ipvs *ipvs = tinfo->ipvs; + struct sock *sk = tinfo->sock->sk; + struct udp_sock *up = udp_sk(sk); int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " @@ -1724,12 +1726,14 @@ static int sync_thread_backup(void *data) ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); while (!kthread_should_stop()) { - wait_event_interruptible(*sk_sleep(tinfo->sock->sk), - !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) - || kthread_should_stop()); + wait_event_interruptible(*sk_sleep(sk), + !skb_queue_empty_lockless(&sk->sk_receive_queue) || + !skb_queue_empty_lockless(&up->reader_queue) || + kthread_should_stop()); /* do we have data now? */ - while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { + while (!skb_queue_empty_lockless(&sk->sk_receive_queue) || + !skb_queue_empty_lockless(&up->reader_queue)) { len = ip_vs_receive(tinfo->sock, tinfo->buf, ipvs->bcfg.sync_maxlen); if (len <= 0) { @@ -1762,6 +1766,10 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; + /* Do not hold one mutex and then to block on another */ for (;;) { rtnl_lock(); @@ -1892,9 +1900,6 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); - /* increase the module use count */ - ip_vs_use_count_inc(); - return 0; out: @@ -1924,11 +1929,17 @@ out: } kfree(ti); } + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; out_early: mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index e101eda05d55..0ee2c15b049b 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -206,7 +206,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) struct rtable *ort = skb_rtable(skb); if (!skb->dev && sk && sk_fullsock(sk)) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true); } static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e343a030ec26..63618729d84d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1381,9 +1381,9 @@ __nf_conntrack_alloc(struct net *net, ct->status = 0; ct->timeout = 0; write_pnet(&ct->ct_net, net); - memset(&ct->__nfct_init_offset[0], 0, + memset(&ct->__nfct_init_offset, 0, offsetof(struct nf_conn, proto) - - offsetof(struct nf_conn, __nfct_init_offset[0])); + offsetof(struct nf_conn, __nfct_init_offset)); nf_ct_zone_add(ct, zone); @@ -1792,8 +1792,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (nf_ct_is_confirmed(ct)) extra_jiffies += nfct_time_stamp; - if (ct->timeout != extra_jiffies) - ct->timeout = extra_jiffies; + if (READ_ONCE(ct->timeout) != extra_jiffies) + WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); @@ -1881,22 +1881,18 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) nf_conntrack_get(skb_nfct(nskb)); } -static int nf_conntrack_update(struct net *net, struct sk_buff *skb) +static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - enum ip_conntrack_info ctinfo; struct nf_nat_hook *nat_hook; unsigned int status; - struct nf_conn *ct; int dataoff; u16 l3num; u8 l4num; - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_confirmed(ct)) - return 0; - l3num = nf_ct_l3num(ct); dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); @@ -1953,6 +1949,78 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; } +/* This packet is coming from userspace via nf_queue, complete the packet + * processing after the helper invocation in nf_confirm(). + */ +static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + int protoff; + + help = nfct_help(ct); + if (!help) + return 0; + + helper = rcu_dereference(help->helper); + if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) + return 0; + + switch (nf_ct_l3num(ct)) { + case NFPROTO_IPV4: + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; +#if IS_ENABLED(CONFIG_IPV6) + case NFPROTO_IPV6: { + __be16 frag_off; + u8 pnum; + + pnum = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + return 0; + break; + } +#endif + default: + return 0; + } + + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return -1; + } + } + + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; +} + +static int nf_conntrack_update(struct net *net, struct sk_buff *skb) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + int err; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + if (!nf_ct_is_confirmed(ct)) { + err = __nf_conntrack_update(net, skb, ct, ctinfo); + if (err < 0) + return err; + + ct = nf_ct_get(skb, &ctinfo); + } + + return nf_confirm_cthelper(skb, ct, ctinfo); +} + static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1b77444d5b52..47eb42649639 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3599,6 +3599,9 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) list_for_each_entry(net, net_exit_list, exit_list) ctnetlink_net_exit(net); + + /* wait for other cpus until they are done with ctnl_notifiers */ + synchronize_rcu(); } static struct pernet_operations ctnetlink_net_ops = { diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index b22042ad0fca..584c606e7fe0 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -72,24 +72,32 @@ EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) /* PptpControlMessageType names */ -const char *const pptp_msg_name[] = { - "UNKNOWN_MESSAGE", - "START_SESSION_REQUEST", - "START_SESSION_REPLY", - "STOP_SESSION_REQUEST", - "STOP_SESSION_REPLY", - "ECHO_REQUEST", - "ECHO_REPLY", - "OUT_CALL_REQUEST", - "OUT_CALL_REPLY", - "IN_CALL_REQUEST", - "IN_CALL_REPLY", - "IN_CALL_CONNECT", - "CALL_CLEAR_REQUEST", - "CALL_DISCONNECT_NOTIFY", - "WAN_ERROR_NOTIFY", - "SET_LINK_INFO" +static const char *const pptp_msg_name_array[PPTP_MSG_MAX + 1] = { + [0] = "UNKNOWN_MESSAGE", + [PPTP_START_SESSION_REQUEST] = "START_SESSION_REQUEST", + [PPTP_START_SESSION_REPLY] = "START_SESSION_REPLY", + [PPTP_STOP_SESSION_REQUEST] = "STOP_SESSION_REQUEST", + [PPTP_STOP_SESSION_REPLY] = "STOP_SESSION_REPLY", + [PPTP_ECHO_REQUEST] = "ECHO_REQUEST", + [PPTP_ECHO_REPLY] = "ECHO_REPLY", + [PPTP_OUT_CALL_REQUEST] = "OUT_CALL_REQUEST", + [PPTP_OUT_CALL_REPLY] = "OUT_CALL_REPLY", + [PPTP_IN_CALL_REQUEST] = "IN_CALL_REQUEST", + [PPTP_IN_CALL_REPLY] = "IN_CALL_REPLY", + [PPTP_IN_CALL_CONNECT] = "IN_CALL_CONNECT", + [PPTP_CALL_CLEAR_REQUEST] = "CALL_CLEAR_REQUEST", + [PPTP_CALL_DISCONNECT_NOTIFY] = "CALL_DISCONNECT_NOTIFY", + [PPTP_WAN_ERROR_NOTIFY] = "WAN_ERROR_NOTIFY", + [PPTP_SET_LINK_INFO] = "SET_LINK_INFO" }; + +const char *pptp_msg_name(u_int16_t msg) +{ + if (msg > PPTP_MSG_MAX) + return pptp_msg_name_array[0]; + + return pptp_msg_name_array[msg]; +} EXPORT_SYMBOL(pptp_msg_name); #endif @@ -276,7 +284,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; msg = ntohs(ctlh->messageType); - pr_debug("inbound control message %s\n", pptp_msg_name[msg]); + pr_debug("inbound control message %s\n", pptp_msg_name(msg)); switch (msg) { case PPTP_START_SESSION_REPLY: @@ -311,7 +319,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, pcid = pptpReq->ocack.peersCallID; if (info->pns_call_id != pcid) goto invalid; - pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], + pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name(msg), ntohs(cid), ntohs(pcid)); if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) { @@ -328,7 +336,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, goto invalid; cid = pptpReq->icreq.callID; - pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid)); info->cstate = PPTP_CALL_IN_REQ; info->pac_call_id = cid; break; @@ -347,7 +355,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, if (info->pns_call_id != pcid) goto invalid; - pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); + pr_debug("%s, PCID=%X\n", pptp_msg_name(msg), ntohs(pcid)); info->cstate = PPTP_CALL_IN_CONF; /* we expect a GRE connection from PAC to PNS */ @@ -357,7 +365,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, case PPTP_CALL_DISCONNECT_NOTIFY: /* server confirms disconnect */ cid = pptpReq->disc.callID; - pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid)); info->cstate = PPTP_CALL_NONE; /* untrack this call id, unexpect GRE packets */ @@ -384,7 +392,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, invalid: pr_debug("invalid %s: type=%d cid=%u pcid=%u " "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + pptp_msg_name(msg), msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, ntohs(info->pns_call_id), ntohs(info->pac_call_id)); return NF_ACCEPT; @@ -404,7 +412,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; msg = ntohs(ctlh->messageType); - pr_debug("outbound control message %s\n", pptp_msg_name[msg]); + pr_debug("outbound control message %s\n", pptp_msg_name(msg)); switch (msg) { case PPTP_START_SESSION_REQUEST: @@ -426,7 +434,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, info->cstate = PPTP_CALL_OUT_REQ; /* track PNS call id */ cid = pptpReq->ocreq.callID; - pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid)); info->pns_call_id = cid; break; @@ -440,7 +448,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, pcid = pptpReq->icack.peersCallID; if (info->pac_call_id != pcid) goto invalid; - pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], + pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name(msg), ntohs(cid), ntohs(pcid)); if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) { @@ -480,7 +488,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, invalid: pr_debug("invalid %s: type=%d cid=%u pcid=%u " "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + pptp_msg_name(msg), msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, ntohs(info->pns_call_id), ntohs(info->pac_call_id)); return NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index b6b14db3955b..b3f4a334f9d7 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -677,6 +677,9 @@ static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; int i; + if (!timeouts) + timeouts = dn->dccp_timeout; + /* set default DCCP timeouts. */ for (i=0; i<CT_DCCP_MAX; i++) timeouts[i] = dn->dccp_timeout[i]; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 522c08c23600..8ddcd5100638 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -114,7 +114,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, @@ -130,7 +130,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, @@ -316,7 +316,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } - ct->proto.sctp.state = new_state; + ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; @@ -594,6 +594,9 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; + if (!timeouts) + timeouts = sn->timeouts; + /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0006503d2da9..199e00cc6ec3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -411,7 +411,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } - + (*pos)++; return NULL; } diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 55106bebf2b5..750fd60dad6f 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -202,6 +202,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); @@ -218,7 +220,6 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index b9e7dd6e60ce..e92aa6b7eb80 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -189,6 +189,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; + iph = ip_hdr(skb); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v4.s_addr = iph->saddr; @@ -449,6 +450,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; + ip6h = ipv6_hdr(skb); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v6 = ip6h->saddr; diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 83a24cc5753b..28d3313db8f9 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -68,15 +68,13 @@ static bool udp_manip_pkt(struct sk_buff *skb, enum nf_nat_manip_type maniptype) { struct udphdr *hdr; - bool do_csum; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); - do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL; + __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check); - __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, do_csum); return true; } @@ -233,6 +231,19 @@ icmp_manip_pkt(struct sk_buff *skb, return false; hdr = (struct icmphdr *)(skb->data + hdroff); + switch (hdr->type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMP_INFO_REQUEST: + case ICMP_INFO_REPLY: + case ICMP_ADDRESS: + case ICMP_ADDRESSREPLY: + break; + default: + return true; + } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; @@ -1022,8 +1033,8 @@ int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops) ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); if (ret) - nf_nat_ipv6_unregister_fn(net, ops); - + nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, + ARRAY_SIZE(nf_nat_ipv6_ops)); return ret; } EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 2c440015ff0c..3d8aca9ac06d 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -190,7 +190,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, goto err; } - if (!skb_dst_force(skb) && state->hook != NF_INET_PRE_ROUTING) { + if (skb_dst(skb) && !skb_dst_force(skb)) { status = -ENETDOWN; goto err; } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 8ce74ed985c0..969efcc725cb 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -270,7 +270,7 @@ static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu + 1; return per_cpu_ptr(snet->stats, cpu); } - + (*pos)++; return NULL; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4d8714d78192..dd8d4bb1145c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -21,6 +21,8 @@ #include <net/net_namespace.h> #include <net/sock.h> +#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) + static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); @@ -485,46 +487,70 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; static const struct nft_chain_type * +__nft_chain_type_get(u8 family, enum nft_chain_types type) +{ + if (family >= NFPROTO_NUMPROTO || + type >= NFT_CHAIN_T_MAX) + return NULL; + + return chain_type[family][type]; +} + +static const struct nft_chain_type * __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { + const struct nft_chain_type *type; int i; for (i = 0; i < NFT_CHAIN_T_MAX; i++) { - if (chain_type[family][i] != NULL && - !nla_strcmp(nla, chain_type[family][i]->name)) - return chain_type[family][i]; + type = __nft_chain_type_get(family, i); + if (!type) + continue; + if (!nla_strcmp(nla, type->name)) + return type; } return NULL; } -/* - * Loading a module requires dropping mutex that guards the - * transaction. - * We first need to abort any pending transactions as once - * mutex is unlocked a different client could start a new - * transaction. It must not see any 'future generation' - * changes * as these changes will never happen. - */ -#ifdef CONFIG_MODULES -static int __nf_tables_abort(struct net *net); +struct nft_module_request { + struct list_head list; + char module[MODULE_NAME_LEN]; + bool done; +}; -static void nft_request_module(struct net *net, const char *fmt, ...) +#ifdef CONFIG_MODULES +static int nft_request_module(struct net *net, const char *fmt, ...) { char module_name[MODULE_NAME_LEN]; + struct nft_module_request *req; va_list args; int ret; - __nf_tables_abort(net); - va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); - if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret)) - return; + if (ret >= MODULE_NAME_LEN) + return 0; - mutex_unlock(&net->nft.commit_mutex); - request_module("%s", module_name); - mutex_lock(&net->nft.commit_mutex); + list_for_each_entry(req, &net->nft.module_list, list) { + if (!strcmp(req->module, module_name)) { + if (req->done) + return 0; + + /* A request to load this module already exists. */ + return -EAGAIN; + } + } + + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->done = false; + strlcpy(req->module, module_name, MODULE_NAME_LEN); + list_add_tail(&req->list, &net->nft.module_list); + + return -EAGAIN; } #endif @@ -548,10 +574,9 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { - nft_request_module(net, "nft-chain-%u-%.*s", family, - nla_len(nla), (const char *)nla_data(nla)); - type = __nf_tables_chain_type_lookup(nla, family); - if (type != NULL) + if (nft_request_module(net, "nft-chain-%u-%.*s", family, + nla_len(nla), + (const char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -976,12 +1001,18 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) { + if (!nft_is_active_next(ctx->net, flowtable)) + continue; + err = nft_delflowtable(ctx, flowtable); if (err < 0) goto out; } list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { + if (!nft_is_active_next(ctx->net, obj)) + continue; + err = nft_delobj(ctx, obj); if (err < 0) goto out; @@ -1084,11 +1115,8 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) void nft_register_chain_type(const struct nft_chain_type *ctype) { - if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO)) - return; - nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) { + if (WARN_ON(__nft_chain_type_get(ctype->family, ctype->type))) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); return; } @@ -1172,7 +1200,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, - [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, + [NFTA_CHAIN_TYPE] = { .type = NLA_STRING, + .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, }; @@ -1545,7 +1574,10 @@ static int nft_chain_parse_hook(struct net *net, hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - type = chain_type[family][NFT_CHAIN_T_DEFAULT]; + type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); + if (!type) + return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_TYPE]) { type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], family, autoload); @@ -2031,6 +2063,18 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, return NULL; } +#ifdef CONFIG_MODULES +static int nft_expr_type_request_module(struct net *net, u8 family, + struct nlattr *nla) +{ + if (nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)) == -EAGAIN) + return -EAGAIN; + + return 0; +} +#endif + static const struct nft_expr_type *nft_expr_type_get(struct net *net, u8 family, struct nlattr *nla) @@ -2047,14 +2091,12 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nft-expr-%u-%.*s", family, - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_expr_type_request_module(net, family, nla) == -EAGAIN) return ERR_PTR(-EAGAIN); - nft_request_module(net, "nft-expr-%.*s", - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_request_module(net, "nft-expr-%.*s", + nla_len(nla), + (char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -2062,7 +2104,8 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, } static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { - [NFTA_EXPR_NAME] = { .type = NLA_STRING }, + [NFTA_EXPR_NAME] = { .type = NLA_STRING, + .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_EXPR_DATA] = { .type = NLA_NESTED }, }; @@ -2142,6 +2185,13 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, (const struct nlattr * const *)info->tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); +#ifdef CONFIG_MODULES + if (err == -EAGAIN) + if (nft_expr_type_request_module(ctx->net, + ctx->family, + tb[NFTA_EXPR_NAME]) != -EAGAIN) + err = -ENOENT; +#endif goto err1; } } else @@ -2967,8 +3017,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { - nft_request_module(ctx->net, "nft-set"); - if (!list_empty(&nf_tables_set_types)) + if (nft_request_module(ctx->net, "nft-set") == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -3519,7 +3568,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) - return -EINVAL; + return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == (NFT_SET_MAP | NFT_SET_OBJECT)) @@ -3557,7 +3606,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); if (objtype == NFT_OBJECT_UNSPEC || objtype > NFT_OBJECT_MAX) - return -EINVAL; + return -EOPNOTSUPP; } else if (flags & NFT_SET_OBJECT) return -EINVAL; else @@ -3888,7 +3937,8 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, - [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING }, + [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, + .len = NFT_OBJ_MAXNAMELEN - 1 }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -4209,8 +4259,10 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { + nft_data_release(&elem.key.val, desc.type); return err; + } priv = set->ops->get(ctx->net, set, &elem, flags); if (IS_ERR(priv)) @@ -4443,14 +4495,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_DATA] == NULL && !(flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; - if (nla[NFTA_SET_ELEM_DATA] != NULL && - flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; } else { if (nla[NFTA_SET_ELEM_DATA] != NULL) return -EINVAL; } + if ((flags & NFT_SET_ELEM_INTERVAL_END) && + (nla[NFTA_SET_ELEM_DATA] || + nla[NFTA_SET_ELEM_OBJREF] || + nla[NFTA_SET_ELEM_TIMEOUT] || + nla[NFTA_SET_ELEM_EXPIRATION] || + nla[NFTA_SET_ELEM_USERDATA] || + nla[NFTA_SET_ELEM_EXPR])) + return -EINVAL; + timeout = 0; if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) @@ -5071,8 +5129,7 @@ nft_obj_type_get(struct net *net, u32 objtype) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nft-obj-%u", objtype); - if (__nft_obj_type_get(objtype)) + if (nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -5634,8 +5691,7 @@ nft_flowtable_type_get(struct net *net, u8 family) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nf-flowtable-%u", family); - if (__nft_flowtable_type_get(family)) + if (nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -6567,6 +6623,18 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nf_tables_module_autoload_cleanup(struct net *net) +{ + struct nft_module_request *req, *next; + + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + list_for_each_entry_safe(req, next, &net->nft.module_list, list) { + WARN_ON_ONCE(!req->done); + list_del(&req->list); + kfree(req); + } +} + static void nf_tables_commit_release(struct net *net) { struct nft_trans *trans; @@ -6579,6 +6647,7 @@ static void nf_tables_commit_release(struct net *net) * to prevent expensive synchronize_rcu() in commit phase. */ if (list_empty(&net->nft.commit_list)) { + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); return; } @@ -6593,6 +6662,7 @@ static void nf_tables_commit_release(struct net *net) list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); schedule_work(&trans_destroy_work); @@ -6771,6 +6841,21 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } +static void nf_tables_module_autoload(struct net *net) +{ + struct nft_module_request *req, *next; + LIST_HEAD(module_list); + + list_splice_init(&net->nft.module_list, &module_list); + mutex_unlock(&net->nft.commit_mutex); + list_for_each_entry_safe(req, next, &module_list, list) { + request_module("%s", req->module); + req->done = true; + } + mutex_lock(&net->nft.commit_mutex); + list_splice(&module_list, &net->nft.module_list); +} + static void nf_tables_abort_release(struct nft_trans *trans) { switch (trans->msg_type) { @@ -6800,7 +6885,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int __nf_tables_abort(struct net *net) +static int __nf_tables_abort(struct net *net, bool autoload) { struct nft_trans *trans, *next; struct nft_trans_elem *te; @@ -6917,6 +7002,11 @@ static int __nf_tables_abort(struct net *net) nf_tables_abort_release(trans); } + if (autoload) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + return 0; } @@ -6925,9 +7015,9 @@ static void nf_tables_cleanup(struct net *net) nft_validate_state_update(net, NFT_VALIDATE_SKIP); } -static int nf_tables_abort(struct net *net, struct sk_buff *skb) +static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload) { - int ret = __nf_tables_abort(net); + int ret = __nf_tables_abort(net, autoload); mutex_unlock(&net->nft.commit_mutex); @@ -7522,6 +7612,7 @@ static int __net_init nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); + INIT_LIST_HEAD(&net->nft.module_list); mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; @@ -7533,10 +7624,11 @@ static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net); + __nf_tables_abort(net, false); __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); + WARN_ON_ONCE(!list_empty(&net->nft.module_list)); } static struct pernet_operations nf_tables_net_ops = { diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4abbb452cf6c..99127e2d95a8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -476,7 +476,7 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb); + ss->abort(net, oskb, true); nfnl_err_reset(&err_list); kfree_skb(skb); module_put(ss->owner); @@ -487,11 +487,11 @@ done: status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { - ss->abort(net, oskb); + ss->abort(net, oskb, false); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } } else { - ss->abort(net, oskb); + ss->abort(net, oskb, false); } if (ss->cleanup) ss->cleanup(net); diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 7525063c25f5..81406b93f126 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -103,7 +103,7 @@ nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) if (help->helper->data_len == 0) return -EINVAL; - nla_memcpy(help->data, nla_data(attr), sizeof(help->data)); + nla_memcpy(help->data, attr, sizeof(help->data)); return 0; } @@ -240,6 +240,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], ret = -ENOMEM; goto err2; } + helper->data_len = size; helper->flags |= NF_CT_HELPER_F_USERSPACE; memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); @@ -742,6 +743,8 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { [NFCTH_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, + [NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, }, + [NFCTH_STATUS] = { .type = NLA_U32, }, }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index f42326b40d6f..2e7ced752ef1 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -166,12 +166,12 @@ static bool nf_osf_match_one(const struct sk_buff *skb, static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, const struct sk_buff *skb, const struct iphdr *ip, - unsigned char *opts) + unsigned char *opts, + struct tcphdr *_tcph) { const struct tcphdr *tcp; - struct tcphdr _tcph; - tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph); if (!tcp) return NULL; @@ -206,10 +206,11 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, int fmatch = FMATCH_WRONG; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; + struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); - tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; @@ -266,10 +267,11 @@ bool nf_osf_find(const struct sk_buff *skb, const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; + struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); - tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index b310b637b550..143b13b28a59 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -79,7 +79,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (d1.len != priv->len) { + if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) { err = -EINVAL; goto err1; } @@ -88,7 +88,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, tb[NFTA_BITWISE_XOR]); if (err < 0) goto err1; - if (d2.len != priv->len) { + if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) { err = -EINVAL; goto err2; } diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index ff9ac8ae0031..eac4a901233f 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -89,6 +89,7 @@ static const struct nft_chain_type nft_chain_nat_inet = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_INET, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 411c0cf741e3..abd7be110f80 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -79,6 +79,12 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; + if (desc.type != NFT_DATA_VALUE) { + err = -EINVAL; + nft_data_release(&priv->data, desc.type); + return err; + } + priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); err = nft_validate_register_load(priv->sreg, desc.len); if (err < 0) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 01705ad74a9a..da21b771a259 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -179,9 +179,6 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { - struct nft_flow_offload *priv = nft_expr_priv(expr); - - priv->flowtable->use--; nf_ct_netns_put(ctx->net, ctx->family); } diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 61b7f93ac681..42afcd2bc6f8 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -27,6 +27,9 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; + /* This is used by ifb only. */ + skb_set_redirected(pkt->skb, true); + nf_fwd_netdev_egress(pkt, oif); regs->verdict.code = NF_STOLEN; } @@ -179,6 +182,13 @@ nla_put_failure: return -1; } +static int nft_fwd_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); +} + static struct nft_expr_type nft_fwd_netdev_type; static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .type = &nft_fwd_netdev_type, @@ -186,6 +196,7 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .eval = nft_fwd_neigh_eval, .init = nft_fwd_neigh_init, .dump = nft_fwd_neigh_dump, + .validate = nft_fwd_validate, }; static const struct nft_expr_ops nft_fwd_netdev_ops = { @@ -194,6 +205,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .eval = nft_fwd_netdev_eval, .init = nft_fwd_netdev_init, .dump = nft_fwd_netdev_dump, + .validate = nft_fwd_validate, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index a54329b8634a..b34231cd82ac 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -544,6 +544,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) return ERR_PTR(-EINVAL); +#if defined(CONFIG_NF_TABLES_BRIDGE) && IS_MODULE(CONFIG_NFT_BRIDGE_META) + if (ctx->family == NFPROTO_BRIDGE) + return ERR_PTR(-EAGAIN); +#endif if (tb[NFTA_META_DREG]) return &nft_meta_get_ops; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index c3c93e95b46e..243e8107f456 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -129,7 +129,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->type = NF_NAT_MANIP_DST; break; default: - return -EINVAL; + return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) @@ -196,7 +196,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (tb[NFTA_NAT_FLAGS]) { priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; + return -EOPNOTSUPP; } return nf_ct_netns_get(ctx->net, family); diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index f54d6ae15bb1..b42247aa48a9 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -61,6 +61,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, int err; u8 ttl; + if (!tb[NFTA_OSF_DREG]) + return -EINVAL; + if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 680bd9f38a81..f2ab0066e7a7 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -118,6 +118,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 }, }; static int nft_payload_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 4701fa8a45e7..89efcc5a533d 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -66,11 +66,21 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr if (err < 0) return err; + if (desc_from.type != NFT_DATA_VALUE) { + err = -EINVAL; + goto err1; + } + err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), &desc_to, tb[NFTA_RANGE_TO_DATA]); if (err < 0) goto err1; + if (desc_to.type != NFT_DATA_VALUE) { + err = -EINVAL; + goto err2; + } + if (desc_from.len != desc_to.len) { err = -EINVAL; goto err2; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 419d58ef802b..34ae80ea96ae 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -33,6 +33,11 @@ static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); } +static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) +{ + return !nft_rbtree_interval_end(rbe); +} + static bool nft_rbtree_equal(const struct nft_set *set, const void *this, const struct nft_rbtree_elem *interval) { @@ -64,7 +69,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (interval && nft_rbtree_equal(set, this, interval) && nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(interval)) + nft_rbtree_interval_start(interval)) continue; interval = rbe; } else if (d > 0) @@ -74,8 +79,17 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set parent = rcu_dereference_raw(parent->rb_left); continue; } - if (nft_rbtree_interval_end(rbe)) - goto out; + + if (nft_set_elem_expired(&rbe->ext)) + return false; + + if (nft_rbtree_interval_end(rbe)) { + if (nft_set_is_anonymous(set)) + return false; + parent = rcu_dereference_raw(parent->rb_left); + interval = NULL; + continue; + } *ext = &rbe->ext; return true; @@ -84,11 +98,12 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_interval_end(interval)) { + !nft_set_elem_expired(&interval->ext) && + nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; } -out: + return false; } @@ -139,8 +154,13 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, if (flags & NFT_SET_ELEM_INTERVAL_END) interval = rbe; } else { - if (!nft_set_elem_active(&rbe->ext, genmask)) + if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = rcu_dereference_raw(parent->rb_left); + continue; + } + + if (nft_set_elem_expired(&rbe->ext)) + return false; if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == @@ -148,12 +168,17 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, *elem = rbe; return true; } - return false; + + if (nft_rbtree_interval_end(rbe)) + interval = NULL; + + parent = rcu_dereference_raw(parent->rb_left); } } if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && + !nft_set_elem_expired(&interval->ext) && ((!nft_rbtree_interval_end(interval) && !(flags & NFT_SET_ELEM_INTERVAL_END)) || (nft_rbtree_interval_end(interval) && @@ -213,9 +238,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, p = &parent->rb_right; else { if (nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(new)) { + nft_rbtree_interval_start(new)) { p = &parent->rb_left; - } else if (!nft_rbtree_interval_end(rbe) && + } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(new)) { p = &parent->rb_right; } else if (nft_set_elem_active(&rbe->ext, genmask)) { @@ -306,10 +331,10 @@ static void *nft_rbtree_deactivate(const struct net *net, parent = parent->rb_right; else { if (nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(this)) { + nft_rbtree_interval_start(this)) { parent = parent->rb_left; continue; - } else if (!nft_rbtree_interval_end(rbe) && + } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; @@ -339,6 +364,8 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; + if (nft_set_elem_expired(&rbe->ext)) + goto cont; if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index f92a82c73880..95980154ef02 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -50,7 +50,7 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; @@ -117,7 +117,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 3d4c2ae605a8..1effd4878619 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -76,7 +76,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, struct nft_tunnel *priv = nft_expr_priv(expr); u32 len; - if (!tb[NFTA_TUNNEL_KEY] && + if (!tb[NFTA_TUNNEL_KEY] || !tb[NFTA_TUNNEL_DREG]) return -EINVAL; @@ -248,8 +248,9 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, } static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = { + [NFTA_TUNNEL_KEY_ERSPAN_VERSION] = { .type = NLA_U32 }, [NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 }, - [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, + [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, [NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 }, }; @@ -266,6 +267,9 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, if (err < 0) return err; + if (!tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]) + return -EINVAL; + version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION])); switch (version) { case ERSPAN_VERSION: @@ -335,6 +339,8 @@ static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] = [NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, }, [NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, }, [NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, }, + [NFTA_TUNNEL_KEY_SPORT] = { .type = NLA_U16, }, + [NFTA_TUNNEL_KEY_DPORT] = { .type = NLA_U16, }, [NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, }, }; @@ -501,8 +507,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, static int nft_tunnel_ports_dump(struct sk_buff *skb, struct ip_tunnel_info *info) { - if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 || - nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0) + if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 || + nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0) return -1; return 0; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index ce70c2576bb2..44f971f31992 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1551,6 +1551,9 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; + if (ppos != NULL) + ++(*ppos); + switch (trav->class) { case MTTG_TRAV_INIT: trav->class = MTTG_TRAV_NFP_UNSPEC; @@ -1576,9 +1579,6 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, default: return NULL; } - - if (ppos != NULL) - ++*ppos; return trav; } diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 2d2691dd51e0..6be05be8ed47 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -36,6 +36,7 @@ #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/xt_hashlimit.h> #include <linux/mutex.h> +#include <linux/refcount.h> #include <linux/kernel.h> MODULE_LICENSE("GPL"); @@ -109,7 +110,7 @@ struct dsthash_ent { struct xt_hashlimit_htable { struct hlist_node node; /* global list of all htables */ - int use; + refcount_t use; u_int8_t family; bool rnd_initialized; @@ -310,7 +311,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, for (i = 0; i < hinfo->cfg.size; i++) INIT_HLIST_HEAD(&hinfo->hash[i]); - hinfo->use = 1; + refcount_set(&hinfo->use, 1); hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; @@ -352,21 +353,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, return 0; } -static bool select_all(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return true; -} - -static bool select_gc(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return time_after_eq(jiffies, he->expires); -} - -static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, - bool (*select)(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he)) +static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select_all) { unsigned int i; @@ -376,7 +363,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, spin_lock_bh(&ht->lock); hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { - if ((*select)(ht, dh)) + if (time_after_eq(jiffies, dh->expires) || select_all) dsthash_free(ht, dh); } spin_unlock_bh(&ht->lock); @@ -390,7 +377,7 @@ static void htable_gc(struct work_struct *work) ht = container_of(work, struct xt_hashlimit_htable, gc_work.work); - htable_selective_cleanup(ht, select_gc); + htable_selective_cleanup(ht, false); queue_delayed_work(system_power_efficient_wq, &ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval)); @@ -410,15 +397,6 @@ static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) remove_proc_entry(hinfo->name, parent); } -static void htable_destroy(struct xt_hashlimit_htable *hinfo) -{ - cancel_delayed_work_sync(&hinfo->gc_work); - htable_remove_proc_entry(hinfo); - htable_selective_cleanup(hinfo, select_all); - kfree(hinfo->name); - vfree(hinfo); -} - static struct xt_hashlimit_htable *htable_find_get(struct net *net, const char *name, u_int8_t family) @@ -429,7 +407,7 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net, hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) { if (!strcmp(name, hinfo->name) && hinfo->family == family) { - hinfo->use++; + refcount_inc(&hinfo->use); return hinfo; } } @@ -438,12 +416,16 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net, static void htable_put(struct xt_hashlimit_htable *hinfo) { - mutex_lock(&hashlimit_mutex); - if (--hinfo->use == 0) { + if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) { hlist_del(&hinfo->node); - htable_destroy(hinfo); + htable_remove_proc_entry(hinfo); + mutex_unlock(&hashlimit_mutex); + + cancel_delayed_work_sync(&hinfo->gc_work); + htable_selective_cleanup(hinfo, true); + kfree(hinfo->name); + vfree(hinfo); } - mutex_unlock(&hashlimit_mutex); } /* The algorithm used is the Simple Token Bucket Filter (TBF) @@ -846,6 +828,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3); } +#define HASHLIMIT_MAX_SIZE 1048576 + static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, struct xt_hashlimit_htable **hinfo, struct hashlimit_cfg3 *cfg, @@ -856,6 +840,14 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, if (cfg->gc_interval == 0 || cfg->expire == 0) return -EINVAL; + if (cfg->size > HASHLIMIT_MAX_SIZE) { + cfg->size = HASHLIMIT_MAX_SIZE; + pr_info_ratelimited("size too large, truncated to %u\n", cfg->size); + } + if (cfg->max > HASHLIMIT_MAX_SIZE) { + cfg->max = HASHLIMIT_MAX_SIZE; + pr_info_ratelimited("max too large, truncated to %u\n", cfg->max); + } if (par->family == NFPROTO_IPV4) { if (cfg->srcmask > 32 || cfg->dstmask > 32) return -EINVAL; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 781e0b482189..6c2582a19766 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -492,12 +492,12 @@ static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) const struct recent_entry *e = v; const struct list_head *head = e->list.next; + (*pos)++; while (head == &t->iphash[st->bucket]) { if (++st->bucket >= ip_list_hash_size) return NULL; head = t->iphash[st->bucket].next; } - (*pos)++; return list_entry(head, struct recent_entry, list); } |