Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--	net/packet/af_packet.c	152
1 file changed, 100 insertions(+), 52 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bad144dfabc5..b2e4c4b192e6 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
smp_wmb();
}
-static int __packet_get_status(struct packet_sock *po, void *frame)
+static int __packet_get_status(const struct packet_sock *po, void *frame)
{
union tpacket_uhdr h;
@@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
return ts_status;
}
-static void *packet_lookup_frame(struct packet_sock *po,
- struct packet_ring_buffer *rb,
- unsigned int position,
- int status)
+static void *packet_lookup_frame(const struct packet_sock *po,
+ const struct packet_ring_buffer *rb,
+ unsigned int position,
+ int status)
{
unsigned int pg_vec_pos, frame_offset;
union tpacket_uhdr h;
@@ -544,7 +544,8 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
msec = 1;
div = ecmd.base.speed / 1000;
}
- }
+ } else
+ return DEFAULT_PRB_RETIRE_TOV;
mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
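
DEFAULT_PRB_RETIRE_TOV is only the fallback for when the link speed cannot
be read; a TPACKET_V3 user can pin the block-retire timeout explicitly via
tp_retire_blk_tov in the ring request. A minimal sketch (field values are
illustrative only):

    #include <linux/if_packet.h>

    static const struct tpacket_req3 req3 = {
        .tp_block_size     = 1 << 22,
        .tp_block_nr       = 64,
        .tp_frame_size     = 1 << 11,
        .tp_frame_nr       = (1 << 22) / (1 << 11) * 64,
        .tp_retire_blk_tov = 60,  /* ms; 0 lets the kernel compute it */
    };
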
@@ -758,7 +759,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
struct sock *sk = &po->sk;
- if (po->stats.stats3.tp_drops)
+ if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING;
last_pkt = (struct tpacket3_hdr *)pkc1->prev;
@@ -1083,10 +1084,10 @@ static void *packet_current_rx_frame(struct packet_sock *po,
}
}
-static void *prb_lookup_block(struct packet_sock *po,
- struct packet_ring_buffer *rb,
- unsigned int idx,
- int status)
+static void *prb_lookup_block(const struct packet_sock *po,
+ const struct packet_ring_buffer *rb,
+ unsigned int idx,
+ int status)
{
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
@@ -1199,12 +1200,12 @@ static void packet_free_pending(struct packet_sock *po)
#define ROOM_LOW 0x1
#define ROOM_NORMAL 0x2
-static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{
int idx, len;
- len = po->rx_ring.frame_max + 1;
- idx = po->rx_ring.head;
+ len = READ_ONCE(po->rx_ring.frame_max) + 1;
+ idx = READ_ONCE(po->rx_ring.head);
if (pow_off)
idx += len >> pow_off;
if (idx >= len)
@@ -1212,12 +1213,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
-static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{
int idx, len;
- len = po->rx_ring.prb_bdqc.knum_blocks;
- idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+ len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
+ idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
if (pow_off)
idx += len >> pow_off;
if (idx >= len)
@@ -1225,15 +1226,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
-static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+static int __packet_rcv_has_room(const struct packet_sock *po,
+ const struct sk_buff *skb)
{
- struct sock *sk = &po->sk;
+ const struct sock *sk = &po->sk;
int ret = ROOM_NONE;
if (po->prot_hook.func != tpacket_rcv) {
- int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
- - (skb ? skb->truesize : 0);
- if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+ int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+ int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+ - (skb ? skb->truesize : 0);
+
+ if (avail > (rcvbuf >> ROOM_POW_OFF))
return ROOM_NORMAL;
else if (avail > 0)
return ROOM_LOW;
@@ -1258,19 +1262,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
- int ret;
- bool has_room;
+ int pressure, ret;
- spin_lock_bh(&po->sk.sk_receive_queue.lock);
ret = __packet_rcv_has_room(po, skb);
- has_room = ret == ROOM_NORMAL;
- if (po->pressure == has_room)
- po->pressure = !has_room;
- spin_unlock_bh(&po->sk.sk_receive_queue.lock);
+ pressure = ret != ROOM_NORMAL;
+
+ if (READ_ONCE(po->pressure) != pressure)
+ WRITE_ONCE(po->pressure, pressure);
return ret;
}
+static void packet_rcv_try_clear_pressure(struct packet_sock *po)
+{
+ if (READ_ONCE(po->pressure) &&
+ __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+ WRITE_ONCE(po->pressure, 0);
+}
+
static void packet_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_error_queue);
@@ -1288,15 +1297,21 @@ static void packet_sock_destruct(struct sock *sk)
static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
- u32 rxhash;
+ u32 *history = po->rollover->history;
+ u32 victim, rxhash;
int i, count = 0;
rxhash = skb_get_hash(skb);
for (i = 0; i < ROLLOVER_HLEN; i++)
- if (po->rollover->history[i] == rxhash)
+ if (READ_ONCE(history[i]) == rxhash)
count++;
- po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
+ victim = prandom_u32() % ROLLOVER_HLEN;
+
+ /* Avoid dirtying the cache line if possible */
+ if (READ_ONCE(history[victim]) != rxhash)
+ WRITE_ONCE(history[victim], rxhash);
+
return count > (ROLLOVER_HLEN >> 1);
}
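
The check-before-store in fanout_flow_is_huge() keeps the rollover history
cache line clean when the hash is already recorded. A userspace analogue of
the same pattern (hedged sketch; C11 relaxed atomics stand in for
READ_ONCE()/WRITE_ONCE()):

    #include <stdatomic.h>
    #include <stdint.h>

    static void record_hash(_Atomic uint32_t *slot, uint32_t rxhash)
    {
        /* Avoid dirtying the cache line if the value is already there */
        if (atomic_load_explicit(slot, memory_order_relaxed) != rxhash)
            atomic_store_explicit(slot, rxhash, memory_order_relaxed);
    }

Skipping the redundant store lets the line stay in the shared state in other
CPUs' caches instead of being invalidated on every packet.
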
@@ -1351,7 +1366,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
i = j = min_t(int, po->rollover->sock, num - 1);
do {
po_next = pkt_sk(f->arr[i]);
- if (po_next != po_skip && !po_next->pressure &&
+ if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j)
po->rollover->sock = i;
@@ -2126,10 +2141,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct:
is_drop_n_account = true;
- spin_lock(&sk->sk_receive_queue.lock);
- po->stats.stats1.tp_drops++;
+ atomic_inc(&po->tp_drops);
atomic_inc(&sk->sk_drops);
- spin_unlock(&sk->sk_receive_queue.lock);
drop_n_restore:
if (skb_head != skb->data && skb_shared(skb)) {
@@ -2155,11 +2168,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
int skb_len = skb->len;
unsigned int snaplen, res;
unsigned long status = TP_STATUS_USER;
- unsigned short macoff, netoff, hdrlen;
+ unsigned short macoff, hdrlen;
+ unsigned int netoff;
struct sk_buff *copy_skb = NULL;
struct timespec ts;
__u32 ts_status;
bool is_drop_n_account = false;
+ unsigned int slot_id = 0;
bool do_vnet = false;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
@@ -2193,6 +2208,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!res)
goto drop_n_restore;
+ /* If we are flooded, just give up */
+ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
+ atomic_inc(&po->tp_drops);
+ goto drop_n_restore;
+ }
+
if (skb->ip_summed == CHECKSUM_PARTIAL)
status |= TP_STATUS_CSUMNOTREADY;
else if (skb->pkt_type != PACKET_OUTGOING &&
@@ -2217,6 +2238,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
}
macoff = netoff - maclen;
}
+ if (netoff > USHRT_MAX) {
+ atomic_inc(&po->tp_drops);
+ goto drop_n_restore;
+ }
if (po->tp_version <= TPACKET_V2) {
if (macoff + snaplen > po->rx_ring.frame_size) {
if (po->copy_thresh &&
@@ -2255,6 +2280,20 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
TP_STATUS_KERNEL, (macoff+snaplen));
if (!h.raw)
goto drop_n_account;
+
+ if (po->tp_version <= TPACKET_V2) {
+ slot_id = po->rx_ring.head;
+ if (test_bit(slot_id, po->rx_ring.rx_owner_map))
+ goto drop_n_account;
+ __set_bit(slot_id, po->rx_ring.rx_owner_map);
+ }
+
+ if (do_vnet &&
+ virtio_net_hdr_from_skb(skb, h.raw + macoff -
+ sizeof(struct virtio_net_hdr),
+ vio_le(), true, 0))
+ goto drop_n_account;
+
if (po->tp_version <= TPACKET_V2) {
packet_increment_rx_head(po, &po->rx_ring);
/*
@@ -2263,16 +2302,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
* Anyways, moving it for V1/V2 only as V3 doesn't need this
* at packet level.
*/
- if (po->stats.stats1.tp_drops)
+ if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING;
}
- if (do_vnet &&
- virtio_net_hdr_from_skb(skb, h.raw + macoff -
- sizeof(struct virtio_net_hdr),
- vio_le(), true, 0))
- goto drop_n_account;
-
po->stats.stats1.tp_packets++;
if (copy_skb) {
status |= TP_STATUS_COPY;
@@ -2360,7 +2393,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
#endif
if (po->tp_version <= TPACKET_V2) {
+ spin_lock(&sk->sk_receive_queue.lock);
__packet_set_status(po, h.raw, status);
+ __clear_bit(slot_id, po->rx_ring.rx_owner_map);
+ spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk);
} else {
prb_clear_blk_fill_status(&po->rx_ring);
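
The new rx_owner_map bit guards the kernel side of the slot handshake; the
userspace side is unchanged. A hedged consumer sketch for a TPACKET_V2 RX
ring (ring, fsz and nframes are assumed to come from the
PACKET_RX_RING/mmap() setup shown further below):

    #include <linux/if_packet.h>
    #include <poll.h>
    #include <stddef.h>

    static void consume_ring(void *ring, size_t fsz, unsigned int nframes, int fd)
    {
        for (unsigned int i = 0; ; i = (i + 1) % nframes) {
            volatile struct tpacket2_hdr *hdr =
                (void *)((char *)ring + i * fsz);

            /* wait until the kernel hands this slot over */
            while (!(hdr->tp_status & TP_STATUS_USER)) {
                struct pollfd pfd = { .fd = fd, .events = POLLIN };
                poll(&pfd, 1, -1);
            }
            /* frame data: (char *)hdr + hdr->tp_mac, hdr->tp_snaplen bytes */
            __sync_synchronize();
            hdr->tp_status = TP_STATUS_KERNEL;  /* hand the slot back */
        }
    }
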
@@ -2379,9 +2415,9 @@ drop:
return 0;
drop_n_account:
- is_drop_n_account = true;
- po->stats.stats1.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock);
+ atomic_inc(&po->tp_drops);
+ is_drop_n_account = true;
sk->sk_data_ready(sk);
kfree_skb(copy_skb);
@@ -3325,8 +3361,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (skb == NULL)
goto out;
- if (pkt_sk(sk)->pressure)
- packet_rcv_has_room(pkt_sk(sk), NULL);
+ packet_rcv_try_clear_pressure(pkt_sk(sk));
if (pkt_sk(sk)->has_vnet_hdr) {
err = packet_rcv_vnet(msg, skb, &len);
@@ -3898,6 +3933,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
void *data = &val;
union tpacket_stats_u st;
struct tpacket_rollover_stats rstats;
+ int drops;
if (level != SOL_PACKET)
return -ENOPROTOOPT;
@@ -3914,14 +3950,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
memcpy(&st, &po->stats, sizeof(st));
memset(&po->stats, 0, sizeof(po->stats));
spin_unlock_bh(&sk->sk_receive_queue.lock);
+ drops = atomic_xchg(&po->tp_drops, 0);
if (po->tp_version == TPACKET_V3) {
lv = sizeof(struct tpacket_stats_v3);
- st.stats3.tp_packets += st.stats3.tp_drops;
+ st.stats3.tp_drops = drops;
+ st.stats3.tp_packets += drops;
data = &st.stats3;
} else {
lv = sizeof(struct tpacket_stats);
- st.stats1.tp_packets += st.stats1.tp_drops;
+ st.stats1.tp_drops = drops;
+ st.stats1.tp_packets += drops;
data = &st.stats1;
}
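
Userspace reads (and implicitly resets) these counters through
getsockopt(PACKET_STATISTICS), which is why the kernel side drains tp_drops
with atomic_xchg(). A minimal sketch, assuming fd is a bound AF_PACKET
socket using TPACKET_V1/V2:

    #include <linux/if_packet.h>
    #include <sys/socket.h>
    #include <stdio.h>

    static void dump_stats(int fd)
    {
        struct tpacket_stats st;   /* struct tpacket_stats_v3 for TPACKET_V3 */
        socklen_t len = sizeof(st);

        /* the call returns the counters and zeroes them in the kernel */
        if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
            printf("packets=%u drops=%u\n", st.tp_packets, st.tp_drops);
    }
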
@@ -4140,8 +4179,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock,
TP_STATUS_KERNEL))
mask |= EPOLLIN | EPOLLRDNORM;
}
- if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
- po->pressure = 0;
+ packet_rcv_try_clear_pressure(po);
spin_unlock_bh(&sk->sk_receive_queue.lock);
spin_lock_bh(&sk->sk_write_queue.lock);
if (po->tx_ring.pg_vec) {
@@ -4255,6 +4293,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
{
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
+ unsigned long *rx_owner_map = NULL;
int was_running, order = 0;
struct packet_ring_buffer *rb;
struct sk_buff_head *rb_queue;
@@ -4340,6 +4379,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
}
break;
default:
+ if (!tx_ring) {
+ rx_owner_map = bitmap_alloc(req->tp_frame_nr,
+ GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
+ if (!rx_owner_map)
+ goto out_free_pg_vec;
+ }
break;
}
}
@@ -4369,6 +4414,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
err = 0;
spin_lock_bh(&rb_queue->lock);
swap(rb->pg_vec, pg_vec);
+ if (po->tp_version <= TPACKET_V2)
+ swap(rb->rx_owner_map, rx_owner_map);
rb->frame_max = (req->tp_frame_nr - 1);
rb->head = 0;
rb->frame_size = req->tp_frame_size;
@@ -4400,6 +4447,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
}
out_free_pg_vec:
+ bitmap_free(rx_owner_map);
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
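
The rx_owner_map bitmap is sized by req->tp_frame_nr, which comes straight
from the userspace ring request. A hedged setup sketch for the TPACKET_V2
RX ring assumed by the consumer example above (sizes are illustrative;
the caller checks the result against MAP_FAILED):

    #include <linux/if_packet.h>
    #include <sys/mman.h>
    #include <sys/socket.h>
    #include <stddef.h>

    static void *setup_rx_ring(int fd, struct tpacket_req *req)
    {
        int ver = TPACKET_V2;

        /* select V2 before requesting the ring */
        if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
            return MAP_FAILED;

        req->tp_block_size = 1 << 16;
        req->tp_block_nr   = 64;
        req->tp_frame_size = 1 << 11;
        req->tp_frame_nr   = (1 << 16) / (1 << 11) * 64;  /* sizes rx_owner_map */

        if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)) < 0)
            return MAP_FAILED;

        return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    }
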