Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--  net/core/skbuff.c  395
1 file changed, 283 insertions(+), 112 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8d289697cc7a..7b3df0d518ab 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -257,16 +257,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
kmemcheck_annotate_variable(shinfo->destructor_arg);
if (flags & SKB_ALLOC_FCLONE) {
- struct sk_buff *child = skb + 1;
- atomic_t *fclone_ref = (atomic_t *) (child + 1);
+ struct sk_buff_fclones *fclones;
- kmemcheck_annotate_bitfield(child, flags1);
- kmemcheck_annotate_bitfield(child, flags2);
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
skb->fclone = SKB_FCLONE_ORIG;
- atomic_set(fclone_ref, 1);
+ atomic_set(&fclones->fclone_ref, 1);
- child->fclone = SKB_FCLONE_UNAVAILABLE;
- child->pfmemalloc = pfmemalloc;
+ fclones->skb2.fclone = SKB_FCLONE_FREE;
+ fclones->skb2.pfmemalloc = pfmemalloc;
}
out:
return skb;
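
The fast-clone setup above no longer does pointer arithmetic past the end of the skb; it recovers the enclosing struct sk_buff_fclones with container_of(). A minimal userspace sketch of that pattern follows; the structs are illustrative stand-ins, and container_of() is redefined locally for the demo.

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-ins for struct sk_buff / struct sk_buff_fclones. */
struct buf {
	int fclone;
};

struct buf_fclones {
	struct buf skb1;
	struct buf skb2;
	int fclone_ref;
};

/* Same idea as the kernel's container_of(): recover the enclosing
 * structure from a pointer to one of its members.
 */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct buf_fclones pair = { .fclone_ref = 1 };
	struct buf *skb = &pair.skb1;

	/* Given only 'skb', locate the companion clone and the shared count. */
	struct buf_fclones *fclones = container_of(skb, struct buf_fclones, skb1);

	printf("clone slot at %p, fclone_ref = %d\n",
	       (void *)&fclones->skb2, fclones->fclone_ref);
	return 0;
}
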
@@ -491,32 +491,33 @@ static void skb_free_head(struct sk_buff *skb)
static void skb_release_data(struct sk_buff *skb)
{
- if (!skb->cloned ||
- !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
- &skb_shinfo(skb)->dataref)) {
- if (skb_shinfo(skb)->nr_frags) {
- int i;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- skb_frag_unref(skb, i);
- }
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int i;
- /*
- * If skb buf is from userspace, we need to notify the caller
- * the lower device DMA has done;
- */
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- struct ubuf_info *uarg;
+ if (skb->cloned &&
+ atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
+ &shinfo->dataref))
+ return;
- uarg = skb_shinfo(skb)->destructor_arg;
- if (uarg->callback)
- uarg->callback(uarg, true);
- }
+ for (i = 0; i < shinfo->nr_frags; i++)
+ __skb_frag_unref(&shinfo->frags[i]);
- if (skb_has_frag_list(skb))
- skb_drop_fraglist(skb);
+ /*
+ * If the skb buffer came from userspace, notify the caller that
+ * the lower device has finished its DMA transfer.
+ */
+ if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ struct ubuf_info *uarg;
- skb_free_head(skb);
+ uarg = shinfo->destructor_arg;
+ if (uarg->callback)
+ uarg->callback(uarg, true);
}
+
+ if (shinfo->frag_list)
+ kfree_skb_list(shinfo->frag_list);
+
+ skb_free_head(skb);
}
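
The rewritten skb_release_data() inverts the dataref test and returns early, so the actual free path runs without extra nesting. A simplified userspace analogue of that shape, using C11 atomics in place of atomic_sub_return() and ignoring the nohdr bias:

#include <stdatomic.h>
#include <stdlib.h>

/* Illustrative stand-in for the shared data behind cloned buffers. */
struct shared_buf {
	atomic_int dataref;
	void *head;
};

static void shared_buf_release(struct shared_buf *b, int cloned)
{
	/* Early return keeps the common "still referenced" case flat;
	 * atomic_fetch_sub() returns the old value, so subtracting 1
	 * again yields the new count, like atomic_sub_return().
	 */
	if (cloned && atomic_fetch_sub(&b->dataref, 1) - 1)
		return;

	free(b->head);
	b->head = NULL;
}

int main(void)
{
	struct shared_buf b = { .head = malloc(64) };

	atomic_init(&b.dataref, 2);   /* original + one clone share the data */
	shared_buf_release(&b, 1);    /* clone gone, data survives */
	shared_buf_release(&b, 1);    /* last user gone, data is freed */
	return 0;
}
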
/*
@@ -524,8 +525,7 @@ static void skb_release_data(struct sk_buff *skb)
*/
static void kfree_skbmem(struct sk_buff *skb)
{
- struct sk_buff *other;
- atomic_t *fclone_ref;
+ struct sk_buff_fclones *fclones;
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
@@ -533,22 +533,28 @@ static void kfree_skbmem(struct sk_buff *skb)
break;
case SKB_FCLONE_ORIG:
- fclone_ref = (atomic_t *) (skb + 2);
- if (atomic_dec_and_test(fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, skb);
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+ if (atomic_dec_and_test(&fclones->fclone_ref))
+ kmem_cache_free(skbuff_fclone_cache, fclones);
break;
case SKB_FCLONE_CLONE:
- fclone_ref = (atomic_t *) (skb + 1);
- other = skb - 1;
+ fclones = container_of(skb, struct sk_buff_fclones, skb2);
- /* The clone portion is available for
- * fast-cloning again.
+ /* Warning: we must perform the atomic_dec_and_test() before
+ * setting skb->fclone back to SKB_FCLONE_FREE, otherwise
+ * skb_clone() could set clone_ref to 2 before our decrement.
+ * Anyway, if we are going to free the structure, no need to
+ * rewrite skb->fclone.
*/
- skb->fclone = SKB_FCLONE_UNAVAILABLE;
-
- if (atomic_dec_and_test(fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, other);
+ if (atomic_dec_and_test(&fclones->fclone_ref)) {
+ kmem_cache_free(skbuff_fclone_cache, fclones);
+ } else {
+ /* The clone portion is available for
+ * fast-cloning again.
+ */
+ skb->fclone = SKB_FCLONE_FREE;
+ }
break;
}
}
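
The ordering constraint spelled out in the comment above (decrement the shared fclone_ref before advertising the clone slot as FREE) is worth seeing in isolation. A hedged userspace sketch with C11 atomics; the names are illustrative, not the kernel's:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

enum { FCLONE_CLONE, FCLONE_FREE };

/* Two buffers carved out of one allocation, sharing one reference count,
 * in the spirit of struct sk_buff_fclones. */
struct fclone_pair {
	int skb1_state;
	int skb2_state;
	atomic_int fclone_ref;
};

static void release_one(struct fclone_pair *p)
{
	/* Drop our reference *before* marking the clone slot FREE.  In the
	 * reverse order, a concurrent fast-clone could observe FREE, set
	 * the count to 2, and then this decrement would corrupt it while
	 * the pair is being recycled.
	 */
	if (atomic_fetch_sub(&p->fclone_ref, 1) == 1)
		free(p);                      /* last reference: whole pair goes */
	else
		p->skb2_state = FCLONE_FREE;  /* survivor: offer the slot again */
}

int main(void)
{
	struct fclone_pair *p = malloc(sizeof(*p));

	atomic_init(&p->fclone_ref, 2);   /* original + clone in flight */
	p->skb2_state = FCLONE_CLONE;

	release_one(p);                   /* clone dropped, slot recycled */
	printf("skb2_state=%d ref=%d\n", p->skb2_state, atomic_load(&p->fclone_ref));
	release_one(p);                   /* original dropped, pair freed */
	return 0;
}
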
@@ -566,7 +572,7 @@ static void skb_release_head_state(struct sk_buff *skb)
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb->nfct);
#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nf_bridge_put(skb->nf_bridge);
#endif
/* XXX: IS this still necessary? - JHS */
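
The switch from #ifdef to IS_ENABLED() presumably covers the case where CONFIG_BRIDGE_NETFILTER is built as a module (=m), which a plain #ifdef misses. The macro machinery below is reproduced in the spirit of include/linux/kconfig.h so it can be tried in userspace; CONFIG_DEMO is a made-up symbol:

#include <stdio.h>

/* IS_ENABLED(opt) is 1 when 'opt' is defined to 1 (=y) or 'opt_MODULE'
 * is defined (=m), and 0 otherwise. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define __is_defined(x) ___is_defined(x)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define IS_BUILTIN(option) __is_defined(option)
#define IS_MODULE(option) __is_defined(option##_MODULE)
#define IS_ENABLED(option) (IS_BUILTIN(option) || IS_MODULE(option))

#define CONFIG_DEMO_MODULE 1	/* pretend the feature is built as a module */

int main(void)
{
#ifdef CONFIG_DEMO
	printf("#ifdef sees the option\n");	/* not printed in the =m case */
#endif
	printf("IS_ENABLED(CONFIG_DEMO) = %d\n", IS_ENABLED(CONFIG_DEMO));
	return 0;
}
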
@@ -674,57 +680,61 @@ void consume_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(consume_skb);
+/* Make sure a field is enclosed inside headers_start/headers_end section */
+#define CHECK_SKB_FIELD(field) \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
+ offsetof(struct sk_buff, headers_start)); \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
+ offsetof(struct sk_buff, headers_end)); \
+
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
new->tstamp = old->tstamp;
+ /* We do not copy old->sk */
new->dev = old->dev;
- new->transport_header = old->transport_header;
- new->network_header = old->network_header;
- new->mac_header = old->mac_header;
- new->inner_protocol = old->inner_protocol;
- new->inner_transport_header = old->inner_transport_header;
- new->inner_network_header = old->inner_network_header;
- new->inner_mac_header = old->inner_mac_header;
+ memcpy(new->cb, old->cb, sizeof(old->cb));
skb_dst_copy(new, old);
- skb_copy_hash(new, old);
- new->ooo_okay = old->ooo_okay;
- new->no_fcs = old->no_fcs;
- new->encapsulation = old->encapsulation;
- new->encap_hdr_csum = old->encap_hdr_csum;
- new->csum_valid = old->csum_valid;
- new->csum_complete_sw = old->csum_complete_sw;
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
- memcpy(new->cb, old->cb, sizeof(old->cb));
- new->csum = old->csum;
- new->ignore_df = old->ignore_df;
- new->pkt_type = old->pkt_type;
- new->ip_summed = old->ip_summed;
- skb_copy_queue_mapping(new, old);
- new->priority = old->priority;
-#if IS_ENABLED(CONFIG_IP_VS)
- new->ipvs_property = old->ipvs_property;
+ __nf_copy(new, old, false);
+
+ /* Note: this field could live in the headers_start/headers_end section.
+ * It is not there yet because we do not want to introduce a 16-bit hole.
+ */
+ new->queue_mapping = old->queue_mapping;
+
+ memcpy(&new->headers_start, &old->headers_start,
+ offsetof(struct sk_buff, headers_end) -
+ offsetof(struct sk_buff, headers_start));
+ CHECK_SKB_FIELD(protocol);
+ CHECK_SKB_FIELD(csum);
+ CHECK_SKB_FIELD(hash);
+ CHECK_SKB_FIELD(priority);
+ CHECK_SKB_FIELD(skb_iif);
+ CHECK_SKB_FIELD(vlan_proto);
+ CHECK_SKB_FIELD(vlan_tci);
+ CHECK_SKB_FIELD(transport_header);
+ CHECK_SKB_FIELD(network_header);
+ CHECK_SKB_FIELD(mac_header);
+ CHECK_SKB_FIELD(inner_protocol);
+ CHECK_SKB_FIELD(inner_transport_header);
+ CHECK_SKB_FIELD(inner_network_header);
+ CHECK_SKB_FIELD(inner_mac_header);
+ CHECK_SKB_FIELD(mark);
+#ifdef CONFIG_NETWORK_SECMARK
+ CHECK_SKB_FIELD(secmark);
+#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ CHECK_SKB_FIELD(napi_id);
#endif
- new->pfmemalloc = old->pfmemalloc;
- new->protocol = old->protocol;
- new->mark = old->mark;
- new->skb_iif = old->skb_iif;
- __nf_copy(new, old);
#ifdef CONFIG_NET_SCHED
- new->tc_index = old->tc_index;
+ CHECK_SKB_FIELD(tc_index);
#ifdef CONFIG_NET_CLS_ACT
- new->tc_verd = old->tc_verd;
+ CHECK_SKB_FIELD(tc_verd);
#endif
#endif
- new->vlan_proto = old->vlan_proto;
- new->vlan_tci = old->vlan_tci;
- skb_copy_secmark(new, old);
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
- new->napi_id = old->napi_id;
-#endif
}
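
The new __copy_skb_header() replaces a long list of per-field copies with one memcpy() over the region delimited by the headers_start/headers_end markers, and CHECK_SKB_FIELD() turns "is this field inside that region?" into a compile-time check. A userspace sketch of the same idea, assuming illustrative field names and using _Static_assert in place of BUILD_BUG_ON:

#include <stdio.h>
#include <string.h>
#include <stddef.h>

#define BUILD_BUG_ON(cond) _Static_assert(!(cond), #cond)

/* Zero-length array markers (a GCC/Clang extension, as in struct sk_buff)
 * delimit the span of fields that may be bulk-copied between buffers. */
struct packet_meta {
	void *owner;                 /* deliberately NOT copied */
	char headers_start[0];
	unsigned int mark;
	unsigned short protocol;
	unsigned short vlan_tci;
	char headers_end[0];
};

#define CHECK_META_FIELD(f)						\
	BUILD_BUG_ON(offsetof(struct packet_meta, f) <			\
		     offsetof(struct packet_meta, headers_start));	\
	BUILD_BUG_ON(offsetof(struct packet_meta, f) >			\
		     offsetof(struct packet_meta, headers_end))

static void copy_meta(struct packet_meta *new, const struct packet_meta *old)
{
	memcpy(new->headers_start, old->headers_start,
	       offsetof(struct packet_meta, headers_end) -
	       offsetof(struct packet_meta, headers_start));
	/* Fails at compile time if a field ever moves out of the span. */
	CHECK_META_FIELD(mark);
	CHECK_META_FIELD(protocol);
	CHECK_META_FIELD(vlan_tci);
}

int main(void)
{
	struct packet_meta old = { .mark = 42, .protocol = 0x0800 };
	struct packet_meta new = { 0 };

	copy_meta(&new, &old);
	printf("mark=%u proto=0x%x owner=%p\n", new.mark, new.protocol, new.owner);
	return 0;
}
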
/*
@@ -855,17 +865,22 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
- struct sk_buff *n;
+ struct sk_buff_fclones *fclones = container_of(skb,
+ struct sk_buff_fclones,
+ skb1);
+ struct sk_buff *n = &fclones->skb2;
if (skb_orphan_frags(skb, gfp_mask))
return NULL;
- n = skb + 1;
if (skb->fclone == SKB_FCLONE_ORIG &&
- n->fclone == SKB_FCLONE_UNAVAILABLE) {
- atomic_t *fclone_ref = (atomic_t *) (n + 1);
+ n->fclone == SKB_FCLONE_FREE) {
n->fclone = SKB_FCLONE_CLONE;
- atomic_inc(fclone_ref);
+ /* As our fastclone was free, clone_ref must be 1 at this point.
+ * We could use atomic_inc() here, but it is faster
+ * to set the final value.
+ */
+ atomic_set(&fclones->fclone_ref, 2);
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
@@ -875,7 +890,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
return NULL;
kmemcheck_annotate_bitfield(n, flags1);
- kmemcheck_annotate_bitfield(n, flags2);
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
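
In the fast-clone branch, fclone_ref is known to be exactly 1 (the slot was FREE), so the code stores the final value 2 instead of doing an atomic increment. A small userspace sketch of that micro-optimization, valid only under the exclusive-ownership assumption the comment above states:

#include <stdatomic.h>
#include <stdio.h>

/* Only correct when the caller knows it holds the single reference: a
 * plain store of the final value replaces a locked read-modify-write. */
static void take_second_ref_exclusive(atomic_int *ref)
{
	atomic_store_explicit(ref, 2, memory_order_relaxed);
}

int main(void)
{
	atomic_int fclone_ref;

	atomic_init(&fclone_ref, 1);      /* only the original is referenced */
	take_second_ref_exclusive(&fclone_ref);
	printf("fclone_ref = %d\n", atomic_load(&fclone_ref));
	return 0;
}
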
@@ -3069,6 +3083,11 @@ perform_csum_check:
}
} while ((offset += len) < head_skb->len);
+ /* Some callers want to get the end of the list.
+ * Put it in segs->prev to avoid walking the list.
+ * (see validate_xmit_skb_list() for example)
+ */
+ segs->prev = tail;
return segs;
err:
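
The segs->prev assignment above caches the tail of the segment list in an otherwise idle pointer of the head skb, so callers that need the list end do not have to walk it. A toy singly linked list illustrating the idea; the field names are illustrative:

#include <stdio.h>

/* 'prev' is unused on the head in this toy model, so it can hold the tail. */
struct seg {
	struct seg *next;
	struct seg *prev;
	int len;
};

static struct seg *build_list(struct seg *pool, int nsegs)
{
	struct seg *head = NULL, *tail = NULL;
	int i;

	for (i = 0; i < nsegs; i++) {
		pool[i].next = NULL;
		pool[i].len = 100 * (i + 1);
		if (!head)
			head = &pool[i];
		else
			tail->next = &pool[i];
		tail = &pool[i];
	}
	if (head)
		head->prev = tail;   /* callers read this instead of walking */
	return head;
}

int main(void)
{
	struct seg pool[4];
	struct seg *head = build_list(pool, 4);

	printf("tail length via head->prev: %d\n", head->prev->len);
	return 0;
}
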
@@ -3182,7 +3201,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
skb_shinfo(nskb)->frag_list = p;
skb_shinfo(nskb)->gso_size = pinfo->gso_size;
pinfo->gso_size = 0;
- skb_header_release(p);
+ __skb_header_release(p);
NAPI_GRO_CB(nskb)->last = p;
nskb->data_len += p->len;
@@ -3214,7 +3233,7 @@ merge:
else
NAPI_GRO_CB(p)->last->next = skb;
NAPI_GRO_CB(p)->last = skb;
- skb_header_release(skb);
+ __skb_header_release(skb);
lp = p;
done:
@@ -3230,7 +3249,6 @@ done:
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
}
-EXPORT_SYMBOL_GPL(skb_gro_receive);
void __init skb_init(void)
{
@@ -3240,8 +3258,7 @@ void __init skb_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
- (2*sizeof(struct sk_buff)) +
- sizeof(atomic_t),
+ sizeof(struct sk_buff_fclones),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
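
Sizing the fclone cache as sizeof(struct sk_buff_fclones) rather than 2*sizeof(struct sk_buff) + sizeof(atomic_t) means the slab size can never drift from the layout that container_of() assumes, and it automatically accounts for any padding the compiler inserts. A quick userspace illustration with made-up structs:

#include <stdio.h>
#include <stddef.h>

struct buf {
	long data;
	short flags;          /* tail padding follows on most 64-bit ABIs */
};

struct buf_pair {
	struct buf skb1;
	struct buf skb2;
	int ref;
};

int main(void)
{
	size_t manual = 2 * sizeof(struct buf) + sizeof(int);
	size_t layout = sizeof(struct buf_pair);

	/* The two can differ once alignment padding is involved; only the
	 * second is guaranteed to match what container_of() expects. */
	printf("manual=%zu sizeof(struct buf_pair)=%zu\n", manual, layout);
	return 0;
}
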
@@ -3494,32 +3511,66 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);
-void __skb_tstamp_tx(struct sk_buff *orig_skb,
- struct skb_shared_hwtstamps *hwtstamps,
- struct sock *sk, int tstype)
+struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
- struct sock_exterr_skb *serr;
- struct sk_buff *skb;
- int err;
+ struct sk_buff_head *q = &sk->sk_error_queue;
+ struct sk_buff *skb, *skb_next;
+ int err = 0;
- if (!sk)
- return;
+ spin_lock_bh(&q->lock);
+ skb = __skb_dequeue(q);
+ if (skb && (skb_next = skb_peek(q)))
+ err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ spin_unlock_bh(&q->lock);
- if (hwtstamps) {
- *skb_hwtstamps(orig_skb) =
- *hwtstamps;
- } else {
- /*
- * no hardware time stamps available,
- * so keep the shared tx_flags and only
- * store software time stamp
- */
- orig_skb->tstamp = ktime_get_real();
+ sk->sk_err = err;
+ if (err)
+ sk->sk_error_report(sk);
+
+ return skb;
+}
+EXPORT_SYMBOL(sock_dequeue_err_skb);
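
sock_dequeue_err_skb() pops one entry and, while still holding the queue lock, peeks at the next one so sk_err always reflects what is still queued (and drops back to 0 once the queue drains). A userspace sketch of that pattern with a pthread mutex standing in for the queue spinlock; the structure names are illustrative:

#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-ins for the socket error queue. */
struct err_entry {
	struct err_entry *next;
	int errno_code;
};

struct err_queue {
	pthread_mutex_t lock;
	struct err_entry *head;
	int pending_err;              /* analogue of sk->sk_err */
};

static struct err_entry *dequeue_err(struct err_queue *q)
{
	struct err_entry *e;
	int err = 0;

	pthread_mutex_lock(&q->lock);
	e = q->head;
	if (e) {
		q->head = e->next;
		if (q->head)
			err = q->head->errno_code;  /* next pending error */
	}
	pthread_mutex_unlock(&q->lock);

	q->pending_err = err;         /* 0 once the queue is empty */
	return e;
}

int main(void)
{
	struct err_entry second = { NULL, 113 };
	struct err_entry first = { &second, 110 };
	struct err_queue q = { .head = &first, .pending_err = 110 };

	pthread_mutex_init(&q.lock, NULL);
	dequeue_err(&q);
	printf("pending after first dequeue: %d\n", q.pending_err);
	dequeue_err(&q);
	printf("pending after second dequeue: %d\n", q.pending_err);
	return 0;
}
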
+
+/**
+ * skb_clone_sk - create clone of skb, and take reference to socket
+ * @skb: the skb to clone
+ *
+ * This function creates a clone of a buffer that holds a reference on
+ * sk_refcnt. Buffers created via this function are meant to be
+ * returned using sock_queue_err_skb, or freed via kfree_skb.
+ *
+ * When passing buffers allocated with this function to sock_queue_err_skb
+ * it is necessary to wrap the call with sock_hold/sock_put in order to
+ * prevent the socket from being released prior to being enqueued on
+ * the sk_error_queue.
+ */
+struct sk_buff *skb_clone_sk(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct sk_buff *clone;
+
+ if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone) {
+ sock_put(sk);
+ return NULL;
}
- skb = skb_clone(orig_skb, GFP_ATOMIC);
- if (!skb)
- return;
+ clone->sk = sk;
+ clone->destructor = sock_efree;
+
+ return clone;
+}
+EXPORT_SYMBOL(skb_clone_sk);
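
skb_clone_sk() only proceeds if atomic_inc_not_zero(&sk->sk_refcnt) succeeds, i.e. it takes a socket reference only while the socket is still live. A userspace sketch of that idiom using a C11 compare-and-swap loop:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if at least one already exists, so an object that
 * is already being destroyed (refcount 0) is never resurrected. */
static bool get_ref_not_zero(atomic_int *refcnt)
{
	int old = atomic_load(refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;    /* reference taken */
		/* 'old' was reloaded by the failed CAS; retry */
	}
	return false;                   /* object already dying, do not touch */
}

int main(void)
{
	atomic_int live, dying;

	atomic_init(&live, 1);
	atomic_init(&dying, 0);
	printf("live:  %d\n", get_ref_not_zero(&live));
	printf("dying: %d\n", get_ref_not_zero(&dying));
	return 0;
}
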
+
+static void __skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct sock *sk,
+ int tstype)
+{
+ struct sock_exterr_skb *serr;
+ int err;
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
@@ -3537,6 +3588,42 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (err)
kfree_skb(skb);
}
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ struct sock *sk = skb->sk;
+
+ /* take a reference to prevent skb_orphan() from freeing the socket */
+ sock_hold(sk);
+
+ *skb_hwtstamps(skb) = *hwtstamps;
+ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
+
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
+{
+ struct sk_buff *skb;
+
+ if (!sk)
+ return;
+
+ if (hwtstamps)
+ *skb_hwtstamps(orig_skb) = *hwtstamps;
+ else
+ orig_skb->tstamp = ktime_get_real();
+
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ __skb_complete_tx_timestamp(skb, sk, tstype);
+}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
@@ -3561,9 +3648,14 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+ /* take a reference to prevent skb_orphan() from freeing the socket */
+ sock_hold(sk);
+
err = sock_queue_err_skb(sk, skb);
if (err)
kfree_skb(skb);
+
+ sock_put(sk);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
@@ -3864,7 +3956,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
return false;
if (len <= skb_tailroom(to)) {
- BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ if (len)
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
return true;
}
@@ -4029,3 +4122,81 @@ err_free:
return NULL;
}
EXPORT_SYMBOL(skb_vlan_untag);
+
+/**
+ * alloc_skb_with_frags - allocate skb with page frags
+ *
+ * @header_len: size of linear part
+ * @data_len: needed length in frags
+ * @max_page_order: max page order desired
+ * @errcode: pointer to error code if any
+ * @gfp_mask: allocation mask
+ *
+ * This can be used to allocate a paged skb, given a maximal order for frags.
+ */
+struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
+ unsigned long data_len,
+ int max_page_order,
+ int *errcode,
+ gfp_t gfp_mask)
+{
+ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+ unsigned long chunk;
+ struct sk_buff *skb;
+ struct page *page;
+ gfp_t gfp_head;
+ int i;
+
+ *errcode = -EMSGSIZE;
+ /* Note: this test could be relaxed if we manage to allocate
+ * high order pages...
+ */
+ if (npages > MAX_SKB_FRAGS)
+ return NULL;
+
+ gfp_head = gfp_mask;
+ if (gfp_head & __GFP_WAIT)
+ gfp_head |= __GFP_REPEAT;
+
+ *errcode = -ENOBUFS;
+ skb = alloc_skb(header_len, gfp_head);
+ if (!skb)
+ return NULL;
+
+ skb->truesize += npages << PAGE_SHIFT;
+
+ for (i = 0; npages > 0; i++) {
+ int order = max_page_order;
+
+ while (order) {
+ if (npages >= 1 << order) {
+ page = alloc_pages(gfp_mask |
+ __GFP_COMP |
+ __GFP_NOWARN |
+ __GFP_NORETRY,
+ order);
+ if (page)
+ goto fill_page;
+ /* Do not retry other high order allocations */
+ order = 1;
+ max_page_order = 0;
+ }
+ order--;
+ }
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto failure;
+fill_page:
+ chunk = min_t(unsigned long, data_len,
+ PAGE_SIZE << order);
+ skb_fill_page_desc(skb, i, page, 0, chunk);
+ data_len -= chunk;
+ npages -= 1 << order;
+ }
+ return skb;
+
+failure:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_skb_with_frags);
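
alloc_skb_with_frags() prefers high-order (large, contiguous) page allocations and, after the first high-order failure, stops retrying large orders and falls back to single pages. The userspace sketch below mimics that strategy; try_alloc_order() is a stand-in for alloc_pages() whose artificial failure above order 2 makes the fallback path visible:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE  4096UL
#define MAX_CHUNKS 16

/* Stand-in for alloc_pages(): pretend orders above 2 always fail,
 * as they might on a fragmented machine. */
static void *try_alloc_order(int order)
{
	return order > 2 ? NULL : malloc(PAGE_SIZE << order);
}

static int fill_chunks(void *chunks[], size_t sizes[],
		       size_t data_len, int max_order)
{
	size_t npages = (data_len + PAGE_SIZE - 1) / PAGE_SIZE;
	int i = 0;

	while (npages > 0 && i < MAX_CHUNKS) {
		void *p = NULL;
		int order = max_order;

		while (order > 0) {
			if (npages >= (1UL << order)) {
				p = try_alloc_order(order);
				if (p)
					break;
				/* First high-order failure: give up on
				 * high orders for the rest of the fill. */
				order = 1;
				max_order = 0;
			}
			order--;
		}
		if (!p) {
			order = 0;
			p = malloc(PAGE_SIZE);    /* order-0 fallback */
			if (!p)
				return -1;
		}
		chunks[i] = p;
		sizes[i] = PAGE_SIZE << order;
		npages -= 1UL << order;
		i++;
	}
	return i;
}

int main(void)
{
	void *chunks[MAX_CHUNKS];
	size_t sizes[MAX_CHUNKS];
	int n, i;

	/* Order-2 allocations succeed: a few big chunks. */
	n = fill_chunks(chunks, sizes, 8 * PAGE_SIZE, 2);
	for (i = 0; i < n; i++) {
		printf("run A chunk %d: %zu bytes\n", i, sizes[i]);
		free(chunks[i]);
	}

	/* Order-3 fails on the first try: falls back to single pages. */
	n = fill_chunks(chunks, sizes, 8 * PAGE_SIZE, 3);
	for (i = 0; i < n; i++) {
		printf("run B chunk %d: %zu bytes\n", i, sizes[i]);
		free(chunks[i]);
	}
	return 0;
}
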