From def8b4faff5ca349beafbbfeb2c51f3602a6ef3a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 28 Oct 2008 13:24:06 -0700 Subject: net: reduce structures when XFRM=n ifdef out * struct sk_buff::sp (pointer) * struct dst_entry::xfrm (pointer) * struct sock::sk_policy (2 pointers) Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4e22e3a35359..cdfe473181af 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -489,7 +489,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->network_header = old->network_header; new->mac_header = old->mac_header; new->dst = dst_clone(old->dst); -#ifdef CONFIG_INET +#ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif memcpy(new->cb, old->cb, sizeof(old->cb)); -- cgit v1.2.3 From 8b30b1fe368ab03049435884c11c5c50e4c4ef0b Mon Sep 17 00:00:00 2001 From: Sujith Date: Fri, 24 Oct 2008 09:55:27 +0530 Subject: mac80211: Re-enable aggregation Wireless HW without any dedicated queues for aggregation do not need the ampdu_queues mechanism present right now in mac80211. Since mac80211 is still incomplete wrt TX MQ changes, do not allow aggregation sessions for drivers that set ampdu_queues. This is only an interim hack until Intel fixes the requeue issue. Signed-off-by: Sujith Signed-off-by: Luis Rodriguez Signed-off-by: John W. Linville --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cdfe473181af..c4c8a33f3418 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -544,6 +544,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(truesize); #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) C(do_not_encrypt); + C(requeue); #endif atomic_set(&n->users, 1); -- cgit v1.2.3 From 832d11c5cd076abc0aa1eaf7be96c81d1a59ce41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:20:15 -0800 Subject: tcp: Try to restore large SKBs while SACK processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During SACK processing, most of the benefits of TSO are eaten by the SACK blocks that one-by-one fragment SKBs to MSS sized chunks. Then we're in problems when cleanup work for them has to be done when a large cumulative ACK comes. Try to return back to pre-split state already while more and more SACK info gets discovered by combining newly discovered SACK areas with the previous skb if that's SACKed as well. This approach has a number of benefits: 1) The processing overhead is spread more equally over the RTT 2) Write queue has less skbs to process (affect everything which has to walk in the queue past the sacked areas) 3) Write queue is consistent whole the time, so no other parts of TCP has to be aware of this (this was not the case with some other approach that was, well, quite intrusive all around). 4) Clean_rtx_queue can release most of the pages using single put_page instead of previous PAGE_SIZE/mss+1 calls In case a hole is fully filled by the new SACK block, we attempt to combine the next skb too which allows construction of skbs that are even larger than what tso split them to and it handles hole per on every nth patterns that often occur during slow start overshoot pretty nicely. Though this to be really useful also a retransmission would have to get lost since cumulative ACKs advance one hole at a time in the most typical case. TODO: handle upwards only merging. That should be rather easy when segment is fully sacked but I'm leaving that as future work item (it won't make very large difference anyway since this current approach already covers quite a lot of normal cases). I was earlier thinking of some sophisticated way of tracking timestamps of the first and the last segment but later on realized that it won't be that necessary at all to store the timestamp of the last segment. The cases that can occur are basically either: 1) ambiguous => no sensible measurement can be taken anyway 2) non-ambiguous is due to reordering => having the timestamp of the last segment there is just skewing things more off than does some good since the ack got triggered by one of the holes (besides some substle issues that would make determining right hole/skb even harder problem). Anyway, it has nothing to do with this change then. I choose to route some abnormal looking cases with goto noop, some could be handled differently (eg., by stopping the walking at that skb but again). In general, they either shouldn't happen at all or are rare enough to make no difference in practice. In theory this change (as whole) could cause some macroscale regression (global) because of cache misses that are taken over the round-trip time but it gets very likely better because of much less (local) cache misses per other write queue walkers and the big recovery clearing cumulative ack. Worth to note that these benefits would be very easy to get also without TSO/GSO being on as long as the data is in pages so that we can merge them. Currently I won't let that happen because DSACK splitting at fragment that would mess up pcounts due to sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets avoided, we have some conditions that can be made less strict. TODO: I will probably have to convert the excessive pointer passing to struct sacktag_state... :-) My testing revealed that considerable amount of skbs couldn't be shifted because they were cloned (most likely still awaiting tx reclaim)... [The rest is considering future work instead since I got repeatably EFAULT to tcpdump's recvfrom when I added pskb_expand_head to deal with clones, so I separated that into another, later patch] ...To counter that, I gave up on the fifth advantage: 5) When growing previous SACK block, less allocs for new skbs are done, basically a new alloc is needed only when new hole is detected and when the previous skb runs out of frags space ...which now only happens of if reclaim is fast enough to dispose the clone before the SACK block comes in (the window is RTT long), otherwise we'll have to alloc some. With clones being handled I got these numbers (will be somewhat worse without that), taken with fine-grained mibs: TCPSackShifted 398 TCPSackMerged 877 TCPSackShiftFallback 320 TCPSACKCOLLAPSEFALLBACKGSO 0 TCPSACKCOLLAPSEFALLBACKSKBBITS 0 TCPSACKCOLLAPSEFALLBACKSKBDATA 0 TCPSACKCOLLAPSEFALLBACKBELOW 0 TCPSACKCOLLAPSEFALLBACKFIRST 1 TCPSACKCOLLAPSEFALLBACKPREVBITS 318 TCPSACKCOLLAPSEFALLBACKMSS 1 TCPSACKCOLLAPSEFALLBACKNOHEAD 0 TCPSACKCOLLAPSEFALLBACKSHIFT 0 TCPSACKCOLLAPSENOOPSEQ 0 TCPSACKCOLLAPSENOOPSMALLPCOUNT 0 TCPSACKCOLLAPSENOOPSMALLLEN 0 TCPSACKCOLLAPSEHOLE 12 Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/core/skbuff.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 267185a848f6..844b8abeb18c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2018,6 +2018,146 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_split_no_header(skb, skb1, len, pos); } +/* Shifting from/to a cloned skb is a no-go. + * + * TODO: handle cloned skbs by using pskb_expand_head() + */ +static int skb_prepare_for_shift(struct sk_buff *skb) +{ + return skb_cloned(skb); +} + +/** + * skb_shift - Shifts paged data partially from skb to another + * @tgt: buffer into which tail data gets added + * @skb: buffer from which the paged data comes from + * @shiftlen: shift up to this many bytes + * + * Attempts to shift up to shiftlen worth of bytes, which may be less than + * the length of the skb, from tgt to skb. Returns number bytes shifted. + * It's up to caller to free skb if everything was shifted. + * + * If @tgt runs out of frags, the whole operation is aborted. + * + * Skb cannot include anything else but paged data while tgt is allowed + * to have non-paged data as well. + * + * TODO: full sized shift could be optimized but that would need + * specialized skb free'er to handle frags without up-to-date nr_frags. + */ +int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) +{ + int from, to, merge, todo; + struct skb_frag_struct *fragfrom, *fragto; + + BUG_ON(shiftlen > skb->len); + BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + todo = shiftlen; + from = 0; + to = skb_shinfo(tgt)->nr_frags; + fragfrom = &skb_shinfo(skb)->frags[from]; + + /* Actual merge is delayed until the point when we know we can + * commit all, so that we don't have to undo partial changes + */ + if (!to || + !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) { + merge = -1; + } else { + merge = to - 1; + + todo -= fragfrom->size; + if (todo < 0) { + if (skb_prepare_for_shift(skb) || + skb_prepare_for_shift(tgt)) + return 0; + + fragto = &skb_shinfo(tgt)->frags[merge]; + + fragto->size += shiftlen; + fragfrom->size -= shiftlen; + fragfrom->page_offset += shiftlen; + + goto onlymerged; + } + + from++; + } + + /* Skip full, not-fitting skb to avoid expensive operations */ + if ((shiftlen == skb->len) && + (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) + return 0; + + if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) + return 0; + + while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { + if (to == MAX_SKB_FRAGS) + return 0; + + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[to]; + + if (todo >= fragfrom->size) { + *fragto = *fragfrom; + todo -= fragfrom->size; + from++; + to++; + + } else { + get_page(fragfrom->page); + fragto->page = fragfrom->page; + fragto->page_offset = fragfrom->page_offset; + fragto->size = todo; + + fragfrom->page_offset += todo; + fragfrom->size -= todo; + todo = 0; + + to++; + break; + } + } + + /* Ready to "commit" this state change to tgt */ + skb_shinfo(tgt)->nr_frags = to; + + if (merge >= 0) { + fragfrom = &skb_shinfo(skb)->frags[0]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + fragto->size += fragfrom->size; + put_page(fragfrom->page); + } + + /* Reposition in the original skb */ + to = 0; + while (from < skb_shinfo(skb)->nr_frags) + skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; + skb_shinfo(skb)->nr_frags = to; + + BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); + +onlymerged: + /* Most likely the tgt won't ever need its checksum anymore, skb on + * the other hand might need it if it needs to be resent + */ + tgt->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; + + /* Yak, is it really working this way? Some helper please? */ + skb->len -= shiftlen; + skb->data_len -= shiftlen; + skb->truesize -= shiftlen; + tgt->len += shiftlen; + tgt->data_len += shiftlen; + tgt->truesize += shiftlen; + + return shiftlen; +} + /** * skb_prepare_seq_read - Prepare a sequential read of skb data * @skb: the buffer to read -- cgit v1.2.3 From 0ace285605314c54339710484b54814945a60df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:30:21 -0800 Subject: tcp: handle shift/merge of cloned skbs too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This caused me to get repeatably: tcpdump: pcap_loop: recvfrom: Bad address Happens occassionally when I tcpdump my for-looped test xfers: while [ : ]; do echo -n "$(date '+%s.%N') "; ./sendfile; sleep 20; done Rest of the relevant commands: ethtool -K eth0 tso off tc qdisc add dev eth0 root netem drop 4% tcpdump -n -s0 -i eth0 -w sacklog.all Running net-next under kvm, connection goes to the same host (basically just out of kvm). The connection itself works ok and data gets sent without corruption even with a large number of tests while tcpdump fails usually within less than 5 tests. Whether it only happens because of this change or not, I don't know for sure but it's the only thing with which I've seen that error. The non-cloned variant works w/o it for much longer time. I'm yet to debug where the error actually comes from. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 844b8abeb18c..57555a4525da 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2018,13 +2018,10 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_split_no_header(skb, skb1, len, pos); } -/* Shifting from/to a cloned skb is a no-go. - * - * TODO: handle cloned skbs by using pskb_expand_head() - */ +/* Shifting from/to a cloned skb is a no-go. */ static int skb_prepare_for_shift(struct sk_buff *skb) { - return skb_cloned(skb); + return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } /** -- cgit v1.2.3 From 9f782db3f5ceee9aa8de6f853969fbec1b8c6e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 25 Nov 2008 13:57:01 -0800 Subject: tcp: skb_shift cannot cache frag ptrs past pskb_expand_head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since pskb_expand_head creates copy of the shared area we cannot keep any frag ptr past de-cloning. This fixes the tcpdump recvfrom -EFAULT problem. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 57555a4525da..e03d77d4c1c9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2018,7 +2018,10 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_split_no_header(skb, skb1, len, pos); } -/* Shifting from/to a cloned skb is a no-go. */ +/* Shifting from/to a cloned skb is a no-go. + * + * Caller cannot keep skb_shinfo related pointers past calling here! + */ static int skb_prepare_for_shift(struct sk_buff *skb) { return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); @@ -2070,6 +2073,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) skb_prepare_for_shift(tgt)) return 0; + /* All previous frag pointers might be stale! */ + fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[merge]; fragto->size += shiftlen; -- cgit v1.2.3 From 89319d3801d1d3ac29c7df1f067038986f267d29 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:26:06 -0800 Subject: net: Add frag_list support to skb_segment This patch adds limited support for handling frag_list packets in skb_segment. The intention is to support GRO (Generic Receive Offload) packets which will be constructed by chaining normal packets using frag_list. As such we require all frag_list members terminate on exact MSS boundaries. This is checked using BUG_ON. As there should only be one producer in the kernel of such packets, namely GRO, this requirement should not be difficult to maintain. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 73 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 14 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b1f628741f4c..18e224af05a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2428,6 +2428,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; + struct sk_buff *fskb = skb_shinfo(skb)->frag_list; unsigned int mss = skb_shinfo(skb)->gso_size; unsigned int doffset = skb->data - skb_mac_header(skb); unsigned int offset = doffset; @@ -2447,7 +2448,6 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) struct sk_buff *nskb; skb_frag_t *frag; int hsize; - int k; int size; len = skb->len - offset; @@ -2460,9 +2460,36 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) if (hsize > len || !sg) hsize = len; - nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC); - if (unlikely(!nskb)) - goto err; + if (!hsize && i >= nfrags) { + BUG_ON(fskb->len != len); + + pos += len; + nskb = skb_clone(fskb, GFP_ATOMIC); + fskb = fskb->next; + + if (unlikely(!nskb)) + goto err; + + hsize = skb_end_pointer(nskb) - nskb->head; + if (skb_cow_head(nskb, doffset + headroom)) { + kfree_skb(nskb); + goto err; + } + + nskb->truesize += skb_end_pointer(nskb) - nskb->head - + hsize; + skb_release_head_state(nskb); + __skb_push(nskb, doffset); + } else { + nskb = alloc_skb(hsize + doffset + headroom, + GFP_ATOMIC); + + if (unlikely(!nskb)) + goto err; + + skb_reserve(nskb, headroom); + __skb_put(nskb, doffset); + } if (segs) tail->next = nskb; @@ -2473,13 +2500,15 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) __copy_skb_header(nskb, skb); nskb->mac_len = skb->mac_len; - skb_reserve(nskb, headroom); skb_reset_mac_header(nskb); skb_set_network_header(nskb, skb->mac_len); nskb->transport_header = (nskb->network_header + skb_network_header_len(skb)); - skb_copy_from_linear_data(skb, skb_put(nskb, doffset), - doffset); + skb_copy_from_linear_data(skb, nskb->data, doffset); + + if (pos >= offset + len) + continue; + if (!sg) { nskb->ip_summed = CHECKSUM_NONE; nskb->csum = skb_copy_and_csum_bits(skb, offset, @@ -2489,14 +2518,11 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) } frag = skb_shinfo(nskb)->frags; - k = 0; skb_copy_from_linear_data_offset(skb, offset, skb_put(nskb, hsize), hsize); - while (pos < offset + len) { - BUG_ON(i >= nfrags); - + while (pos < offset + len && i < nfrags) { *frag = skb_shinfo(skb)->frags[i]; get_page(frag->page); size = frag->size; @@ -2506,20 +2532,39 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) frag->size -= offset - pos; } - k++; + skb_shinfo(nskb)->nr_frags++; if (pos + size <= offset + len) { i++; pos += size; } else { frag->size -= pos + size - (offset + len); - break; + goto skip_fraglist; } frag++; } - skb_shinfo(nskb)->nr_frags = k; + if (pos < offset + len) { + struct sk_buff *fskb2 = fskb; + + BUG_ON(pos + fskb->len != offset + len); + + pos += fskb->len; + fskb = fskb->next; + + if (fskb2->next) { + fskb2 = skb_clone(fskb2, GFP_ATOMIC); + if (!fskb2) + goto err; + } else + skb_get(fskb2); + + BUG_ON(skb_shinfo(nskb)->frag_list); + skb_shinfo(nskb)->frag_list = fskb2; + } + +skip_fraglist: nskb->data_len = len - hsize; nskb->len += nskb->data_len; nskb->truesize += nskb->data_len; -- cgit v1.2.3 From 71d93b39e52e92aea35f1058d957cf12250d0b75 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:42:33 -0800 Subject: net: Add skb_gro_receive This patch adds the helper skb_gro_receive to merge packets for GRO. The current method is to allocate a new header skb and then chain the original packets to its frag_list. This is done to make it easier to integrate into the existing GSO framework. In future as GSO is moved into the drivers, we can undo this and simply chain the original packets together. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 18e224af05a6..b8d0abb26433 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2582,6 +2582,65 @@ err: EXPORT_SYMBOL_GPL(skb_segment); +int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct sk_buff *p = *head; + struct sk_buff *nskb; + unsigned int headroom; + unsigned int hlen = p->data - skb_mac_header(p); + + if (hlen + p->len + skb->len >= 65536) + return -E2BIG; + + if (skb_shinfo(p)->frag_list) + goto merge; + + headroom = skb_headroom(p); + nskb = netdev_alloc_skb(p->dev, headroom); + if (unlikely(!nskb)) + return -ENOMEM; + + __copy_skb_header(nskb, p); + nskb->mac_len = p->mac_len; + + skb_reserve(nskb, headroom); + + skb_set_mac_header(nskb, -hlen); + skb_set_network_header(nskb, skb_network_offset(p)); + skb_set_transport_header(nskb, skb_transport_offset(p)); + + memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen); + + *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); + skb_shinfo(nskb)->frag_list = p; + skb_header_release(p); + nskb->prev = p; + + nskb->data_len += p->len; + nskb->truesize += p->len; + nskb->len += p->len; + + *head = nskb; + nskb->next = p->next; + p->next = NULL; + + p = nskb; + +merge: + NAPI_GRO_CB(p)->count++; + p->prev->next = skb; + p->prev = skb; + skb_header_release(skb); + + p->data_len += skb->len; + p->truesize += skb->len; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} +EXPORT_SYMBOL_GPL(skb_gro_receive); + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", -- cgit v1.2.3