summaryrefslogtreecommitdiff
path: root/net/sched
diff options
context:
space:
mode:
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/act_mirred.c2
-rw-r--r--net/sched/sch_cbq.c2
-rw-r--r--net/sched/sch_drr.c2
-rw-r--r--net/sched/sch_generic.c44
-rw-r--r--net/sched/sch_hfsc.c2
-rw-r--r--net/sched/sch_htb.c261
-rw-r--r--net/sched/sch_netem.c111
-rw-r--r--net/sched/sch_qfq.c214
-rw-r--r--net/sched/sch_tbf.c47
9 files changed, 416 insertions, 269 deletions
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5d676edc22a6..977c10e0631b 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -243,7 +243,7 @@ nla_put_failure:
static int mirred_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tcf_mirred *m;
if (event == NETDEV_UNREGISTER)
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 1bc210ffcba2..71a568862557 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -130,7 +130,7 @@ struct cbq_class {
psched_time_t penalized;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto *filter_list;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 759b308d1a8d..8302717ea303 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
struct list_head alist;
struct Qdisc *qdisc;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 20224086cc28..4626cef4b76e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -901,37 +901,33 @@ void dev_shutdown(struct net_device *dev)
void psched_ratecfg_precompute(struct psched_ratecfg *r,
const struct tc_ratespec *conf)
{
- u64 factor;
- u64 mult;
- int shift;
-
memset(r, 0, sizeof(*r));
r->overhead = conf->overhead;
- r->rate_bps = (u64)conf->rate << 3;
+ r->rate_bytes_ps = conf->rate;
r->mult = 1;
/*
- * Calibrate mult, shift so that token counting is accurate
- * for smallest packet size (64 bytes). Token (time in ns) is
- * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will
- * work as long as the smallest packet transfer time can be
- * accurately represented in nanosec.
+ * The deal here is to replace a divide by a reciprocal one
+ * in fast path (a reciprocal divide is a multiply and a shift)
+ *
+ * Normal formula would be :
+ * time_in_ns = (NSEC_PER_SEC * len) / rate_bps
+ *
+ * We compute mult/shift to use instead :
+ * time_in_ns = (len * mult) >> shift;
+ *
+ * We try to get the highest possible mult value for accuracy,
+ * but have to make sure no overflows will ever happen.
*/
- if (r->rate_bps > 0) {
- /*
- * Higher shift gives better accuracy. Find the largest
- * shift such that mult fits in 32 bits.
- */
- for (shift = 0; shift < 16; shift++) {
- r->shift = shift;
- factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
- mult = div64_u64(factor, r->rate_bps);
- if (mult > UINT_MAX)
+ if (r->rate_bytes_ps > 0) {
+ u64 factor = NSEC_PER_SEC;
+
+ for (;;) {
+ r->mult = div64_u64(factor, r->rate_bytes_ps);
+ if (r->mult & (1U << 31) || factor & (1ULL << 63))
break;
+ factor <<= 1;
+ r->shift++;
}
-
- r->shift = shift - 1;
- factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
- r->mult = div64_u64(factor, r->rate_bps);
}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 9facea03faeb..c4075610502c 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
unsigned int level; /* class level in hierarchy */
struct tcf_proto *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index adaedd79389c..c2124ea29f45 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -65,6 +65,10 @@ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis f
module_param (htb_hysteresis, int, 0640);
MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
+static int htb_rate_est = 0; /* htb classes have a default rate estimator */
+module_param(htb_rate_est, int, 0640);
+MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
+
/* used internaly to keep status of single class */
enum htb_cmode {
HTB_CANT_SEND, /* class can't send and can't borrow */
@@ -72,95 +76,105 @@ enum htb_cmode {
HTB_CAN_SEND /* class can send */
};
-/* interior & leaf nodes; props specific to leaves are marked L: */
+struct htb_prio {
+ union {
+ struct rb_root row;
+ struct rb_root feed;
+ };
+ struct rb_node *ptr;
+ /* When class changes from state 1->2 and disconnects from
+ * parent's feed then we lost ptr value and start from the
+ * first child again. Here we store classid of the
+ * last valid ptr (used when ptr is NULL).
+ */
+ u32 last_ptr_id;
+};
+
+/* interior & leaf nodes; props specific to leaves are marked L:
+ * To reduce false sharing, place mostly read fields at beginning,
+ * and mostly written ones at the end.
+ */
struct htb_class {
struct Qdisc_class_common common;
- /* general class parameters */
- struct gnet_stats_basic_packed bstats;
- struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
- struct tc_htb_xstats xstats; /* our special stats */
- int refcnt; /* usage count of this class */
+ struct psched_ratecfg rate;
+ struct psched_ratecfg ceil;
+ s64 buffer, cbuffer;/* token bucket depth/rate */
+ s64 mbuffer; /* max wait time */
+ int prio; /* these two are used only by leaves... */
+ int quantum; /* but stored for parent-to-leaf return */
- /* topology */
- int level; /* our level (see above) */
- unsigned int children;
- struct htb_class *parent; /* parent class */
+ struct tcf_proto *filter_list; /* class attached filters */
+ int filter_cnt;
+ int refcnt; /* usage count of this class */
- int prio; /* these two are used only by leaves... */
- int quantum; /* but stored for parent-to-leaf return */
+ int level; /* our level (see above) */
+ unsigned int children;
+ struct htb_class *parent; /* parent class */
+
+ struct gnet_stats_rate_est64 rate_est;
+
+ /*
+ * Written often fields
+ */
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct tc_htb_xstats xstats; /* our special stats */
+
+ /* token bucket parameters */
+ s64 tokens, ctokens;/* current number of tokens */
+ s64 t_c; /* checkpoint time */
union {
struct htb_class_leaf {
- struct Qdisc *q;
- int deficit[TC_HTB_MAXDEPTH];
struct list_head drop_list;
+ int deficit[TC_HTB_MAXDEPTH];
+ struct Qdisc *q;
} leaf;
struct htb_class_inner {
- struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
- struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
- /* When class changes from state 1->2 and disconnects from
- * parent's feed then we lost ptr value and start from the
- * first child again. Here we store classid of the
- * last valid ptr (used when ptr is NULL).
- */
- u32 last_ptr_id[TC_HTB_NUMPRIO];
+ struct htb_prio clprio[TC_HTB_NUMPRIO];
} inner;
} un;
- struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
- struct rb_node pq_node; /* node for event queue */
- s64 pq_key;
+ s64 pq_key;
- int prio_activity; /* for which prios are we active */
- enum htb_cmode cmode; /* current mode of the class */
-
- /* class attached filters */
- struct tcf_proto *filter_list;
- int filter_cnt;
+ int prio_activity; /* for which prios are we active */
+ enum htb_cmode cmode; /* current mode of the class */
+ struct rb_node pq_node; /* node for event queue */
+ struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
+};
- /* token bucket parameters */
- struct psched_ratecfg rate;
- struct psched_ratecfg ceil;
- s64 buffer, cbuffer; /* token bucket depth/rate */
- s64 mbuffer; /* max wait time */
- s64 tokens, ctokens; /* current number of tokens */
- s64 t_c; /* checkpoint time */
+struct htb_level {
+ struct rb_root wait_pq;
+ struct htb_prio hprio[TC_HTB_NUMPRIO];
};
struct htb_sched {
struct Qdisc_class_hash clhash;
- struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
-
- /* self list - roots of self generating tree */
- struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
- int row_mask[TC_HTB_MAXDEPTH];
- struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
- u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
+ int defcls; /* class where unclassified flows go to */
+ int rate2quantum; /* quant = rate / rate2quantum */
- /* self wait list - roots of wait PQs per row */
- struct rb_root wait_pq[TC_HTB_MAXDEPTH];
+ /* filters for qdisc itself */
+ struct tcf_proto *filter_list;
- /* time of nearest event per level (row) */
- s64 near_ev_cache[TC_HTB_MAXDEPTH];
+#define HTB_WARN_TOOMANYEVENTS 0x1
+ unsigned int warned; /* only one warning */
+ int direct_qlen;
+ struct work_struct work;
- int defcls; /* class where unclassified flows go to */
+ /* non shaped skbs; let them go directly thru */
+ struct sk_buff_head direct_queue;
+ long direct_pkts;
- /* filters for qdisc itself */
- struct tcf_proto *filter_list;
+ struct qdisc_watchdog watchdog;
- int rate2quantum; /* quant = rate / rate2quantum */
- s64 now; /* cached dequeue time */
- struct qdisc_watchdog watchdog;
+ s64 now; /* cached dequeue time */
+ struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
- /* non shaped skbs; let them go directly thru */
- struct sk_buff_head direct_queue;
- int direct_qlen; /* max qlen of above */
+ /* time of nearest event per level (row) */
+ s64 near_ev_cache[TC_HTB_MAXDEPTH];
- long direct_pkts;
+ int row_mask[TC_HTB_MAXDEPTH];
-#define HTB_WARN_TOOMANYEVENTS 0x1
- unsigned int warned; /* only one warning */
- struct work_struct work;
+ struct htb_level hlevel[TC_HTB_MAXDEPTH];
};
/* find class in global hash table using given handle */
@@ -276,7 +290,7 @@ static void htb_add_to_id_tree(struct rb_root *root,
static void htb_add_to_wait_tree(struct htb_sched *q,
struct htb_class *cl, s64 delay)
{
- struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
+ struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
cl->pq_key = q->now + delay;
if (cl->pq_key == q->now)
@@ -296,7 +310,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q,
p = &parent->rb_left;
}
rb_link_node(&cl->pq_node, parent, p);
- rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
+ rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
}
/**
@@ -323,7 +337,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q,
while (mask) {
int prio = ffz(~mask);
mask &= ~(1 << prio);
- htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
+ htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
}
}
@@ -349,16 +363,18 @@ static inline void htb_remove_class_from_row(struct htb_sched *q,
struct htb_class *cl, int mask)
{
int m = 0;
+ struct htb_level *hlevel = &q->hlevel[cl->level];
while (mask) {
int prio = ffz(~mask);
+ struct htb_prio *hprio = &hlevel->hprio[prio];
mask &= ~(1 << prio);
- if (q->ptr[cl->level][prio] == cl->node + prio)
- htb_next_rb_node(q->ptr[cl->level] + prio);
+ if (hprio->ptr == cl->node + prio)
+ htb_next_rb_node(&hprio->ptr);
- htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
- if (!q->row[cl->level][prio].rb_node)
+ htb_safe_rb_erase(cl->node + prio, &hprio->row);
+ if (!hprio->row.rb_node)
m |= 1 << prio;
}
q->row_mask[cl->level] &= ~m;
@@ -382,13 +398,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.feed[prio].rb_node)
+ if (p->un.inner.clprio[prio].feed.rb_node)
/* parent already has its feed in use so that
* reset bit in mask as parent is already ok
*/
mask &= ~(1 << prio);
- htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
+ htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
}
p->prio_activity |= mask;
cl = p;
@@ -418,18 +434,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.ptr[prio] == cl->node + prio) {
+ if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
/* we are removing child which is pointed to from
* parent feed - forget the pointer but remember
* classid
*/
- p->un.inner.last_ptr_id[prio] = cl->common.classid;
- p->un.inner.ptr[prio] = NULL;
+ p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
+ p->un.inner.clprio[prio].ptr = NULL;
}
- htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
+ htb_safe_rb_erase(cl->node + prio,
+ &p->un.inner.clprio[prio].feed);
- if (!p->un.inner.feed[prio].rb_node)
+ if (!p->un.inner.clprio[prio].feed.rb_node)
mask |= 1 << prio;
}
@@ -644,7 +661,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
htb_change_class_mode(q, cl, &diff);
if (old_mode != cl->cmode) {
if (old_mode != HTB_CAN_SEND)
- htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+ htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
if (cl->cmode != HTB_CAN_SEND)
htb_add_to_wait_tree(q, cl, diff);
}
@@ -664,7 +681,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
* next pending event (0 for no event in pq, q->now for too many events).
* Note: Applied are events whose have cl->pq_key <= q->now.
*/
-static s64 htb_do_events(struct htb_sched *q, int level,
+static s64 htb_do_events(struct htb_sched *q, const int level,
unsigned long start)
{
/* don't run for longer than 2 jiffies; 2 is used instead of
@@ -672,10 +689,12 @@ static s64 htb_do_events(struct htb_sched *q, int level,
* too soon
*/
unsigned long stop_at = start + 2;
+ struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
+
while (time_before(jiffies, stop_at)) {
struct htb_class *cl;
s64 diff;
- struct rb_node *p = rb_first(&q->wait_pq[level]);
+ struct rb_node *p = rb_first(wait_pq);
if (!p)
return 0;
@@ -684,7 +703,7 @@ static s64 htb_do_events(struct htb_sched *q, int level,
if (cl->pq_key > q->now)
return cl->pq_key;
- htb_safe_rb_erase(p, q->wait_pq + level);
+ htb_safe_rb_erase(p, wait_pq);
diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
htb_change_class_mode(q, cl, &diff);
if (cl->cmode != HTB_CAN_SEND)
@@ -728,8 +747,7 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
*
* Find leaf where current feed pointers points to.
*/
-static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
- struct rb_node **pptr, u32 * pid)
+static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
{
int i;
struct {
@@ -738,10 +756,10 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
u32 *pid;
} stk[TC_HTB_MAXDEPTH], *sp = stk;
- BUG_ON(!tree->rb_node);
- sp->root = tree->rb_node;
- sp->pptr = pptr;
- sp->pid = pid;
+ BUG_ON(!hprio->row.rb_node);
+ sp->root = hprio->row.rb_node;
+ sp->pptr = &hprio->ptr;
+ sp->pid = &hprio->last_ptr_id;
for (i = 0; i < 65535; i++) {
if (!*sp->pptr && *sp->pid) {
@@ -768,12 +786,15 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
}
} else {
struct htb_class *cl;
+ struct htb_prio *clp;
+
cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
if (!cl->level)
return cl;
- (++sp)->root = cl->un.inner.feed[prio].rb_node;
- sp->pptr = cl->un.inner.ptr + prio;
- sp->pid = cl->un.inner.last_ptr_id + prio;
+ clp = &cl->un.inner.clprio[prio];
+ (++sp)->root = clp->feed.rb_node;
+ sp->pptr = &clp->ptr;
+ sp->pid = &clp->last_ptr_id;
}
}
WARN_ON(1);
@@ -783,15 +804,16 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
/* dequeues packet at given priority and level; call only if
* you are sure that there is active class at prio/level
*/
-static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
- int level)
+static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
+ const int level)
{
struct sk_buff *skb = NULL;
struct htb_class *cl, *start;
+ struct htb_level *hlevel = &q->hlevel[level];
+ struct htb_prio *hprio = &hlevel->hprio[prio];
+
/* look initial class up in the row */
- start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
- q->ptr[level] + prio,
- q->last_ptr_id[level] + prio);
+ start = cl = htb_lookup_leaf(hprio, prio);
do {
next:
@@ -811,9 +833,7 @@ next:
if ((q->row_mask[level] & (1 << prio)) == 0)
return NULL;
- next = htb_lookup_leaf(q->row[level] + prio,
- prio, q->ptr[level] + prio,
- q->last_ptr_id[level] + prio);
+ next = htb_lookup_leaf(hprio, prio);
if (cl == start) /* fix start if we just deleted it */
start = next;
@@ -826,11 +846,9 @@ next:
break;
qdisc_warn_nonwc("htb", cl->un.leaf.q);
- htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
- ptr[0]) + prio);
- cl = htb_lookup_leaf(q->row[level] + prio, prio,
- q->ptr[level] + prio,
- q->last_ptr_id[level] + prio);
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
+ &q->hlevel[0].hprio[prio].ptr);
+ cl = htb_lookup_leaf(hprio, prio);
} while (cl != start);
@@ -839,8 +857,8 @@ next:
cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
if (cl->un.leaf.deficit[level] < 0) {
cl->un.leaf.deficit[level] += cl->quantum;
- htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
- ptr[0]) + prio);
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
+ &q->hlevel[0].hprio[prio].ptr);
}
/* this used to be after charge_class but this constelation
* gives us slightly better performance
@@ -880,15 +898,14 @@ ok:
for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
/* common case optimization - skip event handler quickly */
int m;
- s64 event;
+ s64 event = q->near_ev_cache[level];
- if (q->now >= q->near_ev_cache[level]) {
+ if (q->now >= event) {
event = htb_do_events(q, level, start_at);
if (!event)
event = q->now + NSEC_PER_SEC;
q->near_ev_cache[level] = event;
- } else
- event = q->near_ev_cache[level];
+ }
if (next_event > event)
next_event = event;
@@ -968,10 +985,8 @@ static void htb_reset(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
__skb_queue_purge(&q->direct_queue);
sch->q.qlen = 0;
- memset(q->row, 0, sizeof(q->row));
+ memset(q->hlevel, 0, sizeof(q->hlevel));
memset(q->row_mask, 0, sizeof(q->row_mask));
- memset(q->wait_pq, 0, sizeof(q->wait_pq));
- memset(q->ptr, 0, sizeof(q->ptr));
for (i = 0; i < TC_HTB_NUMPRIO; i++)
INIT_LIST_HEAD(q->drops + i);
}
@@ -1192,7 +1207,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
if (parent->cmode != HTB_CAN_SEND)
- htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
+ htb_safe_rb_erase(&parent->pq_node,
+ &q->hlevel[parent->level].wait_pq);
parent->level = 0;
memset(&parent->un.inner, 0, sizeof(parent->un.inner));
@@ -1281,7 +1297,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
htb_deactivate(q, cl);
if (cl->cmode != HTB_CAN_SEND)
- htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+ htb_safe_rb_erase(&cl->pq_node,
+ &q->hlevel[cl->level].wait_pq);
if (last_child)
htb_parent_to_leaf(q, cl, new_q);
@@ -1366,12 +1383,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (!cl)
goto failure;
- err = gen_new_estimator(&cl->bstats, &cl->rate_est,
- qdisc_root_sleeping_lock(sch),
- tca[TCA_RATE] ? : &est.nla);
- if (err) {
- kfree(cl);
- goto failure;
+ if (htb_rate_est || tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE] ? : &est.nla);
+ if (err) {
+ kfree(cl);
+ goto failure;
+ }
}
cl->refcnt = 1;
@@ -1401,7 +1420,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* remove from evt list because of level change */
if (parent->cmode != HTB_CAN_SEND) {
- htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
+ htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
parent->cmode = HTB_CAN_SEND;
}
parent->level = (parent->parent ? parent->parent->level
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 3d2acc7a9c80..82f6016d89ab 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -23,6 +23,7 @@
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
@@ -68,7 +69,8 @@
*/
struct netem_sched_data {
- /* internal t(ime)fifo qdisc uses sch->q and sch->limit */
+ /* internal t(ime)fifo qdisc uses t_root and sch->limit */
+ struct rb_root t_root;
/* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc *qdisc;
@@ -128,10 +130,35 @@ struct netem_sched_data {
*/
struct netem_skb_cb {
psched_time_t time_to_send;
+ ktime_t tstamp_save;
};
+/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
+ * to hold a rb_node structure.
+ *
+ * If struct sk_buff layout is changed, the following checks will complain.
+ */
+static struct rb_node *netem_rb_node(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
+ BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
+ offsetof(struct sk_buff, next) + sizeof(skb->next));
+ BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
+ offsetof(struct sk_buff, prev) + sizeof(skb->prev));
+ BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
+ sizeof(skb->prev) +
+ sizeof(skb->tstamp));
+ return (struct rb_node *)&skb->next;
+}
+
+static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+{
+ return (struct sk_buff *)rb;
+}
+
static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
+ /* we assume we can use skb next/prev/tstamp as storage for rb_node */
qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}
@@ -333,20 +360,23 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
- struct sk_buff_head *list = &sch->q;
+ struct netem_sched_data *q = qdisc_priv(sch);
psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
- struct sk_buff *skb = skb_peek_tail(list);
+ struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
- /* Optimize for add at tail */
- if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
- return __skb_queue_tail(list, nskb);
+ while (*p) {
+ struct sk_buff *skb;
- skb_queue_reverse_walk(list, skb) {
+ parent = *p;
+ skb = netem_rb_to_skb(parent);
if (tnext >= netem_skb_cb(skb)->time_to_send)
- break;
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
}
-
- __skb_queue_after(list, skb, nskb);
+ rb_link_node(netem_rb_node(nskb), parent, p);
+ rb_insert_color(netem_rb_node(nskb), &q->t_root);
+ sch->q.qlen++;
}
/*
@@ -436,23 +466,28 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
now = psched_get_time();
if (q->rate) {
- struct sk_buff_head *list = &sch->q;
+ struct sk_buff *last;
- if (!skb_queue_empty(list)) {
+ if (!skb_queue_empty(&sch->q))
+ last = skb_peek_tail(&sch->q);
+ else
+ last = netem_rb_to_skb(rb_last(&q->t_root));
+ if (last) {
/*
* Last packet in queue is reference point (now),
* calculate this time bonus and subtract
* from delay.
*/
- delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now;
+ delay -= netem_skb_cb(last)->time_to_send - now;
delay = max_t(psched_tdiff_t, 0, delay);
- now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
+ now = netem_skb_cb(last)->time_to_send;
}
delay += packet_len_2_sched_time(skb->len, q);
}
cb->time_to_send = now + delay;
+ cb->tstamp_save = skb->tstamp;
++q->counter;
tfifo_enqueue(skb, sch);
} else {
@@ -476,6 +511,21 @@ static unsigned int netem_drop(struct Qdisc *sch)
unsigned int len;
len = qdisc_queue_drop(sch);
+
+ if (!len) {
+ struct rb_node *p = rb_first(&q->t_root);
+
+ if (p) {
+ struct sk_buff *skb = netem_rb_to_skb(p);
+
+ rb_erase(p, &q->t_root);
+ sch->q.qlen--;
+ skb->next = NULL;
+ skb->prev = NULL;
+ len = qdisc_pkt_len(skb);
+ kfree_skb(skb);
+ }
+ }
if (!len && q->qdisc && q->qdisc->ops->drop)
len = q->qdisc->ops->drop(q->qdisc);
if (len)
@@ -488,19 +538,35 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
+ struct rb_node *p;
if (qdisc_is_throttled(sch))
return NULL;
tfifo_dequeue:
- skb = qdisc_peek_head(sch);
+ skb = __skb_dequeue(&sch->q);
if (skb) {
- const struct netem_skb_cb *cb = netem_skb_cb(skb);
+deliver:
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
+ return skb;
+ }
+ p = rb_first(&q->t_root);
+ if (p) {
+ psched_time_t time_to_send;
+
+ skb = netem_rb_to_skb(p);
/* if more time remaining? */
- if (cb->time_to_send <= psched_get_time()) {
- __skb_unlink(skb, &sch->q);
- sch->qstats.backlog -= qdisc_pkt_len(skb);
+ time_to_send = netem_skb_cb(skb)->time_to_send;
+ if (time_to_send <= psched_get_time()) {
+ rb_erase(p, &q->t_root);
+
+ sch->q.qlen--;
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->tstamp = netem_skb_cb(skb)->tstamp_save;
#ifdef CONFIG_NET_CLS_ACT
/*
@@ -522,10 +588,7 @@ tfifo_dequeue:
}
goto tfifo_dequeue;
}
-deliver:
- qdisc_unthrottled(sch);
- qdisc_bstats_update(sch, skb);
- return skb;
+ goto deliver;
}
if (q->qdisc) {
@@ -533,7 +596,7 @@ deliver:
if (skb)
goto deliver;
}
- qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
+ qdisc_watchdog_schedule(&q->watchdog, time_to_send);
}
if (q->qdisc) {
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index d51852bba01c..8056fb4e618a 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -113,7 +113,6 @@
#define FRAC_BITS 30 /* fixed point arithmetic */
#define ONE_FP (1UL << FRAC_BITS)
-#define IWSUM (ONE_FP/QFQ_MAX_WSUM)
#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */
@@ -138,7 +137,7 @@ struct qfq_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
struct Qdisc *qdisc;
struct list_head alist; /* Link for active-classes list. */
struct qfq_aggregate *agg; /* Parent aggregate. */
@@ -189,6 +188,7 @@ struct qfq_sched {
struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
u32 num_active_agg; /* Num. of active aggregates */
u32 wsum; /* weight sum */
+ u32 iwsum; /* inverse weight sum */
unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
@@ -314,6 +314,7 @@ static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
q->wsum +=
(int) agg->class_weight * (new_num_classes - agg->num_classes);
+ q->iwsum = ONE_FP / q->wsum;
agg->num_classes = new_num_classes;
}
@@ -340,6 +341,10 @@ static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{
if (!hlist_unhashed(&agg->nonfull_next))
hlist_del_init(&agg->nonfull_next);
+ q->wsum -= agg->class_weight;
+ if (q->wsum != 0)
+ q->iwsum = ONE_FP / q->wsum;
+
if (q->in_serv_agg == agg)
q->in_serv_agg = qfq_choose_next_agg(q);
kfree(agg);
@@ -821,44 +826,73 @@ static void qfq_make_eligible(struct qfq_sched *q)
unsigned long old_vslot = q->oldV >> q->min_slot_shift;
if (vslot != old_vslot) {
- unsigned long mask = (1ULL << fls(vslot ^ old_vslot)) - 1;
+ unsigned long mask;
+ int last_flip_pos = fls(vslot ^ old_vslot);
+
+ if (last_flip_pos > 31) /* higher than the number of groups */
+ mask = ~0UL; /* make all groups eligible */
+ else
+ mask = (1UL << last_flip_pos) - 1;
+
qfq_move_groups(q, mask, IR, ER);
qfq_move_groups(q, mask, IB, EB);
}
}
-
/*
- * The index of the slot in which the aggregate is to be inserted must
- * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1'
- * because the start time of the group may be moved backward by one
- * slot after the aggregate has been inserted, and this would cause
- * non-empty slots to be right-shifted by one position.
+ * The index of the slot in which the input aggregate agg is to be
+ * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
+ * and not a '-1' because the start time of the group may be moved
+ * backward by one slot after the aggregate has been inserted, and
+ * this would cause non-empty slots to be right-shifted by one
+ * position.
*
- * If the weight and lmax (max_pkt_size) of the classes do not change,
- * then QFQ+ does meet the above contraint according to the current
- * values of its parameters. In fact, if the weight and lmax of the
- * classes do not change, then, from the theory, QFQ+ guarantees that
- * the slot index is never higher than
- * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
- * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18
+ * QFQ+ fully satisfies this bound to the slot index if the parameters
+ * of the classes are not changed dynamically, and if QFQ+ never
+ * happens to postpone the service of agg unjustly, i.e., it never
+ * happens that the aggregate becomes backlogged and eligible, or just
+ * eligible, while an aggregate with a higher approximated finish time
+ * is being served. In particular, in this case QFQ+ guarantees that
+ * the timestamps of agg are low enough that the slot index is never
+ * higher than 2. Unfortunately, QFQ+ cannot provide the same
+ * guarantee if it happens to unjustly postpone the service of agg, or
+ * if the parameters of some class are changed.
*
- * When the weight of a class is increased or the lmax of the class is
- * decreased, a new aggregate with smaller slot size than the original
- * parent aggregate of the class may happen to be activated. The
- * activation of this aggregate should be properly delayed to when the
- * service of the class has finished in the ideal system tracked by
- * QFQ+. If the activation of the aggregate is not delayed to this
- * reference time instant, then this aggregate may be unjustly served
- * before other aggregates waiting for service. This may cause the
- * above bound to the slot index to be violated for some of these
- * unlucky aggregates.
+ * As for the first event, i.e., an out-of-order service, the
+ * upper bound to the slot index guaranteed by QFQ+ grows to
+ * 2 +
+ * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
+ * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
+ *
+ * The following function deals with this problem by backward-shifting
+ * the timestamps of agg, if needed, so as to guarantee that the slot
+ * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
+ * cause the service of other aggregates to be postponed, yet the
+ * worst-case guarantees of these aggregates are not violated. In
+ * fact, in case of no out-of-order service, the timestamps of agg
+ * would have been even lower than they are after the backward shift,
+ * because QFQ+ would have guaranteed a maximum value equal to 2 for
+ * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
+ * service is postponed because of the backward-shift would have
+ * however waited for the service of agg before being served.
+ *
+ * The other event that may cause the slot index to be higher than 2
+ * for agg is a recent change of the parameters of some class. If the
+ * weight of a class is increased or the lmax (max_pkt_size) of the
+ * class is decreased, then a new aggregate with smaller slot size
+ * than the original parent aggregate of the class may happen to be
+ * activated. The activation of this aggregate should be properly
+ * delayed to when the service of the class has finished in the ideal
+ * system tracked by QFQ+. If the activation of the aggregate is not
+ * delayed to this reference time instant, then this aggregate may be
+ * unjustly served before other aggregates waiting for service. This
+ * may cause the above bound to the slot index to be violated for some
+ * of these unlucky aggregates.
*
* Instead of delaying the activation of the new aggregate, which is
- * quite complex, the following inaccurate but simple solution is used:
- * if the slot index is higher than QFQ_MAX_SLOTS-2, then the
- * timestamps of the aggregate are shifted backward so as to let the
- * slot index become equal to QFQ_MAX_SLOTS-2.
+ * quite complex, the above-discussed capping of the slot index is
+ * used to handle also the consequences of a change of the parameters
+ * of a class.
*/
static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
u64 roundedS)
@@ -1003,9 +1037,61 @@ static inline void charge_actual_service(struct qfq_aggregate *agg)
agg->F = agg->S + (u64)service_received * agg->inv_w;
}
-static inline void qfq_update_agg_ts(struct qfq_sched *q,
- struct qfq_aggregate *agg,
- enum update_reason reason);
+/* Assign a reasonable start time for a new aggregate in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in EB (see [2]). So, if we have groups in ER,
+ * set S to the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ */
+static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ unsigned long mask;
+ u64 limit, roundedF;
+ int slot_shift = agg->grp->slot_shift;
+
+ roundedF = qfq_round_down(agg->F, slot_shift);
+ limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
+
+ if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
+ /* timestamp was stale */
+ mask = mask_from(q->bitmaps[ER], agg->grp->index);
+ if (mask) {
+ struct qfq_group *next = qfq_ffs(q, mask);
+ if (qfq_gt(roundedF, next->F)) {
+ if (qfq_gt(limit, next->F))
+ agg->S = next->F;
+ else /* preserve timestamp correctness */
+ agg->S = limit;
+ return;
+ }
+ }
+ agg->S = q->V;
+ } else /* timestamp is not stale */
+ agg->S = agg->F;
+}
+
+/* Update the timestamps of agg before scheduling/rescheduling it for
+ * service. In particular, assign to agg->F its maximum possible
+ * value, i.e., the virtual finish time with which the aggregate
+ * should be labeled if it used all its budget once in service.
+ */
+static inline void
+qfq_update_agg_ts(struct qfq_sched *q,
+ struct qfq_aggregate *agg, enum update_reason reason)
+{
+ if (reason != requeue)
+ qfq_update_start(q, agg);
+ else /* just charge agg for the service received */
+ agg->S = agg->F;
+
+ agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
+}
static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
@@ -1077,7 +1163,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
else
in_serv_agg->budget -= len;
- q->V += (u64)len * IWSUM;
+ q->V += (u64)len * q->iwsum;
pr_debug("qfq dequeue: len %u F %lld now %lld\n",
len, (unsigned long long) in_serv_agg->F,
(unsigned long long) q->V);
@@ -1128,66 +1214,6 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
return agg;
}
-/*
- * Assign a reasonable start time for a new aggregate in group i.
- * Admissible values for \hat(F) are multiples of \sigma_i
- * no greater than V+\sigma_i . Larger values mean that
- * we had a wraparound so we consider the timestamp to be stale.
- *
- * If F is not stale and F >= V then we set S = F.
- * Otherwise we should assign S = V, but this may violate
- * the ordering in EB (see [2]). So, if we have groups in ER,
- * set S to the F_j of the first group j which would be blocking us.
- * We are guaranteed not to move S backward because
- * otherwise our group i would still be blocked.
- */
-static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
-{
- unsigned long mask;
- u64 limit, roundedF;
- int slot_shift = agg->grp->slot_shift;
-
- roundedF = qfq_round_down(agg->F, slot_shift);
- limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
-
- if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
- /* timestamp was stale */
- mask = mask_from(q->bitmaps[ER], agg->grp->index);
- if (mask) {
- struct qfq_group *next = qfq_ffs(q, mask);
- if (qfq_gt(roundedF, next->F)) {
- if (qfq_gt(limit, next->F))
- agg->S = next->F;
- else /* preserve timestamp correctness */
- agg->S = limit;
- return;
- }
- }
- agg->S = q->V;
- } else /* timestamp is not stale */
- agg->S = agg->F;
-}
-
-/*
- * Update the timestamps of agg before scheduling/rescheduling it for
- * service. In particular, assign to agg->F its maximum possible
- * value, i.e., the virtual finish time with which the aggregate
- * should be labeled if it used all its budget once in service.
- */
-static inline void
-qfq_update_agg_ts(struct qfq_sched *q,
- struct qfq_aggregate *agg, enum update_reason reason)
-{
- if (reason != requeue)
- qfq_update_start(q, agg);
- else /* just charge agg for the service received */
- agg->S = agg->F;
-
- agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
-}
-
-static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *);
-
static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct qfq_sched *q = qdisc_priv(sch);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index e478d316602b..1aaf1b6e51a2 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -116,14 +116,57 @@ struct tbf_sched_data {
struct qdisc_watchdog watchdog; /* Watchdog timer */
};
+
+/* GSO packet is too big, segment it so that tbf can transmit
+ * each segment in time
+ */
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features = netif_skb_features(skb);
+ int ret, nb;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_reshape_fail(skb, sch);
+
+ nb = 0;
+ while (segs) {
+ nskb = segs->next;
+ segs->next = NULL;
+ if (likely(segs->len <= q->max_size)) {
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ ret = qdisc_enqueue(segs, q->qdisc);
+ } else {
+ ret = qdisc_reshape_fail(skb, sch);
+ }
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ sch->qstats.drops++;
+ } else {
+ nb++;
+ }
+ segs = nskb;
+ }
+ sch->q.qlen += nb;
+ if (nb > 1)
+ qdisc_tree_decrease_qlen(sch, 1 - nb);
+ consume_skb(skb);
+ return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
int ret;
- if (qdisc_pkt_len(skb) > q->max_size)
+ if (qdisc_pkt_len(skb) > q->max_size) {
+ if (skb_is_gso(skb))
+ return tbf_segment(skb, sch);
return qdisc_reshape_fail(skb, sch);
-
+ }
ret = qdisc_enqueue(skb, q->qdisc);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))