From 7024339a1cfa0d5bbfa628a7a35b67789cc86d7f Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 16 Jun 2020 22:25:20 +0200 Subject: net/sched: act_gate: fix NULL dereference in tcf_gate_init() it is possible to see a KASAN use-after-free, immediately followed by a NULL dereference crash, with the following command: # tc action add action gate index 3 cycle-time 100000000ns \ > cycle-time-ext 100000000ns clockid CLOCK_TAI BUG: KASAN: use-after-free in tcf_action_init_1+0x8eb/0x960 Write of size 1 at addr ffff88810a5908bc by task tc/883 CPU: 0 PID: 883 Comm: tc Not tainted 5.7.0+ #188 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 Call Trace: dump_stack+0x75/0xa0 print_address_description.constprop.6+0x1a/0x220 kasan_report.cold.9+0x37/0x7c tcf_action_init_1+0x8eb/0x960 tcf_action_init+0x157/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 [...] KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077] CPU: 0 PID: 883 Comm: tc Tainted: G B 5.7.0+ #188 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:tcf_action_fill_size+0xa3/0xf0 [....] RSP: 0018:ffff88813a48f250 EFLAGS: 00010212 RAX: dffffc0000000000 RBX: 0000000000000094 RCX: ffffffffa47c3eb6 RDX: 000000000000000e RSI: 0000000000000008 RDI: 0000000000000070 RBP: ffff88810a590800 R08: 0000000000000004 R09: ffffed1027491e03 R10: 0000000000000003 R11: ffffed1027491e03 R12: 0000000000000000 R13: 0000000000000000 R14: dffffc0000000000 R15: ffff88810a590800 FS: 00007f62cae8ce40(0000) GS:ffff888147c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f62c9d20a10 CR3: 000000013a52a000 CR4: 0000000000340ef0 Call Trace: tcf_action_init+0x172/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 this is caused by the test on 'cycletime_ext', that is still unassigned when the action is newly created. This makes the action .init() return 0 without calling tcf_idr_insert(), hence the UAF + crash. rework the logic that prevents zero values of cycle-time, as follows: 1) 'tcfg_cycletime_ext' seems to be unused in the action software path, and it was already possible by other means to obtain non-zero cycletime and zero cycletime-ext. So, removing that test should not cause any damage. 2) while at it, we must prevent overwriting configuration data with wrong ones: use a temporary variable for 'tcfg_cycletime', and validate it preserving the original semantic (that allowed computing the cycle time as the sum of all intervals, when not specified by TCA_GATE_CYCLE_TIME). 3) remove the test on 'tcfg_cycletime', no more useful, and avoid returning -EFAULT, which did not seem an appropriate return value for a wrong netlink attribute. v3: fix uninitialized 'cycletime' (thanks to Vladimir Oltean) v2: remove useless 'return;' at the end of void gate_get_start_time() Fixes: a51c328df310 ("net: qos: introduce a gate control flow action") CC: Ivan Vecera Signed-off-by: Davide Caratti Acked-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/sched/act_gate.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 9c628591f452..94e560c2f81d 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -32,7 +32,7 @@ static ktime_t gate_get_time(struct tcf_gate *gact) return KTIME_MAX; } -static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) { struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; @@ -43,18 +43,13 @@ static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) if (ktime_after(base, now)) { *start = base; - return 0; + return; } cycle = param->tcfg_cycletime; - /* cycle time should not be zero */ - if (!cycle) - return -EFAULT; - n = div64_u64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); - return 0; } static void gate_start_timer(struct tcf_gate *gact, ktime_t start) @@ -287,12 +282,12 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, enum tk_offsets tk_offset = TK_OFFS_TAI; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; + u64 cycletime = 0, basetime = 0; struct tcf_gate_params *p; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; int ret = 0, err; - u64 basetime = 0; u32 gflags = 0; s32 prio = -1; ktime_t start; @@ -375,11 +370,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, spin_lock_bh(&gact->tcf_lock); p = &gact->param; - if (tb[TCA_GATE_CYCLE_TIME]) { - p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); - if (!p->tcfg_cycletime_ext) - goto chain_put; - } + if (tb[TCA_GATE_CYCLE_TIME]) + cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); @@ -387,14 +379,19 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, goto chain_put; } - if (!p->tcfg_cycletime) { + if (!cycletime) { struct tcfg_gate_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); - p->tcfg_cycletime = cycle; + cycletime = cycle; + if (!cycletime) { + err = -EINVAL; + goto chain_put; + } } + p->tcfg_cycletime = cycletime; if (tb[TCA_GATE_CYCLE_TIME_EXT]) p->tcfg_cycletime_ext = @@ -408,14 +405,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, gact->tk_offset = tk_offset; hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); gact->hitimer.function = gate_timer_func; - - err = gate_get_start_time(gact, &start); - if (err < 0) { - NL_SET_ERR_MSG(extack, - "Internal error: failed get start time"); - release_entry_list(&p->entries); - goto chain_put; - } + gate_get_start_time(gact, &start); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; -- cgit v1.2.3 From c362a06e96eae16ccb7b0f4f93f19224279b8610 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 16 Jun 2020 22:25:21 +0200 Subject: net/sched: act_gate: fix configuration of the periodic timer assigning a dummy value of 'clock_id' to avoid cancellation of the cycle timer before its initialization was a temporary solution, and we still need to handle the case where act_gate timer parameters are changed by commands like the following one: # tc action replace action gate the fix consists in the following items: 1) remove the workaround assignment of 'clock_id', and init the list of entries before the first error path after IDR atomic check/allocation 2) validate 'clock_id' earlier: there is no need to do IDR atomic check/allocation if we know that 'clock_id' is a bad value 3) use a dedicated function, 'gate_setup_timer()', to ensure that the timer is cancelled and re-initialized on action overwrite, and also ensure we initialize the timer in the error path of tcf_gate_init() v3: improve comment in the error path of tcf_gate_init() (thanks to Vladimir Oltean) v2: avoid 'goto' in gate_setup_timer (thanks to Cong Wang) CC: Ivan Vecera Fixes: a01c245438c5 ("net/sched: fix a couple of splats in the error path of tfc_gate_init()") Fixes: a51c328df310 ("net: qos: introduce a gate control flow action") Signed-off-by: Davide Caratti Acked-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/sched/act_gate.c | 90 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 35 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 94e560c2f81d..323ae7f6315d 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -272,6 +272,27 @@ release_list: return err; } +static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, + enum tk_offsets tko, s32 clockid, + bool do_init) +{ + if (!do_init) { + if (basetime == gact->param.tcfg_basetime && + tko == gact->tk_offset && + clockid == gact->param.tcfg_clockid) + return; + + spin_unlock_bh(&gact->tcf_lock); + hrtimer_cancel(&gact->hitimer); + spin_lock_bh(&gact->tcf_lock); + } + gact->param.tcfg_basetime = basetime; + gact->param.tcfg_clockid = clockid; + gact->tk_offset = tko; + hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); + gact->hitimer.function = gate_timer_func; +} + static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, @@ -303,6 +324,27 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (!tb[TCA_GATE_PARMS]) return -EINVAL; + if (tb[TCA_GATE_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); + switch (clockid) { + case CLOCK_REALTIME: + tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + tk_offset = TK_OFFS_MAX; + break; + case CLOCK_BOOTTIME: + tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + return -EINVAL; + } + } + parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; @@ -326,10 +368,6 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } - if (ret == ACT_P_CREATED) { - to_gate(*a)->param.tcfg_clockid = -1; - INIT_LIST_HEAD(&(to_gate(*a)->param.entries)); - } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); @@ -340,33 +378,14 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); - if (tb[TCA_GATE_CLOCKID]) { - clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); - switch (clockid) { - case CLOCK_REALTIME: - tk_offset = TK_OFFS_REAL; - break; - case CLOCK_MONOTONIC: - tk_offset = TK_OFFS_MAX; - break; - case CLOCK_BOOTTIME: - tk_offset = TK_OFFS_BOOT; - break; - case CLOCK_TAI: - tk_offset = TK_OFFS_TAI; - break; - default: - NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); - goto release_idr; - } - } + gact = to_gate(*a); + if (ret == ACT_P_CREATED) + INIT_LIST_HEAD(&gact->param.entries); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; - gact = to_gate(*a); - spin_lock_bh(&gact->tcf_lock); p = &gact->param; @@ -397,14 +416,10 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, p->tcfg_cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); + gate_setup_timer(gact, basetime, tk_offset, clockid, + ret == ACT_P_CREATED); p->tcfg_priority = prio; - p->tcfg_basetime = basetime; - p->tcfg_clockid = clockid; p->tcfg_flags = gflags; - - gact->tk_offset = tk_offset; - hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); - gact->hitimer.function = gate_timer_func; gate_get_start_time(gact, &start); gact->current_close_time = start; @@ -433,6 +448,13 @@ chain_put: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: + /* action is not inserted in any list: it's safe to init hitimer + * without taking tcf_lock. + */ + if (ret == ACT_P_CREATED) + gate_setup_timer(gact, gact->param.tcfg_basetime, + gact->tk_offset, gact->param.tcfg_clockid, + true); tcf_idr_release(*a, bind); return err; } @@ -443,9 +465,7 @@ static void tcf_gate_cleanup(struct tc_action *a) struct tcf_gate_params *p; p = &gact->param; - if (p->tcfg_clockid != -1) - hrtimer_cancel(&gact->hitimer); - + hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } -- cgit v1.2.3 From a1db217861f33b8d9ea8171bcacee51186e2d5ba Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 18 Jun 2020 20:49:10 +0800 Subject: net: flow_offload: fix flow_indr_dev_unregister path If the representor is removed, then identify the indirect flow_blocks that need to be removed by the release callback and the port representor structure. To identify the port representor structure, a new indr.cb_priv field needs to be introduced. The flow_block also needs to be removed from the driver list from the cleanup path. Fixes: 1fac52da5942 ("net: flow_offload: consolidate indirect flow_block infrastructure") Signed-off-by: wenxu Signed-off-by: David S. Miller --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a00a203b2ef5..f8028d73edf1 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -652,6 +652,7 @@ static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) &block->flow_block, tcf_block_shared(block), &extack); down_write(&block->cb_lock); + list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); up_write(&block->cb_lock); rtnl_lock(); -- cgit v1.2.3 From 3c005110d4083c52ff6c99018127adf92f3f23aa Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 18 Jun 2020 20:49:11 +0800 Subject: net/sched: cls_api: fix nooffloaddevcnt warning dmesg log The block->nooffloaddevcnt should always count for indr block. even the indr block offload successful. The representor maybe gone away and the ingress qdisc can work in software mode. block->nooffloaddevcnt warning with following dmesg log: [ 760.667058] ##################################################### [ 760.668186] ## TEST test-ecmp-add-vxlan-encap-disable-sriov.sh ## [ 760.669179] ##################################################### [ 761.780655] :test: Fedora 30 (Thirty) [ 761.783794] :test: Linux reg-r-vrt-018-180 5.7.0+ [ 761.822890] :test: NIC ens1f0 FW 16.26.6000 PCI 0000:81:00.0 DEVICE 0x1019 ConnectX-5 Ex [ 761.860244] mlx5_core 0000:81:00.0 ens1f0: Link up [ 761.880693] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready [ 762.059732] mlx5_core 0000:81:00.1 ens1f1: Link up [ 762.234341] :test: unbind vfs of ens1f0 [ 762.257825] :test: Change ens1f0 eswitch (0000:81:00.0) mode to switchdev [ 762.291363] :test: unbind vfs of ens1f1 [ 762.306914] :test: Change ens1f1 eswitch (0000:81:00.1) mode to switchdev [ 762.309237] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3) [ 763.282598] mlx5_core 0000:81:00.1: E-Switch: Supported tc offload range - chains: 4294967294, prios: 4294967295 [ 763.362825] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0) [ 763.444465] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0 [ 763.460088] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0) [ 763.502586] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0) [ 763.552429] ens1f1_0: renamed from eth0 [ 763.569569] mlx5_core 0000:81:00.1: E-Switch: Enable: mode(OFFLOADS), nvfs(2), active vports(3) [ 763.629694] ens1f1_1: renamed from eth1 [ 764.631552] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_0: link becomes ready [ 764.670841] :test: unbind vfs of ens1f0 [ 764.681966] :test: unbind vfs of ens1f1 [ 764.726762] mlx5_core 0000:81:00.0 ens1f0: Link up [ 764.766511] mlx5_core 0000:81:00.1 ens1f1: Link up [ 764.797325] :test: Add multipath vxlan encap rule and disable sriov [ 764.798544] :test: config multipath route [ 764.812732] mlx5_core 0000:81:00.0: lag map port 1:2 port 2:2 [ 764.874556] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:2 [ 765.603681] :test: OK [ 765.659048] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_1: link becomes ready [ 765.675085] :test: verify rule in hw [ 765.694237] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready [ 765.711892] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1: link becomes ready [ 766.979230] :test: OK [ 768.125419] :test: OK [ 768.127519] :test: - disable sriov ens1f1 [ 768.131160] pci 0000:81:02.2: Removing from iommu group 75 [ 768.132646] pci 0000:81:02.3: Removing from iommu group 76 [ 769.179749] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3) [ 769.455627] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:1 [ 769.703990] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0) [ 769.988637] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0 [ 769.990022] :test: - disable sriov ens1f0 [ 769.994922] pci 0000:81:00.2: Removing from iommu group 73 [ 769.997048] pci 0000:81:00.3: Removing from iommu group 74 [ 771.035813] mlx5_core 0000:81:00.0: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3) [ 771.339091] ------------[ cut here ]------------ [ 771.340812] WARNING: CPU: 6 PID: 3448 at net/sched/cls_api.c:749 tcf_block_offload_unbind.isra.0+0x5c/0x60 [ 771.341728] Modules linked in: act_mirred act_tunnel_key cls_flower dummy vxlan ip6_udp_tunnel udp_tunnel sch_ingress nfsv3 nfs_acl nfs lockd grace fscache tun bridge stp llc sunrpc rdma_ucm rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp mlxfw act_ct nf_flow_table kvm_intel nf_nat kvm nf_conntrack irqbypass crct10dif_pclmul igb crc32_pclmul nf_defrag_ipv6 libcrc32c nf_defrag_ipv4 crc32c_intel ghash_clmulni_intel ptp ipmi_ssif intel_cstate pps_c ore ses intel_uncore mei_me iTCO_wdt joydev ipmi_si iTCO_vendor_support i2c_i801 enclosure mei ioatdma dca lpc_ich wmi ipmi_devintf pcspkr acpi_power_meter ipmi_msghandler acpi_pad ast i2c_algo_bit drm_vram_helper drm_kms_helper drm_ttm_helper ttm drm mpt3sas raid_class scsi_transport_sas [ 771.347818] CPU: 6 PID: 3448 Comm: test-ecmp-add-v Not tainted 5.7.0+ #1146 [ 771.348727] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 771.349646] RIP: 0010:tcf_block_offload_unbind.isra.0+0x5c/0x60 [ 771.350553] Code: 4a fd ff ff 83 f8 a1 74 0e 5b 4c 89 e7 5d 41 5c 41 5d e9 07 93 89 ff 8b 83 a0 00 00 00 8d 50 ff 89 93 a0 00 00 00 85 c0 75 df <0f> 0b eb db 0f 1f 44 00 00 41 57 41 56 41 55 41 89 cd 41 54 49 89 [ 771.352420] RSP: 0018:ffffb33144cd3b00 EFLAGS: 00010246 [ 771.353353] RAX: 0000000000000000 RBX: ffff8b37cf4b2800 RCX: 0000000000000000 [ 771.354294] RDX: 00000000ffffffff RSI: ffff8b3b9aad0000 RDI: ffffffff8d5c6e20 [ 771.355245] RBP: ffff8b37eb546948 R08: ffffffffc0b7a348 R09: ffff8b3b9aad0000 [ 771.356189] R10: 0000000000000001 R11: ffff8b3ba7a0a1c0 R12: ffff8b37cf4b2850 [ 771.357123] R13: ffff8b3b9aad0000 R14: ffff8b37cf4b2820 R15: ffff8b37cf4b2820 [ 771.358039] FS: 00007f8a19b6e740(0000) GS:ffff8b3befa00000(0000) knlGS:0000000000000000 [ 771.358965] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 771.359885] CR2: 00007f3afb91c1a0 CR3: 000000045133c004 CR4: 00000000001606e0 [ 771.360825] Call Trace: [ 771.361764] __tcf_block_put+0x84/0x150 [ 771.362712] ingress_destroy+0x1b/0x20 [sch_ingress] [ 771.363658] qdisc_destroy+0x3e/0xc0 [ 771.364594] dev_shutdown+0x7a/0xa5 [ 771.365522] rollback_registered_many+0x20d/0x530 [ 771.366458] ? netdev_upper_dev_unlink+0x15d/0x1c0 [ 771.367387] unregister_netdevice_many.part.0+0xf/0x70 [ 771.368310] vxlan_netdevice_event+0xa4/0x110 [vxlan] [ 771.369454] notifier_call_chain+0x4c/0x70 [ 771.370579] rollback_registered_many+0x2f5/0x530 [ 771.371719] rollback_registered+0x56/0x90 [ 771.372843] unregister_netdevice_queue+0x73/0xb0 [ 771.373982] unregister_netdev+0x18/0x20 [ 771.375168] mlx5e_vport_rep_unload+0x56/0xc0 [mlx5_core] [ 771.376327] esw_offloads_disable+0x81/0x90 [mlx5_core] [ 771.377512] mlx5_eswitch_disable_locked.cold+0xcb/0x1af [mlx5_core] [ 771.378679] mlx5_eswitch_disable+0x44/0x60 [mlx5_core] [ 771.379822] mlx5_device_disable_sriov+0xad/0xb0 [mlx5_core] [ 771.380968] mlx5_core_sriov_configure+0xc1/0xe0 [mlx5_core] [ 771.382087] sriov_numvfs_store+0xfc/0x130 [ 771.383195] kernfs_fop_write+0xce/0x1b0 [ 771.384302] vfs_write+0xb6/0x1a0 [ 771.385410] ksys_write+0x5f/0xe0 [ 771.386500] do_syscall_64+0x5b/0x1d0 [ 771.387569] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 0fdcf78d5973 ("net: use flow_indr_dev_setup_offload()") Signed-off-by: wenxu Signed-off-by: David S. Miller --- net/sched/cls_api.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f8028d73edf1..faa78b7dd962 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -672,25 +672,29 @@ static int tcf_block_offload_cmd(struct tcf_block *block, struct netlink_ext_ack *extack) { struct flow_block_offload bo = {}; - int err; tcf_block_offload_init(&bo, dev, command, ei->binder_type, &block->flow_block, tcf_block_shared(block), extack); - if (dev->netdev_ops->ndo_setup_tc) + if (dev->netdev_ops->ndo_setup_tc) { + int err; + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); - else - err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, - &bo, tc_block_indr_cleanup); + if (err < 0) { + if (err != -EOPNOTSUPP) + NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); + return err; + } - if (err < 0) { - if (err != -EOPNOTSUPP) - NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); - return err; + return tcf_block_setup(block, &bo); } - return tcf_block_setup(block, &bo); + flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, &bo, + tc_block_indr_cleanup); + tcf_block_setup(block, &bo); + + return -EOPNOTSUPP; } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, -- cgit v1.2.3 From 67c20de35a3cc2e2cd940f95ebd85ed0a765315a Mon Sep 17 00:00:00 2001 From: Rob Gill Date: Sat, 20 Jun 2020 02:08:25 +0000 Subject: net: Add MODULE_DESCRIPTION entries to network modules The user tool modinfo is used to get information on kernel modules, including a description where it is available. This patch adds a brief MODULE_DESCRIPTION to the following modules: 9p drop_monitor esp4_offload esp6_offload fou fou6 ila sch_fq sch_fq_codel sch_hhf Signed-off-by: Rob Gill Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 1 + net/sched/sch_fq_codel.c | 1 + net/sched/sch_hhf.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net/sched') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 8f06a808c59a..2fb76fc0cc31 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -1075,3 +1075,4 @@ module_init(fq_module_init) module_exit(fq_module_exit) MODULE_AUTHOR("Eric Dumazet"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Fair Queue Packet Scheduler"); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 436160be9c18..459a784056c0 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -721,3 +721,4 @@ module_init(fq_codel_module_init) module_exit(fq_codel_module_exit) MODULE_AUTHOR("Eric Dumazet"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Fair Queue CoDel discipline"); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index be35f03b657b..420ede875322 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -721,3 +721,4 @@ module_exit(hhf_module_exit) MODULE_AUTHOR("Terry Lam"); MODULE_AUTHOR("Nandita Dukkipati"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)"); -- cgit v1.2.3 From 9208d2863ac689a563b92f2161d8d1e7127d0add Mon Sep 17 00:00:00 2001 From: Ilya Ponetayev Date: Thu, 25 Jun 2020 22:12:07 +0200 Subject: sch_cake: don't try to reallocate or unshare skb unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_handle_diffserv() tries to linearize mac and network header parts of skb and to make it writable unconditionally. In some cases it leads to full skb reallocation, which reduces throughput and increases CPU load. Some measurements of IPv4 forward + NAPT on MIPS router with 580 MHz single-core CPU was conducted. It appears that on kernel 4.9 skb_try_make_writable() reallocates skb, if skb was allocated in ethernet driver via so-called 'build skb' method from page cache (it was discovered by strange increase of kmalloc-2048 slab at first). Obtain DSCP value via read-only skb_header_pointer() call, and leave linearization only for DSCP bleaching or ECN CE setting. And, as an additional optimisation, skip diffserv parsing entirely if it is not needed by the current configuration. Fixes: c87b4ecdbe8d ("sch_cake: Make sure we can write the IP header before changing DSCP bits") Signed-off-by: Ilya Ponetayev [ fix a few style issues, reflow commit message ] Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 60f8ae578819..cae006bef565 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1553,30 +1553,49 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) { - int wlen = skb_network_offset(skb); + const int offset = skb_network_offset(skb); + u16 *buf, buf_; u8 dscp; switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): - wlen += sizeof(struct iphdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) + buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); + if (unlikely(!buf)) return 0; - dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; - if (wash && dscp) + /* ToS is in the second byte of iphdr */ + dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; + + if (wash && dscp) { + const int wlen = offset + sizeof(struct iphdr); + + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + return 0; + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + } + return dscp; case htons(ETH_P_IPV6): - wlen += sizeof(struct ipv6hdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) + buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); + if (unlikely(!buf)) return 0; - dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; - if (wash && dscp) + /* Traffic class is in the first and second bytes of ipv6hdr */ + dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; + + if (wash && dscp) { + const int wlen = offset + sizeof(struct ipv6hdr); + + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + return 0; + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + } + return dscp; case htons(ETH_P_ARP): -- cgit v1.2.3 From 8c95eca0bb8c4bd2231a0d581f1ad0d50c90488c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 25 Jun 2020 22:12:08 +0200 Subject: sch_cake: don't call diffserv parsing code when it is not needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a further optimisation of the diffserv parsing codepath, we can skip it entirely if CAKE is configured to neither use diffserv-based classification, nor to zero out the diffserv bits. Fixes: c87b4ecdbe8d ("sch_cake: Make sure we can write the IP header before changing DSCP bits") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index cae006bef565..094d6e652deb 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1551,7 +1551,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) return idx + (tin << 16); } -static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) +static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) { const int offset = skb_network_offset(skb); u16 *buf, buf_; @@ -1612,14 +1612,17 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, { struct cake_sched_data *q = qdisc_priv(sch); u32 tin, mark; + bool wash; u8 dscp; /* Tin selection: Default to diffserv-based selection, allow overriding - * using firewall marks or skb->priority. + * using firewall marks or skb->priority. Call DSCP parsing early if + * wash is enabled, otherwise defer to below to skip unneeded parsing. */ - dscp = cake_handle_diffserv(skb, - q->rate_flags & CAKE_FLAG_WASH); mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; + wash = !!(q->rate_flags & CAKE_FLAG_WASH); + if (wash) + dscp = cake_handle_diffserv(skb, wash); if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) tin = 0; @@ -1633,6 +1636,8 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; else { + if (!wash) + dscp = cake_handle_diffserv(skb, wash); tin = q->tin_index[dscp]; if (unlikely(tin >= q->tin_cnt)) -- cgit v1.2.3 From 3f608f0c41360b11b04c763f348b712f651c8bac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 25 Jun 2020 22:12:09 +0200 Subject: sch_cake: fix a few style nits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I spotted a few nits when comparing the in-tree version of sch_cake with the out-of-tree one: A redundant error variable declaration shadowing an outer declaration, and an indentation alignment issue. Fix both of these. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 094d6e652deb..ca813697728e 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2715,7 +2715,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); if (opt) { - int err = cake_change(sch, opt, extack); + err = cake_change(sch, opt, extack); if (err) return err; @@ -3032,7 +3032,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, PUT_STAT_S32(BLUE_TIMER_US, ktime_to_us( ktime_sub(now, - flow->cvars.blue_timer))); + flow->cvars.blue_timer))); } if (flow->cvars.dropping) { PUT_STAT_S32(DROP_NEXT_US, -- cgit v1.2.3