From 69c2fc891343cb5217c866d10709343cff190bdc Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 20 Jul 2012 12:41:03 +0100
Subject: drm/i915: Remove the per-ring write list

This is now handled by a global flag to ensure we emit a flush before
the next serialisation point (if we failed to queue one previously).

Signed-off-by: Chris Wilson
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index bf0195a96d53..8f221d9a7bdb 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1002,7 +1002,6 @@ static int intel_init_ring_buffer(struct drm_device *dev,
 	ring->dev = dev;
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
-	INIT_LIST_HEAD(&ring->gpu_write_list);
 	ring->size = 32 * PAGE_SIZE;
 
 	init_waitqueue_head(&ring->irq_queue);
@@ -1473,7 +1472,6 @@ int intel_render_ring_init_dri(struct drm_device *dev, u64 start, u32 size)
 	ring->dev = dev;
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
-	INIT_LIST_HEAD(&ring->gpu_write_list);
 	ring->size = size;
 	ring->effective_size = ring->size;
-- cgit v1.2.3

From a7b9761d0a2ded58170ffb4d423ff3d7228103f4 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 20 Jul 2012 12:41:08 +0100
Subject: drm/i915: Split i915_gem_flush_ring() into separate invalidate/flush funcs

By moving the function to intel_ringbuffer and currying the appropriate
parameter, hopefully we make the callsites easier to read and
understand.

Signed-off-by: Chris Wilson
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 38 +++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 8f221d9a7bdb..8b7085e4cf84 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1564,3 +1564,41 @@ int intel_init_blt_ring_buffer(struct drm_device *dev)
 
 	return intel_init_ring_buffer(dev, ring);
 }
+
+int
+intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
+{
+	int ret;
+
+	if (!ring->gpu_caches_dirty)
+		return 0;
+
+	ret = ring->flush(ring, 0, I915_GEM_GPU_DOMAINS);
+	if (ret)
+		return ret;
+
+	trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
+
+	ring->gpu_caches_dirty = false;
+	return 0;
+}
+
+int
+intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring)
+{
+	uint32_t flush_domains;
+	int ret;
+
+	flush_domains = 0;
+	if (ring->gpu_caches_dirty)
+		flush_domains = I915_GEM_GPU_DOMAINS;
+
+	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+	if (ret)
+		return ret;
+
+	trace_i915_gem_ring_flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+
+	ring->gpu_caches_dirty = false;
+	return 0;
+}
-- cgit v1.2.3
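A hypothetical callsite sketch (editorial, not part of the patch; the
surrounding logic is assumed) of how the curried helpers read in
practice. Where callers previously had to spell out both domain masks
to i915_gem_flush_ring(ring, invalidate, flush), they can now state
their intent directly:

	/* Before submitting a batch: invalidate stale GPU caches, and pick
	 * up any flush we previously failed to queue (gpu_caches_dirty). */
	ret = intel_ring_invalidate_all_caches(ring);
	if (ret)
		return ret;

	/* After emitting commands whose results must become visible
	 * outside the GPU domains: flush, unless already clean. */
	ret = intel_ring_flush_all_caches(ring);
	if (ret)
		return ret;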
From e1ef7cc299839e68dae3f1843f62e52acda04538 Mon Sep 17 00:00:00 2001
From: Ben Widawsky
Date: Tue, 24 Jul 2012 20:47:31 -0700
Subject: drm/i915: Macro to determine DPF support

Originally I had a macro specifically for DPF support, and Daniel, with
good reason, asked me to change it to this. It's not the way I would
have gone (and indeed I didn't), but for now there is no distinction as
all platforms with L3 also have DPF.

Note: The good reasons are that DPF is an L3$ feature (at least on
current hw), hence I don't expect one to go without the other.

Signed-off-by: Ben Widawsky
[danvet: added note]
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 8b7085e4cf84..c58f1b91d08b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -454,7 +454,7 @@ static int init_render_ring(struct intel_ring_buffer *ring)
 	if (INTEL_INFO(dev)->gen >= 6)
 		I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
 
-	if (IS_IVYBRIDGE(dev))
+	if (HAS_L3_GPU_CACHE(dev))
 		I915_WRITE_IMR(ring, ~GEN6_RENDER_L3_PARITY_ERROR);
 
 	return ret;
@@ -844,7 +844,7 @@ gen6_ring_get_irq(struct intel_ring_buffer *ring)
 
 	spin_lock_irqsave(&dev_priv->irq_lock, flags);
 	if (ring->irq_refcount++ == 0) {
-		if (IS_IVYBRIDGE(dev) && ring->id == RCS)
+		if (HAS_L3_GPU_CACHE(dev) && ring->id == RCS)
 			I915_WRITE_IMR(ring, ~(ring->irq_enable_mask |
 					       GEN6_RENDER_L3_PARITY_ERROR));
 		else
@@ -867,7 +867,7 @@ gen6_ring_put_irq(struct intel_ring_buffer *ring)
 
 	spin_lock_irqsave(&dev_priv->irq_lock, flags);
 	if (--ring->irq_refcount == 0) {
-		if (IS_IVYBRIDGE(dev) && ring->id == RCS)
+		if (HAS_L3_GPU_CACHE(dev) && ring->id == RCS)
 			I915_WRITE_IMR(ring, ~GEN6_RENDER_L3_PARITY_ERROR);
 		else
 			I915_WRITE_IMR(ring, ~0);
-- cgit v1.2.3
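The HAS_L3_GPU_CACHE() definition itself lands in i915_drv.h and is not
visible in the hunks above. Going by the commit message, at this point
in history it presumably reduces to the Ivybridge check it replaces; a
sketch under that assumption:

	/* Assumed i915_drv.h side of this patch: today only IVB has an
	 * L3 GPU cache, and every platform with L3 also has DPF. */
	#define HAS_L3_GPU_CACHE(dev) (IS_IVYBRIDGE(dev))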
From 6c6cf5aa9c583478b19e23149feaa92d01fb8c2d Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 20 Jul 2012 18:02:28 +0100
Subject: drm/i915: Only apply the SNB pipe control w/a to gen6

The requirement for the sync flush to be emitted prior to the render
cache flush is only true for SandyBridge. On IvyBridge and friends we
can just emit the flushes with an inline CS stall.

Signed-off-by: Chris Wilson
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index c58f1b91d08b..8733da529edf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -214,15 +214,8 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 		  u32 invalidate_domains, u32 flush_domains)
 {
 	u32 flags = 0;
-	struct pipe_control *pc = ring->private;
-	u32 scratch_addr = pc->gtt_offset + 128;
 	int ret;
 
-	/* Force SNB workarounds for PIPE_CONTROL flushes */
-	ret = intel_emit_post_sync_nonzero_flush(ring);
-	if (ret)
-		return ret;
-
 	/* Just flush everything.  Experiments have shown that reducing the
 	 * number of bits based on the write domains has little performance
 	 * impact.
@@ -242,21 +235,33 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 	if (flush_domains)
 		flags |= PIPE_CONTROL_CS_STALL;
 
-	ret = intel_ring_begin(ring, 6);
+	ret = intel_ring_begin(ring, 4);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
 	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0); /* lower dword */
-	intel_ring_emit(ring, 0); /* uppwer dword */
-	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, 0);
 	intel_ring_advance(ring);
 
 	return 0;
 }
 
+static int
+gen6_render_ring_flush__wa(struct intel_ring_buffer *ring,
+			   u32 invalidate_domains, u32 flush_domains)
+{
+	int ret;
+
+	/* Force SNB workarounds for PIPE_CONTROL flushes */
+	ret = intel_emit_post_sync_nonzero_flush(ring);
+	if (ret)
+		return ret;
+
+	return gen6_render_ring_flush(ring, invalidate_domains, flush_domains);
+}
+
 static void ring_write_tail(struct intel_ring_buffer *ring,
 			    u32 value)
 {
@@ -1371,6 +1376,8 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 	if (INTEL_INFO(dev)->gen >= 6) {
 		ring->add_request = gen6_add_request;
 		ring->flush = gen6_render_ring_flush;
+		if (INTEL_INFO(dev)->gen == 6)
+			ring->flush = gen6_render_ring_flush__wa;
 		ring->irq_get = gen6_ring_get_irq;
 		ring->irq_put = gen6_ring_put_irq;
 		ring->irq_enable_mask = GT_USER_INTERRUPT;
-- cgit v1.2.3

From b2eadbc85b2c26df3fd2fe5c53c2a47cfd307249 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Thu, 9 Aug 2012 10:58:30 +0100
Subject: drm/i915: Lazily apply the SNB+ seqno w/a

Avoid the forcewake overhead when simply retiring requests, as often
the last seen seqno is good enough to satisfy the retirement process
and will be promptly re-run in any case. Only ensure that we force the
coherent seqno read when we are explicitly waiting upon a completion
event to be sure that none go missing, and also for when we are
reporting seqno values in case of error or debugging.

This greatly reduces the load for userspace using the busy-ioctl to
track active buffers, for instance halving the CPU used by X in pushing
the pixels from a software render (flash). The effect will be even more
magnified with userptr providing a zero-copy upload path in that
instance, or in similar instances where X is simply compositing DRI
buffers.

v2: Reverse the polarity of the tachyon stream. Daniel suggested that
'force' was too generic for the parameter name and that
'lazy_coherency' better encapsulated the semantics of it being an
optimization and its purpose. Also notice that gen6_get_seqno() is
only used by gen6/7 chipsets and so the test for IS_GEN6 || IS_GEN7 is
redundant in that function.
Signed-off-by: Chris Wilson Reviewed-by: Daniel Vetter Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_ringbuffer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 8733da529edf..e278675cdff9 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -625,26 +625,24 @@ pc_render_add_request(struct intel_ring_buffer *ring, } static u32 -gen6_ring_get_seqno(struct intel_ring_buffer *ring) +gen6_ring_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency) { - struct drm_device *dev = ring->dev; - /* Workaround to force correct ordering between irq and seqno writes on * ivb (and maybe also on snb) by reading from a CS register (like * ACTHD) before reading the status page. */ - if (IS_GEN6(dev) || IS_GEN7(dev)) + if (!lazy_coherency) intel_ring_get_active_head(ring); return intel_read_status_page(ring, I915_GEM_HWS_INDEX); } static u32 -ring_get_seqno(struct intel_ring_buffer *ring) +ring_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency) { return intel_read_status_page(ring, I915_GEM_HWS_INDEX); } static u32 -pc_render_get_seqno(struct intel_ring_buffer *ring) +pc_render_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency) { struct pipe_control *pc = ring->private; return pc->cpu_page[0]; -- cgit v1.2.3 From 86a1ee26bb60e1ab8984e92f0e9186c354670aed Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 11 Aug 2012 15:41:04 +0100 Subject: drm/i915: Only pwrite through the GTT if there is space in the aperture Avoid stalling and waiting for the GPU by checking to see if there is sufficient inactive space in the aperture for us to bind the buffer prior to writing through the GTT. If there is inadequate space we will have to stall waiting for the GPU, and incur overheads moving objects about. Instead, only incur the clflush overhead on the target object by writing through shmem. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_ringbuffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index c828169c73ae..ac93643731aa 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -391,7 +391,7 @@ init_pipe_control(struct intel_ring_buffer *ring) i915_gem_object_set_cache_level(obj, I915_CACHE_LLC); - ret = i915_gem_object_pin(obj, 4096, true); + ret = i915_gem_object_pin(obj, 4096, true, false); if (ret) goto err_unref; @@ -979,7 +979,7 @@ static int init_status_page(struct intel_ring_buffer *ring) i915_gem_object_set_cache_level(obj, I915_CACHE_LLC); - ret = i915_gem_object_pin(obj, 4096, true); + ret = i915_gem_object_pin(obj, 4096, true, false); if (ret != 0) { goto err_unref; } @@ -1036,7 +1036,7 @@ static int intel_init_ring_buffer(struct drm_device *dev, ring->obj = obj; - ret = i915_gem_object_pin(obj, PAGE_SIZE, true); + ret = i915_gem_object_pin(obj, PAGE_SIZE, true, false); if (ret) goto err_unref; -- cgit v1.2.3 From 4772eaebcdf86dd65630339dbe58316b90f80aed Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Fri, 17 Aug 2012 18:35:41 -0300 Subject: drm/i915: add gen7_render_ring_flush For now, just a copy of gen6_render_ring_flush. 
Different gens have different workarounds, so we want different
functions.

Signed-off-by: Paulo Zanoni
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 50 ++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index ac93643731aa..074b7d67c1c4 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -257,6 +257,54 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 	return 0;
 }
 
+static int
+gen7_render_ring_flush(struct intel_ring_buffer *ring,
+		       u32 invalidate_domains, u32 flush_domains)
+{
+	u32 flags = 0;
+	struct pipe_control *pc = ring->private;
+	u32 scratch_addr = pc->gtt_offset + 128;
+	int ret;
+
+	/* Just flush everything.  Experiments have shown that reducing the
+	 * number of bits based on the write domains has little performance
+	 * impact.
+	 */
+	if (flush_domains) {
+		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
+		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+		/*
+		 * Ensure that any following seqno writes only happen
+		 * when the render cache is indeed flushed.
+		 */
+		flags |= PIPE_CONTROL_CS_STALL;
+	}
+	if (invalidate_domains) {
+		flags |= PIPE_CONTROL_TLB_INVALIDATE;
+		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+		/*
+		 * TLB invalidate requires a post-sync write.
+		 */
+		flags |= PIPE_CONTROL_QW_WRITE;
+	}
+
+	ret = intel_ring_begin(ring, 4);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
+	intel_ring_emit(ring, flags);
+	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
+	intel_ring_emit(ring, 0);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
 static int
 gen6_render_ring_flush__wa(struct intel_ring_buffer *ring,
 			   u32 invalidate_domains, u32 flush_domains)
@@ -1385,7 +1433,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 
 	if (INTEL_INFO(dev)->gen >= 6) {
 		ring->add_request = gen6_add_request;
-		ring->flush = gen6_render_ring_flush;
+		ring->flush = gen7_render_ring_flush;
 		if (INTEL_INFO(dev)->gen == 6)
 			ring->flush = gen6_render_ring_flush__wa;
 		ring->irq_get = gen6_ring_get_irq;
-- cgit v1.2.3
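An editorial decoding of the 4-dword PIPE_CONTROL these flush functions
emit (a reading of the emission code above, not text from the commits;
the comments are editorial):

	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4)); /* DW0: opcode + length */
	intel_ring_emit(ring, flags);                  /* DW1: stall/flush/invalidate flags */
	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
	                                               /* DW2: post-sync write address */
	intel_ring_emit(ring, 0);                      /* DW3: post-sync immediate data */

gen6_render_ring_flush passes 0 for DW2 since it no longer requests a
post-sync write, while gen7_render_ring_flush keeps the scratch-page
address because PIPE_CONTROL_QW_WRITE may be set for TLB invalidation.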
From b31115092724925a434905dc3dbf83a2e752ba4b Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Fri, 17 Aug 2012 18:35:42 -0300
Subject: drm/i915: add workarounds directly to gen6_render_ring_flush

Since gen7+ now runs the new gen7_render_ring_flush function.

Signed-off-by: Paulo Zanoni
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 074b7d67c1c4..42a4b85b0eae 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -218,6 +218,11 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 	u32 scratch_addr = pc->gtt_offset + 128;
 	int ret;
 
+	/* Force SNB workarounds for PIPE_CONTROL flushes */
+	ret = intel_emit_post_sync_nonzero_flush(ring);
+	if (ret)
+		return ret;
+
 	/* Just flush everything.  Experiments have shown that reducing the
 	 * number of bits based on the write domains has little performance
 	 * impact.
@@ -305,20 +310,6 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 	return 0;
 }
 
-static int
-gen6_render_ring_flush__wa(struct intel_ring_buffer *ring,
-			   u32 invalidate_domains, u32 flush_domains)
-{
-	int ret;
-
-	/* Force SNB workarounds for PIPE_CONTROL flushes */
-	ret = intel_emit_post_sync_nonzero_flush(ring);
-	if (ret)
-		return ret;
-
-	return gen6_render_ring_flush(ring, invalidate_domains, flush_domains);
-}
-
 static void ring_write_tail(struct intel_ring_buffer *ring,
 			    u32 value)
 {
@@ -1435,7 +1426,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 		ring->add_request = gen6_add_request;
 		ring->flush = gen7_render_ring_flush;
 		if (INTEL_INFO(dev)->gen == 6)
-			ring->flush = gen6_render_ring_flush__wa;
+			ring->flush = gen6_render_ring_flush;
 		ring->irq_get = gen6_ring_get_irq;
 		ring->irq_put = gen6_ring_put_irq;
 		ring->irq_enable_mask = GT_USER_INTERRUPT;
-- cgit v1.2.3
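intel_emit_post_sync_nonzero_flush() is referenced throughout this
series but never shown in these hunks. For context, a condensed sketch
of the SNB workaround as it exists elsewhere in the same file
(reconstructed, so details may differ from this exact tree state): two
back-to-back PIPE_CONTROLs, a scoreboard stall followed by a dummy
post-sync write to the scratch page:

	static int
	intel_emit_post_sync_nonzero_flush(struct intel_ring_buffer *ring)
	{
		struct pipe_control *pc = ring->private;
		u32 scratch_addr = pc->gtt_offset + 128;
		int ret;

		/* First PIPE_CONTROL: stall at the scoreboard, write nothing. */
		ret = intel_ring_begin(ring, 6);
		if (ret)
			return ret;
		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
		intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
				PIPE_CONTROL_STALL_AT_SCOREBOARD);
		intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
		intel_ring_emit(ring, 0); /* low dword */
		intel_ring_emit(ring, 0); /* high dword */
		intel_ring_emit(ring, MI_NOOP);
		intel_ring_advance(ring);

		/* Second PIPE_CONTROL: the non-zero post-sync write the w/a
		 * requires before the real flush can be emitted. */
		ret = intel_ring_begin(ring, 6);
		if (ret)
			return ret;
		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
		intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE);
		intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, MI_NOOP);
		intel_ring_advance(ring);

		return 0;
	}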
From f39876317a69a104eeaed002d4085348e871bfd1 Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Fri, 17 Aug 2012 18:35:43 -0300
Subject: drm/i915: add workarounds to gen7_render_ring_flush

From Bspec, Vol 2a, Section 1.9.3.4 "PIPE_CONTROL", intro section
detailing the various workarounds:

"[DevIVB {W/A}, DevHSW {W/A}]: Pipe_control with CS-stall bit set must
be issued before a pipe-control command that has the State Cache
Invalidate bit set."

Note that public Bspec has different numbering, it's Vol2Part1,
Section 1.10.4.1 "PIPE_CONTROL" there.

There's also a second workaround for the PIPE_CONTROL command itself:

"[DevIVB, DevVLV, DevHSW] {WA}: Every 4th PIPE_CONTROL command, not
counting the PIPE_CONTROL with only read-cache-invalidate bit(s) set,
must have a CS_STALL bit set"

For simplicity we simply set the CS_STALL bit on every pipe_control on
gen7+

Note that this massively helps on some hsw machines: together with the
following patch to unconditionally set the CS_STALL bit on every
pipe_control, it prevents a gpu hang every few seconds.

This is a regression that was introduced in the pipe_control cleanup:

commit 6c6cf5aa9c583478b19e23149feaa92d01fb8c2d
Author: Chris Wilson
Date:   Fri Jul 20 18:02:28 2012 +0100

    drm/i915: Only apply the SNB pipe control w/a to gen6

It looks like the massive snb pipe_control workaround also papered
over any issues on ivb and hsw.

Signed-off-by: Paulo Zanoni
[danvet: squashed both workarounds together, pimped commit message
with Bspec citations, regression commit citation and changed the
comment in the code a bit to clarify that we unconditionally set
CS_STALL to avoid being hurt by trying to be clever.]
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 39 ++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 42a4b85b0eae..55cdb4d30a16 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -262,6 +262,25 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 	return 0;
 }
 
+static int
+gen7_render_ring_cs_stall_wa(struct intel_ring_buffer *ring)
+{
+	int ret;
+
+	ret = intel_ring_begin(ring, 4);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
+	intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
+			      PIPE_CONTROL_STALL_AT_SCOREBOARD);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, 0);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
 static int
 gen7_render_ring_flush(struct intel_ring_buffer *ring,
 		       u32 invalidate_domains, u32 flush_domains)
@@ -271,6 +290,16 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 	u32 scratch_addr = pc->gtt_offset + 128;
 	int ret;
 
+	/*
+	 * Ensure that any following seqno writes only happen when the render
+	 * cache is indeed flushed.
+	 *
+	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
+	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
+	 * don't try to be clever and just set it unconditionally.
+	 */
+	flags |= PIPE_CONTROL_CS_STALL;
+
 	/* Just flush everything.  Experiments have shown that reducing the
 	 * number of bits based on the write domains has little performance
 	 * impact.
@@ -278,11 +307,6 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 	if (flush_domains) {
 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
-		/*
-		 * Ensure that any following seqno writes only happen
-		 * when the render cache is indeed flushed.
-		 */
-		flags |= PIPE_CONTROL_CS_STALL;
 	}
 	if (invalidate_domains) {
 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
@@ -295,6 +319,11 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 		 * TLB invalidate requires a post-sync write.
 		 */
 		flags |= PIPE_CONTROL_QW_WRITE;
+
+		/* Workaround: we must issue a pipe_control with CS-stall bit
+		 * set before a pipe_control command that has the state cache
+		 * invalidate bit set. */
+		gen7_render_ring_cs_stall_wa(ring);
 	}
 
 	ret = intel_ring_begin(ring, 4);
-- cgit v1.2.3

From 9da3da660d8c19a54f6e93361d147509be3fff84 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 1 Jun 2012 15:20:22 +0100
Subject: drm/i915: Replace the array of pages with a scatterlist

Rather than have multiple data structures for describing our page
layout in conjunction with the array of pages, we can migrate all users
over to a scatterlist.

One major advantage this offers, other than unifying the page-tracking
structures, is that we replace the vmalloc'ed array (which can be up to
a megabyte in size) with a chain of individual pages, which helps
reduce memory pressure.

The disadvantage is that we then do not have a simple array to iterate,
or to access randomly. The common case for this is in the relocation
processing, which will typically fit within a single scatterlist page
and so be almost the same cost as the simple array. For iterating over
the array, the extra function call could be optimised away, but in
reality is an insignificant cost of either binding the pages, or
performing the pwrite/pread.

v2: Fix drm_clflush_sg() to not invoke wbinvd as well!
And fix the trivial compile error from rebasing. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_ringbuffer.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 55cdb4d30a16..984a0c5fbf5d 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -464,7 +464,7 @@ init_pipe_control(struct intel_ring_buffer *ring) goto err_unref; pc->gtt_offset = obj->gtt_offset; - pc->cpu_page = kmap(obj->pages[0]); + pc->cpu_page = kmap(sg_page(obj->pages->sgl)); if (pc->cpu_page == NULL) goto err_unpin; @@ -491,7 +491,8 @@ cleanup_pipe_control(struct intel_ring_buffer *ring) return; obj = pc->obj; - kunmap(obj->pages[0]); + + kunmap(sg_page(obj->pages->sgl)); i915_gem_object_unpin(obj); drm_gem_object_unreference(&obj->base); @@ -1026,7 +1027,7 @@ static void cleanup_status_page(struct intel_ring_buffer *ring) if (obj == NULL) return; - kunmap(obj->pages[0]); + kunmap(sg_page(obj->pages->sgl)); i915_gem_object_unpin(obj); drm_gem_object_unreference(&obj->base); ring->status_page.obj = NULL; @@ -1053,7 +1054,7 @@ static int init_status_page(struct intel_ring_buffer *ring) } ring->status_page.gfx_addr = obj->gtt_offset; - ring->status_page.page_addr = kmap(obj->pages[0]); + ring->status_page.page_addr = kmap(sg_page(obj->pages->sgl)); if (ring->status_page.page_addr == NULL) { ret = -ENOMEM; goto err_unpin; -- cgit v1.2.3
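A closing note on the scatterlist conversion above: random indexing of
obj->pages[n] becomes a walk of the sg table, as the kmap(sg_page(...))
changes in the final diff show. A minimal sketch of the new iteration
pattern (the helper name is hypothetical; the field names come from the
diff above):

	#include <linux/scatterlist.h>

	/* Hypothetical helper: visit every backing page of an object,
	 * replacing the old obj->pages[i] array walk. */
	static void example_for_each_page(struct drm_i915_gem_object *obj)
	{
		struct scatterlist *sg;
		int i;

		for_each_sg(obj->pages->sgl, sg, obj->pages->nents, i) {
			struct page *page = sg_page(sg);
			/* ... kmap/clflush/etc. on 'page' ... */
			(void)page;
		}
	}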