From 69c2fc891343cb5217c866d10709343cff190bdc Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 20 Jul 2012 12:41:03 +0100
Subject: drm/i915: Remove the per-ring write list

This is now handled by a global flag to ensure we emit a flush before
the next serialisation point (if we failed to queue one previously).

Signed-off-by: Chris Wilson
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index bf0195a96d53..8f221d9a7bdb 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1002,7 +1002,6 @@ static int intel_init_ring_buffer(struct drm_device *dev,
 	ring->dev = dev;
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
-	INIT_LIST_HEAD(&ring->gpu_write_list);
 	ring->size = 32 * PAGE_SIZE;
 
 	init_waitqueue_head(&ring->irq_queue);
@@ -1473,7 +1472,6 @@ int intel_render_ring_init_dri(struct drm_device *dev, u64 start, u32 size)
 	ring->dev = dev;
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
-	INIT_LIST_HEAD(&ring->gpu_write_list);
 	ring->size = size;
 	ring->effective_size = ring->size;
-- cgit v1.2.3

From a7b9761d0a2ded58170ffb4d423ff3d7228103f4 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 20 Jul 2012 12:41:08 +0100
Subject: drm/i915: Split i915_gem_flush_ring() into separate invalidate/flush funcs

By moving the function to intel_ringbuffer and currying the appropriate
parameter, hopefully we make the callsites easier to read and
understand.

Signed-off-by: Chris Wilson
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 38 +++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 8f221d9a7bdb..8b7085e4cf84 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1564,3 +1564,41 @@ int intel_init_blt_ring_buffer(struct drm_device *dev)
 
 	return intel_init_ring_buffer(dev, ring);
 }
+
+int
+intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
+{
+	int ret;
+
+	if (!ring->gpu_caches_dirty)
+		return 0;
+
+	ret = ring->flush(ring, 0, I915_GEM_GPU_DOMAINS);
+	if (ret)
+		return ret;
+
+	trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
+
+	ring->gpu_caches_dirty = false;
+	return 0;
+}
+
+int
+intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring)
+{
+	uint32_t flush_domains;
+	int ret;
+
+	flush_domains = 0;
+	if (ring->gpu_caches_dirty)
+		flush_domains = I915_GEM_GPU_DOMAINS;
+
+	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+	if (ret)
+		return ret;
+
+	trace_i915_gem_ring_flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+
+	ring->gpu_caches_dirty = false;
+	return 0;
+}
-- cgit v1.2.3
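A hypothetical callsite sketch (editorial, not part of the patch; the
surrounding logic is assumed) of how the curried helpers read in
practice. Where callers previously had to spell out both domain masks
to i915_gem_flush_ring(ring, invalidate, flush), they can now state
their intent directly:

	/* Before submitting a batch: invalidate stale GPU caches, and pick
	 * up any flush we previously failed to queue (gpu_caches_dirty). */
	ret = intel_ring_invalidate_all_caches(ring);
	if (ret)
		return ret;

	/* After emitting commands whose results must become visible
	 * outside the GPU domains: flush, unless already clean. */
	ret = intel_ring_flush_all_caches(ring);
	if (ret)
		return ret;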
From e1ef7cc299839e68dae3f1843f62e52acda04538 Mon Sep 17 00:00:00 2001
From: Ben Widawsky
Date: Tue, 24 Jul 2012 20:47:31 -0700
Subject: drm/i915: Macro to determine DPF support

Originally I had a macro specifically for DPF support, and Daniel, with
good reason, asked me to change it to this. It's not the way I would
have gone (and indeed I didn't), but for now there is no distinction as
all platforms with L3 also have DPF.

Note: The good reasons are that DPF is an L3$ feature (at least on
current hw), hence I don't expect one to go without the other.

Signed-off-by: Ben Widawsky
[danvet: added note]
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 8b7085e4cf84..c58f1b91d08b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -454,7 +454,7 @@ static int init_render_ring(struct intel_ring_buffer *ring)
 	if (INTEL_INFO(dev)->gen >= 6)
 		I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
 
-	if (IS_IVYBRIDGE(dev))
+	if (HAS_L3_GPU_CACHE(dev))
 		I915_WRITE_IMR(ring, ~GEN6_RENDER_L3_PARITY_ERROR);
 
 	return ret;
@@ -844,7 +844,7 @@ gen6_ring_get_irq(struct intel_ring_buffer *ring)
 
 	spin_lock_irqsave(&dev_priv->irq_lock, flags);
 	if (ring->irq_refcount++ == 0) {
-		if (IS_IVYBRIDGE(dev) && ring->id == RCS)
+		if (HAS_L3_GPU_CACHE(dev) && ring->id == RCS)
 			I915_WRITE_IMR(ring, ~(ring->irq_enable_mask |
 					       GEN6_RENDER_L3_PARITY_ERROR));
 		else
@@ -867,7 +867,7 @@ gen6_ring_put_irq(struct intel_ring_buffer *ring)
 
 	spin_lock_irqsave(&dev_priv->irq_lock, flags);
 	if (--ring->irq_refcount == 0) {
-		if (IS_IVYBRIDGE(dev) && ring->id == RCS)
+		if (HAS_L3_GPU_CACHE(dev) && ring->id == RCS)
 			I915_WRITE_IMR(ring, ~GEN6_RENDER_L3_PARITY_ERROR);
 		else
 			I915_WRITE_IMR(ring, ~0);
-- cgit v1.2.3
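The HAS_L3_GPU_CACHE() definition itself lands in i915_drv.h and is not
visible in the hunks above. Going by the commit message, at this point
in history it presumably reduces to the Ivybridge check it replaces; a
sketch under that assumption:

	/* Assumed i915_drv.h side of this patch: today only IVB has an
	 * L3 GPU cache, and every platform with L3 also has DPF. */
	#define HAS_L3_GPU_CACHE(dev) (IS_IVYBRIDGE(dev))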
From 6c6cf5aa9c583478b19e23149feaa92d01fb8c2d Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 20 Jul 2012 18:02:28 +0100
Subject: drm/i915: Only apply the SNB pipe control w/a to gen6

The requirement for the sync flush to be emitted prior to the render
cache flush is only true for SandyBridge. On IvyBridge and friends we
can just emit the flushes with an inline CS stall.

Signed-off-by: Chris Wilson
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index c58f1b91d08b..8733da529edf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -214,15 +214,8 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 		  u32 invalidate_domains, u32 flush_domains)
 {
 	u32 flags = 0;
-	struct pipe_control *pc = ring->private;
-	u32 scratch_addr = pc->gtt_offset + 128;
 	int ret;
 
-	/* Force SNB workarounds for PIPE_CONTROL flushes */
-	ret = intel_emit_post_sync_nonzero_flush(ring);
-	if (ret)
-		return ret;
-
 	/* Just flush everything.  Experiments have shown that reducing the
 	 * number of bits based on the write domains has little performance
 	 * impact.
@@ -242,21 +235,33 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 	if (flush_domains)
 		flags |= PIPE_CONTROL_CS_STALL;
 
-	ret = intel_ring_begin(ring, 6);
+	ret = intel_ring_begin(ring, 4);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
 	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0); /* lower dword */
-	intel_ring_emit(ring, 0); /* uppwer dword */
-	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, 0);
 	intel_ring_advance(ring);
 
 	return 0;
 }
 
+static int
+gen6_render_ring_flush__wa(struct intel_ring_buffer *ring,
+			   u32 invalidate_domains, u32 flush_domains)
+{
+	int ret;
+
+	/* Force SNB workarounds for PIPE_CONTROL flushes */
+	ret = intel_emit_post_sync_nonzero_flush(ring);
+	if (ret)
+		return ret;
+
+	return gen6_render_ring_flush(ring, invalidate_domains, flush_domains);
+}
+
 static void ring_write_tail(struct intel_ring_buffer *ring,
 			    u32 value)
 {
@@ -1371,6 +1376,8 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 	if (INTEL_INFO(dev)->gen >= 6) {
 		ring->add_request = gen6_add_request;
 		ring->flush = gen6_render_ring_flush;
+		if (INTEL_INFO(dev)->gen == 6)
+			ring->flush = gen6_render_ring_flush__wa;
 		ring->irq_get = gen6_ring_get_irq;
 		ring->irq_put = gen6_ring_put_irq;
 		ring->irq_enable_mask = GT_USER_INTERRUPT;
-- cgit v1.2.3

From b2eadbc85b2c26df3fd2fe5c53c2a47cfd307249 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Thu, 9 Aug 2012 10:58:30 +0100
Subject: drm/i915: Lazily apply the SNB+ seqno w/a

Avoid the forcewake overhead when simply retiring requests, as often
the last seen seqno is good enough to satisfy the retirement process
and will be promptly re-run in any case. Only ensure that we force the
coherent seqno read when we are explicitly waiting upon a completion
event to be sure that none go missing, and also for when we are
reporting seqno values in case of error or debugging.

This greatly reduces the load for userspace using the busy-ioctl to
track active buffers, for instance halving the CPU used by X in pushing
the pixels from a software render (flash). The effect will be even more
magnified with userptr providing a zero-copy upload path in that
instance, or in similar instances where X is simply compositing DRI
buffers.

v2: Reverse the polarity of the tachyon stream. Daniel suggested that
'force' was too generic for the parameter name and that
'lazy_coherency' better encapsulated the semantics of it being an
optimization and its purpose. Also notice that gen6_get_seqno() is
only used by gen6/7 chipsets and so the test for IS_GEN6 || IS_GEN7 is
redundant in that function.
Signed-off-by: Chris Wilson Reviewed-by: Daniel Vetter Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_ringbuffer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 8733da529edf..e278675cdff9 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -625,26 +625,24 @@ pc_render_add_request(struct intel_ring_buffer *ring, } static u32 -gen6_ring_get_seqno(struct intel_ring_buffer *ring) +gen6_ring_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency) { - struct drm_device *dev = ring->dev; - /* Workaround to force correct ordering between irq and seqno writes on * ivb (and maybe also on snb) by reading from a CS register (like * ACTHD) before reading the status page. */ - if (IS_GEN6(dev) || IS_GEN7(dev)) + if (!lazy_coherency) intel_ring_get_active_head(ring); return intel_read_status_page(ring, I915_GEM_HWS_INDEX); } static u32 -ring_get_seqno(struct intel_ring_buffer *ring) +ring_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency) { return intel_read_status_page(ring, I915_GEM_HWS_INDEX); } static u32 -pc_render_get_seqno(struct intel_ring_buffer *ring) +pc_render_get_seqno(struct intel_ring_buffer *ring, bool lazy_coherency) { struct pipe_control *pc = ring->private; return pc->cpu_page[0]; -- cgit v1.2.3 From 86a1ee26bb60e1ab8984e92f0e9186c354670aed Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 11 Aug 2012 15:41:04 +0100 Subject: drm/i915: Only pwrite through the GTT if there is space in the aperture Avoid stalling and waiting for the GPU by checking to see if there is sufficient inactive space in the aperture for us to bind the buffer prior to writing through the GTT. If there is inadequate space we will have to stall waiting for the GPU, and incur overheads moving objects about. Instead, only incur the clflush overhead on the target object by writing through shmem. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_ringbuffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index c828169c73ae..ac93643731aa 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -391,7 +391,7 @@ init_pipe_control(struct intel_ring_buffer *ring) i915_gem_object_set_cache_level(obj, I915_CACHE_LLC); - ret = i915_gem_object_pin(obj, 4096, true); + ret = i915_gem_object_pin(obj, 4096, true, false); if (ret) goto err_unref; @@ -979,7 +979,7 @@ static int init_status_page(struct intel_ring_buffer *ring) i915_gem_object_set_cache_level(obj, I915_CACHE_LLC); - ret = i915_gem_object_pin(obj, 4096, true); + ret = i915_gem_object_pin(obj, 4096, true, false); if (ret != 0) { goto err_unref; } @@ -1036,7 +1036,7 @@ static int intel_init_ring_buffer(struct drm_device *dev, ring->obj = obj; - ret = i915_gem_object_pin(obj, PAGE_SIZE, true); + ret = i915_gem_object_pin(obj, PAGE_SIZE, true, false); if (ret) goto err_unref; -- cgit v1.2.3 From 4772eaebcdf86dd65630339dbe58316b90f80aed Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Fri, 17 Aug 2012 18:35:41 -0300 Subject: drm/i915: add gen7_render_ring_flush For now, just a copy of gen6_render_ring_flush. 
Different gens have different workarounds, so we want different
functions.

Signed-off-by: Paulo Zanoni
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 50 ++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index ac93643731aa..074b7d67c1c4 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -257,6 +257,54 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 	return 0;
 }
 
+static int
+gen7_render_ring_flush(struct intel_ring_buffer *ring,
+		       u32 invalidate_domains, u32 flush_domains)
+{
+	u32 flags = 0;
+	struct pipe_control *pc = ring->private;
+	u32 scratch_addr = pc->gtt_offset + 128;
+	int ret;
+
+	/* Just flush everything.  Experiments have shown that reducing the
+	 * number of bits based on the write domains has little performance
+	 * impact.
+	 */
+	if (flush_domains) {
+		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
+		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+		/*
+		 * Ensure that any following seqno writes only happen
+		 * when the render cache is indeed flushed.
+		 */
+		flags |= PIPE_CONTROL_CS_STALL;
+	}
+	if (invalidate_domains) {
+		flags |= PIPE_CONTROL_TLB_INVALIDATE;
+		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+		/*
+		 * TLB invalidate requires a post-sync write.
+		 */
+		flags |= PIPE_CONTROL_QW_WRITE;
+	}
+
+	ret = intel_ring_begin(ring, 4);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
+	intel_ring_emit(ring, flags);
+	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
+	intel_ring_emit(ring, 0);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
 static int
 gen6_render_ring_flush__wa(struct intel_ring_buffer *ring,
 			   u32 invalidate_domains, u32 flush_domains)
@@ -1385,7 +1433,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 
 	if (INTEL_INFO(dev)->gen >= 6) {
 		ring->add_request = gen6_add_request;
-		ring->flush = gen6_render_ring_flush;
+		ring->flush = gen7_render_ring_flush;
 		if (INTEL_INFO(dev)->gen == 6)
 			ring->flush = gen6_render_ring_flush__wa;
 		ring->irq_get = gen6_ring_get_irq;
-- cgit v1.2.3
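An editorial decoding of the 4-dword PIPE_CONTROL these flush functions
emit (a reading of the emission code above, not text from the commits;
the comments are editorial):

	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4)); /* DW0: opcode + length */
	intel_ring_emit(ring, flags);                  /* DW1: stall/flush/invalidate flags */
	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
	                                               /* DW2: post-sync write address */
	intel_ring_emit(ring, 0);                      /* DW3: post-sync immediate data */

gen6_render_ring_flush passes 0 for DW2 since it no longer requests a
post-sync write, while gen7_render_ring_flush keeps the scratch-page
address because PIPE_CONTROL_QW_WRITE may be set for TLB invalidation.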
From b31115092724925a434905dc3dbf83a2e752ba4b Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Fri, 17 Aug 2012 18:35:42 -0300
Subject: drm/i915: add workarounds directly to gen6_render_ring_flush

Since gen7+ now runs the new gen7_render_ring_flush function.

Signed-off-by: Paulo Zanoni
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 074b7d67c1c4..42a4b85b0eae 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -218,6 +218,11 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 	u32 scratch_addr = pc->gtt_offset + 128;
 	int ret;
 
+	/* Force SNB workarounds for PIPE_CONTROL flushes */
+	ret = intel_emit_post_sync_nonzero_flush(ring);
+	if (ret)
+		return ret;
+
 	/* Just flush everything.  Experiments have shown that reducing the
 	 * number of bits based on the write domains has little performance
 	 * impact.
@@ -305,20 +310,6 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 	return 0;
 }
 
-static int
-gen6_render_ring_flush__wa(struct intel_ring_buffer *ring,
-			   u32 invalidate_domains, u32 flush_domains)
-{
-	int ret;
-
-	/* Force SNB workarounds for PIPE_CONTROL flushes */
-	ret = intel_emit_post_sync_nonzero_flush(ring);
-	if (ret)
-		return ret;
-
-	return gen6_render_ring_flush(ring, invalidate_domains, flush_domains);
-}
-
 static void ring_write_tail(struct intel_ring_buffer *ring,
 			    u32 value)
 {
@@ -1435,7 +1426,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 		ring->add_request = gen6_add_request;
 		ring->flush = gen7_render_ring_flush;
 		if (INTEL_INFO(dev)->gen == 6)
-			ring->flush = gen6_render_ring_flush__wa;
+			ring->flush = gen6_render_ring_flush;
 		ring->irq_get = gen6_ring_get_irq;
 		ring->irq_put = gen6_ring_put_irq;
 		ring->irq_enable_mask = GT_USER_INTERRUPT;
-- cgit v1.2.3
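intel_emit_post_sync_nonzero_flush() is referenced throughout this
series but never shown in these hunks. For context, a condensed sketch
of the SNB workaround as it exists elsewhere in the same file
(reconstructed, so details may differ from this exact tree state): two
back-to-back PIPE_CONTROLs, a scoreboard stall followed by a dummy
post-sync write to the scratch page:

	static int
	intel_emit_post_sync_nonzero_flush(struct intel_ring_buffer *ring)
	{
		struct pipe_control *pc = ring->private;
		u32 scratch_addr = pc->gtt_offset + 128;
		int ret;

		/* First PIPE_CONTROL: stall at the scoreboard, write nothing. */
		ret = intel_ring_begin(ring, 6);
		if (ret)
			return ret;
		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
		intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
				PIPE_CONTROL_STALL_AT_SCOREBOARD);
		intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
		intel_ring_emit(ring, 0); /* low dword */
		intel_ring_emit(ring, 0); /* high dword */
		intel_ring_emit(ring, MI_NOOP);
		intel_ring_advance(ring);

		/* Second PIPE_CONTROL: the non-zero post-sync write the w/a
		 * requires before the real flush can be emitted. */
		ret = intel_ring_begin(ring, 6);
		if (ret)
			return ret;
		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
		intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE);
		intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, 0);
		intel_ring_emit(ring, MI_NOOP);
		intel_ring_advance(ring);

		return 0;
	}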
From f39876317a69a104eeaed002d4085348e871bfd1 Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Fri, 17 Aug 2012 18:35:43 -0300
Subject: drm/i915: add workarounds to gen7_render_ring_flush

From Bspec, Vol 2a, Section 1.9.3.4 "PIPE_CONTROL", intro section
detailing the various workarounds:

"[DevIVB {W/A}, DevHSW {W/A}]: Pipe_control with CS-stall bit set must
be issued before a pipe-control command that has the State Cache
Invalidate bit set."

Note that public Bspec has different numbering, it's Vol2Part1,
Section 1.10.4.1 "PIPE_CONTROL" there.

There's also a second workaround for the PIPE_CONTROL command itself:

"[DevIVB, DevVLV, DevHSW] {WA}: Every 4th PIPE_CONTROL command, not
counting the PIPE_CONTROL with only read-cache-invalidate bit(s) set,
must have a CS_STALL bit set"

For simplicity we simply set the CS_STALL bit on every pipe_control on
gen7+

Note that this massively helps on some hsw machines: together with the
following patch to unconditionally set the CS_STALL bit on every
pipe_control, it prevents a gpu hang every few seconds.

This is a regression that was introduced in the pipe_control cleanup:

commit 6c6cf5aa9c583478b19e23149feaa92d01fb8c2d
Author: Chris Wilson
Date:   Fri Jul 20 18:02:28 2012 +0100

    drm/i915: Only apply the SNB pipe control w/a to gen6

It looks like the massive snb pipe_control workaround also papered
over any issues on ivb and hsw.

Signed-off-by: Paulo Zanoni
[danvet: squashed both workarounds together, pimped commit message
with Bspec citations, regression commit citation and changed the
comment in the code a bit to clarify that we unconditionally set
CS_STALL to avoid being hurt by trying to be clever.]
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 39 ++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

(limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 42a4b85b0eae..55cdb4d30a16 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -262,6 +262,25 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 	return 0;
 }
 
+static int
+gen7_render_ring_cs_stall_wa(struct intel_ring_buffer *ring)
+{
+	int ret;
+
+	ret = intel_ring_begin(ring, 4);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
+	intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
+			      PIPE_CONTROL_STALL_AT_SCOREBOARD);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, 0);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
 static int
 gen7_render_ring_flush(struct intel_ring_buffer *ring,
 		       u32 invalidate_domains, u32 flush_domains)
@@ -271,6 +290,16 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 	u32 scratch_addr = pc->gtt_offset + 128;
 	int ret;
 
+	/*
+	 * Ensure that any following seqno writes only happen when the render
+	 * cache is indeed flushed.
+	 *
+	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
+	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
+	 * don't try to be clever and just set it unconditionally.
+	 */
+	flags |= PIPE_CONTROL_CS_STALL;
+
 	/* Just flush everything.  Experiments have shown that reducing the
 	 * number of bits based on the write domains has little performance
 	 * impact.
@@ -278,11 +307,6 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 	if (flush_domains) {
 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
-		/*
-		 * Ensure that any following seqno writes only happen
-		 * when the render cache is indeed flushed.
-		 */
-		flags |= PIPE_CONTROL_CS_STALL;
 	}
 	if (invalidate_domains) {
 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
@@ -295,6 +319,11 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 		 * TLB invalidate requires a post-sync write.
 		 */
 		flags |= PIPE_CONTROL_QW_WRITE;
+
+		/* Workaround: we must issue a pipe_control with CS-stall bit
+		 * set before a pipe_control command that has the state cache
+		 * invalidate bit set. */
+		gen7_render_ring_cs_stall_wa(ring);
 	}
 
 	ret = intel_ring_begin(ring, 4);
-- cgit v1.2.3

From 9da3da660d8c19a54f6e93361d147509be3fff84 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 1 Jun 2012 15:20:22 +0100
Subject: drm/i915: Replace the array of pages with a scatterlist

Rather than have multiple data structures for describing our page
layout in conjunction with the array of pages, we can migrate all users
over to a scatterlist.

One major advantage this offers, other than unifying the page-tracking
structures, is that we replace the vmalloc'ed array (which can be up to
a megabyte in size) with a chain of individual pages, which helps
reduce memory pressure.

The disadvantage is that we then do not have a simple array to iterate,
or to access randomly. The common case for this is in the relocation
processing, which will typically fit within a single scatterlist page
and so be almost the same cost as the simple array. For iterating over
the array, the extra function call could be optimised away, but in
reality is an insignificant cost of either binding the pages, or
performing the pwrite/pread.

v2: Fix drm_clflush_sg() to not invoke wbinvd as well!
And fix the trivial compile error from rebasing. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_ringbuffer.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c') diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 55cdb4d30a16..984a0c5fbf5d 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -464,7 +464,7 @@ init_pipe_control(struct intel_ring_buffer *ring) goto err_unref; pc->gtt_offset = obj->gtt_offset; - pc->cpu_page = kmap(obj->pages[0]); + pc->cpu_page = kmap(sg_page(obj->pages->sgl)); if (pc->cpu_page == NULL) goto err_unpin; @@ -491,7 +491,8 @@ cleanup_pipe_control(struct intel_ring_buffer *ring) return; obj = pc->obj; - kunmap(obj->pages[0]); + + kunmap(sg_page(obj->pages->sgl)); i915_gem_object_unpin(obj); drm_gem_object_unreference(&obj->base); @@ -1026,7 +1027,7 @@ static void cleanup_status_page(struct intel_ring_buffer *ring) if (obj == NULL) return; - kunmap(obj->pages[0]); + kunmap(sg_page(obj->pages->sgl)); i915_gem_object_unpin(obj); drm_gem_object_unreference(&obj->base); ring->status_page.obj = NULL; @@ -1053,7 +1054,7 @@ static int init_status_page(struct intel_ring_buffer *ring) } ring->status_page.gfx_addr = obj->gtt_offset; - ring->status_page.page_addr = kmap(obj->pages[0]); + ring->status_page.page_addr = kmap(sg_page(obj->pages->sgl)); if (ring->status_page.page_addr == NULL) { ret = -ENOMEM; goto err_unpin; -- cgit v1.2.3
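A closing note on the scatterlist conversion above: random indexing of
obj->pages[n] becomes a walk of the sg table, as the kmap(sg_page(...))
changes in the final diff show. A minimal sketch of the new iteration
pattern (the helper name is hypothetical; the field names come from the
diff above):

	#include <linux/scatterlist.h>

	/* Hypothetical helper: visit every backing page of an object,
	 * replacing the old obj->pages[i] array walk. */
	static void example_for_each_page(struct drm_i915_gem_object *obj)
	{
		struct scatterlist *sg;
		int i;

		for_each_sg(obj->pages->sgl, sg, obj->pages->nents, i) {
			struct page *page = sg_page(sg);
			/* ... kmap/clflush/etc. on 'page' ... */
			(void)page;
		}
	}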