Re: [PATCH] media: verisilicon: Add reference buffers compression

From: Benjamin Gaignard
Date: Thu May 16 2024 - 03:49:42 EST



Le 15/05/2024 à 19:31, Nicolas Dufresne a écrit :
Hi,

Le mardi 14 mai 2024 à 18:20 +0200, Benjamin Gaignard a écrit :
Reference frame compression is a feature added in G2 decoder to compress
frame buffers so that the bandwidth of storing/loading reference frames
can be reduced, especially when the resolution of decoded stream is of
high definition.

The impact of compressed frames is confirmed when using perf to monitor
the number of memory accesses with or without compression feature.
The following command
perf stat -a -e imx8_ddr0/cycles/,imx8_ddr0/read-cycles/,imx8_ddr0/write-cycles/ gst-launch-1.0 filesrc location=Jockey_3840x2160_120fps_420_8bit_HEVC_RAW.hevc ! queue ! h265parse ! v4l2slh265dec ! fakesink

give us these results
without compression feature:
Performance counter stats for 'system wide':

1711300345 imx8_ddr0/cycles/
892207924 imx8_ddr0/read-cycles/
1291785864 imx8_ddr0/write-cycles/

13.760048353 seconds time elapsed

with compression feature:
Performance counter stats for 'system wide':

274526799 imx8_ddr0/cycles/
453120194 imx8_ddr0/read-cycles/
833391434 imx8_ddr0/write-cycles/

18.257831534 seconds time elapsed
On my own testing the duration remained the same. Perhaps you had something else
running during your test ? I would suggest to always runs multiple time such a
test before sharing to avoid snapshot effect and possibly wrong interpretation

You need to force the decoder to produce NV12 to see the effect of patch.
I will update the test command line in the commit message.


regards,
Nicolas

As expected the number of read/write cycles are really lower when compression
is used.

Since storing compression data requires more memory a module
parameter named 'hevc_use_compression' is used to enable/disable
this feature and, by default, compression isn't used.
Perhaps a KConfig to change the default ?

That will be in v2
Regards,
Benjamin


Enabling compression feature means that decoder output frames
are stored with a specific compression pixel format. Since this
pixel format is unknown, this patch restrain compression feature
usage to the cases where post-processor pixels formats (NV12 or NV15)
are selected by the applications.

Fluster compliance HEVC test suite score is still 141/147 after this patch.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@xxxxxxxxxxxxx>
Tested-by: Nicolas Dufresne <nicolas.dufresne@xxxxxxxxxxxxx>

---
.../media/platform/verisilicon/hantro_g2.c | 35 +++++++++++++++++
.../platform/verisilicon/hantro_g2_hevc_dec.c | 20 ++++++++--
.../platform/verisilicon/hantro_g2_regs.h | 4 ++
.../media/platform/verisilicon/hantro_hevc.c | 8 ++++
.../media/platform/verisilicon/hantro_hw.h | 39 +++++++++++++++++++
.../platform/verisilicon/hantro_postproc.c | 6 ++-
6 files changed, 108 insertions(+), 4 deletions(-)

diff --git a/drivers/media/platform/verisilicon/hantro_g2.c b/drivers/media/platform/verisilicon/hantro_g2.c
index b880a6849d58..62ca91427360 100644
--- a/drivers/media/platform/verisilicon/hantro_g2.c
+++ b/drivers/media/platform/verisilicon/hantro_g2.c
@@ -9,6 +9,12 @@
#include "hantro_g2_regs.h"
#define G2_ALIGN 16
+#define CBS_SIZE 16 /* compression table size in bytes */
+#define CBS_LUMA 8 /* luminance CBS is composed of 1 8x8 coded block */
+#define CBS_CHROMA_W (8 * 2) /* chrominance CBS is composed of two 8x4 coded
+ * blocks, with Cb CB first then Cr CB following
+ */
+#define CBS_CHROMA_H 4
void hantro_g2_check_idle(struct hantro_dev *vpu)
{
@@ -56,3 +62,32 @@ size_t hantro_g2_motion_vectors_offset(struct hantro_ctx *ctx)
return ALIGN((cr_offset * 3) / 2, G2_ALIGN);
}
+
+static size_t hantro_g2_mv_size(struct hantro_ctx *ctx)
+{
+ const struct hantro_hevc_dec_ctrls *ctrls = &ctx->hevc_dec.ctrls;
+ const struct v4l2_ctrl_hevc_sps *sps = ctrls->sps;
+ unsigned int pic_width_in_ctbs, pic_height_in_ctbs;
+ unsigned int max_log2_ctb_size;
+
+ max_log2_ctb_size = sps->log2_min_luma_coding_block_size_minus3 + 3 +
+ sps->log2_diff_max_min_luma_coding_block_size;
+ pic_width_in_ctbs = (sps->pic_width_in_luma_samples +
+ (1 << max_log2_ctb_size) - 1) >> max_log2_ctb_size;
+ pic_height_in_ctbs = (sps->pic_height_in_luma_samples + (1 << max_log2_ctb_size) - 1)
+ >> max_log2_ctb_size;
+
+ return pic_width_in_ctbs * pic_height_in_ctbs * (1 << (2 * (max_log2_ctb_size - 4))) * 16;
+}
+
+size_t hantro_g2_luma_compress_offset(struct hantro_ctx *ctx)
+{
+ return hantro_g2_motion_vectors_offset(ctx) +
+ hantro_g2_mv_size(ctx);
+}
+
+size_t hantro_g2_chroma_compress_offset(struct hantro_ctx *ctx)
+{
+ return hantro_g2_luma_compress_offset(ctx) +
+ hantro_hevc_luma_compressed_size(ctx->dst_fmt.width, ctx->dst_fmt.height);
+}
diff --git a/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c b/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c
index d3f8c33eb16c..85a44143b378 100644
--- a/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c
+++ b/drivers/media/platform/verisilicon/hantro_g2_hevc_dec.c
@@ -367,11 +367,14 @@ static int set_ref(struct hantro_ctx *ctx)
const struct v4l2_ctrl_hevc_decode_params *decode_params = ctrls->decode_params;
const struct v4l2_hevc_dpb_entry *dpb = decode_params->dpb;
dma_addr_t luma_addr, chroma_addr, mv_addr = 0;
+ dma_addr_t compress_luma_addr, compress_chroma_addr = 0;
struct hantro_dev *vpu = ctx->dev;
struct vb2_v4l2_buffer *vb2_dst;
struct hantro_decoded_buffer *dst;
size_t cr_offset = hantro_g2_chroma_offset(ctx);
size_t mv_offset = hantro_g2_motion_vectors_offset(ctx);
+ size_t compress_luma_offset = hantro_g2_luma_compress_offset(ctx);
+ size_t compress_chroma_offset = hantro_g2_chroma_compress_offset(ctx);
u32 max_ref_frames;
u16 dpb_longterm_e;
static const struct hantro_reg cur_poc[] = {
@@ -445,6 +448,8 @@ static int set_ref(struct hantro_ctx *ctx)
chroma_addr = luma_addr + cr_offset;
mv_addr = luma_addr + mv_offset;
+ compress_luma_addr = luma_addr + compress_luma_offset;
+ compress_chroma_addr = luma_addr + compress_chroma_offset;
if (dpb[i].flags & V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE)
dpb_longterm_e |= BIT(V4L2_HEVC_DPB_ENTRIES_NUM_MAX - 1 - i);
@@ -452,6 +457,8 @@ static int set_ref(struct hantro_ctx *ctx)
hantro_write_addr(vpu, G2_REF_LUMA_ADDR(i), luma_addr);
hantro_write_addr(vpu, G2_REF_CHROMA_ADDR(i), chroma_addr);
hantro_write_addr(vpu, G2_REF_MV_ADDR(i), mv_addr);
+ hantro_write_addr(vpu, G2_REF_COMP_LUMA_ADDR(i), compress_luma_addr);
+ hantro_write_addr(vpu, G2_REF_COMP_CHROMA_ADDR(i), compress_chroma_addr);
}
vb2_dst = hantro_get_dst_buf(ctx);
@@ -465,19 +472,27 @@ static int set_ref(struct hantro_ctx *ctx)
chroma_addr = luma_addr + cr_offset;
mv_addr = luma_addr + mv_offset;
+ compress_luma_addr = luma_addr + compress_luma_offset;
+ compress_chroma_addr = luma_addr + compress_chroma_offset;
hantro_write_addr(vpu, G2_REF_LUMA_ADDR(i), luma_addr);
hantro_write_addr(vpu, G2_REF_CHROMA_ADDR(i), chroma_addr);
- hantro_write_addr(vpu, G2_REF_MV_ADDR(i++), mv_addr);
+ hantro_write_addr(vpu, G2_REF_MV_ADDR(i), mv_addr);
+ hantro_write_addr(vpu, G2_REF_COMP_LUMA_ADDR(i), compress_luma_addr);
+ hantro_write_addr(vpu, G2_REF_COMP_CHROMA_ADDR(i++), compress_chroma_addr);
hantro_write_addr(vpu, G2_OUT_LUMA_ADDR, luma_addr);
hantro_write_addr(vpu, G2_OUT_CHROMA_ADDR, chroma_addr);
hantro_write_addr(vpu, G2_OUT_MV_ADDR, mv_addr);
+ hantro_write_addr(vpu, G2_OUT_COMP_LUMA_ADDR, compress_luma_addr);
+ hantro_write_addr(vpu, G2_OUT_COMP_CHROMA_ADDR, compress_chroma_addr);
for (; i < V4L2_HEVC_DPB_ENTRIES_NUM_MAX; i++) {
hantro_write_addr(vpu, G2_REF_LUMA_ADDR(i), 0);
hantro_write_addr(vpu, G2_REF_CHROMA_ADDR(i), 0);
hantro_write_addr(vpu, G2_REF_MV_ADDR(i), 0);
+ hantro_write_addr(vpu, G2_REF_COMP_LUMA_ADDR(i), 0);
+ hantro_write_addr(vpu, G2_REF_COMP_CHROMA_ADDR(i), 0);
}
hantro_reg_write(vpu, &g2_refer_lterm_e, dpb_longterm_e);
@@ -594,8 +609,7 @@ int hantro_g2_hevc_dec_run(struct hantro_ctx *ctx)
/* Don't disable output */
hantro_reg_write(vpu, &g2_out_dis, 0);
- /* Don't compress buffers */
- hantro_reg_write(vpu, &g2_ref_compress_bypass, 1);
+ hantro_reg_write(vpu, &g2_ref_compress_bypass, !ctx->hevc_dec.use_compression);
/* Bus width and max burst */
hantro_reg_write(vpu, &g2_buswidth, BUS_WIDTH_128);
diff --git a/drivers/media/platform/verisilicon/hantro_g2_regs.h b/drivers/media/platform/verisilicon/hantro_g2_regs.h
index 82606783591a..b943b1816db7 100644
--- a/drivers/media/platform/verisilicon/hantro_g2_regs.h
+++ b/drivers/media/platform/verisilicon/hantro_g2_regs.h
@@ -318,6 +318,10 @@
#define G2_TILE_BSD_ADDR (G2_SWREG(183))
#define G2_DS_DST (G2_SWREG(186))
#define G2_DS_DST_CHR (G2_SWREG(188))
+#define G2_OUT_COMP_LUMA_ADDR (G2_SWREG(190))
+#define G2_REF_COMP_LUMA_ADDR(i) (G2_SWREG(192) + ((i) * 0x8))
+#define G2_OUT_COMP_CHROMA_ADDR (G2_SWREG(224))
+#define G2_REF_COMP_CHROMA_ADDR(i) (G2_SWREG(226) + ((i) * 0x8))
#define g2_strm_buffer_len G2_DEC_REG(258, 0, 0xffffffff)
#define g2_strm_start_offset G2_DEC_REG(259, 0, 0xffffffff)
diff --git a/drivers/media/platform/verisilicon/hantro_hevc.c b/drivers/media/platform/verisilicon/hantro_hevc.c
index 2c14330bc562..895dc0c76c74 100644
--- a/drivers/media/platform/verisilicon/hantro_hevc.c
+++ b/drivers/media/platform/verisilicon/hantro_hevc.c
@@ -25,6 +25,11 @@
#define MAX_TILE_COLS 20
#define MAX_TILE_ROWS 22
+bool hevc_use_compression;
+module_param_named(hevc_use_compression, hevc_use_compression, bool, 0644);
+MODULE_PARM_DESC(hevc_use_compression,
+ "Use reference frame compression for HEVC");
+
void hantro_hevc_ref_init(struct hantro_ctx *ctx)
{
struct hantro_hevc_dec_hw_ctx *hevc_dec = &ctx->hevc_dec;
@@ -275,5 +280,8 @@ int hantro_hevc_dec_init(struct hantro_ctx *ctx)
hantro_hevc_ref_init(ctx);
+ hevc_dec->use_compression =
+ hevc_use_compression & hantro_needs_postproc(ctx, ctx->vpu_dst_fmt);
+
return 0;
}
diff --git a/drivers/media/platform/verisilicon/hantro_hw.h b/drivers/media/platform/verisilicon/hantro_hw.h
index 7737320cc8cc..43d4ff637376 100644
--- a/drivers/media/platform/verisilicon/hantro_hw.h
+++ b/drivers/media/platform/verisilicon/hantro_hw.h
@@ -42,6 +42,14 @@
#define MAX_POSTPROC_BUFFERS 64
+#define G2_ALIGN 16
+#define CBS_SIZE 16 /* compression table size in bytes */
+#define CBS_LUMA 8 /* luminance CBS is composed of 1 8x8 coded block */
+#define CBS_CHROMA_W (8 * 2) /* chrominance CBS is composed of two 8x4 coded
+ * blocks, with Cb CB first then Cr CB following
+ */
+#define CBS_CHROMA_H 4
+
struct hantro_dev;
struct hantro_ctx;
struct hantro_buf;
@@ -144,6 +152,7 @@ struct hantro_hevc_dec_ctrls {
* @ref_bufs_used: Bitfield of used reference buffers
* @ctrls: V4L2 controls attached to a run
* @num_tile_cols_allocated: number of allocated tiles
+ * @use_compression: use reference buffer compression
*/
struct hantro_hevc_dec_hw_ctx {
struct hantro_aux_buf tile_sizes;
@@ -156,6 +165,7 @@ struct hantro_hevc_dec_hw_ctx {
u32 ref_bufs_used;
struct hantro_hevc_dec_ctrls ctrls;
unsigned int num_tile_cols_allocated;
+ bool use_compression;
};
/**
@@ -510,6 +520,33 @@ hantro_hevc_mv_size(unsigned int width, unsigned int height)
return width * height / 16;
}
+static inline size_t
+hantro_hevc_luma_compressed_size(unsigned int width, unsigned int height)
+{
+ u32 pic_width_in_cbsy =
+ round_up((width + CBS_LUMA - 1) / CBS_LUMA, CBS_SIZE);
+ u32 pic_height_in_cbsy = (height + CBS_LUMA - 1) / CBS_LUMA;
+
+ return round_up(pic_width_in_cbsy * pic_height_in_cbsy, CBS_SIZE);
+}
+
+static inline size_t
+hantro_hevc_chroma_compressed_size(unsigned int width, unsigned int height)
+{
+ u32 pic_width_in_cbsc =
+ round_up((width + CBS_CHROMA_W - 1) / CBS_CHROMA_W, CBS_SIZE);
+ u32 pic_height_in_cbsc = (height / 2 + CBS_CHROMA_H - 1) / CBS_CHROMA_H;
+
+ return round_up(pic_width_in_cbsc * pic_height_in_cbsc, CBS_SIZE);
+}
+
+static inline size_t
+hantro_hevc_compressed_size(unsigned int width, unsigned int height)
+{
+ return hantro_hevc_luma_compressed_size(width, height) +
+ hantro_hevc_chroma_compressed_size(width, height);
+}
+
static inline unsigned short hantro_av1_num_sbs(unsigned short dimension)
{
return DIV_ROUND_UP(dimension, 64);
@@ -525,6 +562,8 @@ hantro_av1_mv_size(unsigned int width, unsigned int height)
size_t hantro_g2_chroma_offset(struct hantro_ctx *ctx);
size_t hantro_g2_motion_vectors_offset(struct hantro_ctx *ctx);
+size_t hantro_g2_luma_compress_offset(struct hantro_ctx *ctx);
+size_t hantro_g2_chroma_compress_offset(struct hantro_ctx *ctx);
int hantro_g1_mpeg2_dec_run(struct hantro_ctx *ctx);
int rockchip_vpu2_mpeg2_dec_run(struct hantro_ctx *ctx);
diff --git a/drivers/media/platform/verisilicon/hantro_postproc.c b/drivers/media/platform/verisilicon/hantro_postproc.c
index 41e93176300b..232c93eea7ee 100644
--- a/drivers/media/platform/verisilicon/hantro_postproc.c
+++ b/drivers/media/platform/verisilicon/hantro_postproc.c
@@ -213,9 +213,13 @@ static unsigned int hantro_postproc_buffer_size(struct hantro_ctx *ctx)
else if (ctx->vpu_src_fmt->fourcc == V4L2_PIX_FMT_VP9_FRAME)
buf_size += hantro_vp9_mv_size(pix_mp.width,
pix_mp.height);
- else if (ctx->vpu_src_fmt->fourcc == V4L2_PIX_FMT_HEVC_SLICE)
+ else if (ctx->vpu_src_fmt->fourcc == V4L2_PIX_FMT_HEVC_SLICE) {
buf_size += hantro_hevc_mv_size(pix_mp.width,
pix_mp.height);
+ if (ctx->hevc_dec.use_compression)
+ buf_size += hantro_hevc_compressed_size(pix_mp.width,
+ pix_mp.height);
+ }
else if (ctx->vpu_src_fmt->fourcc == V4L2_PIX_FMT_AV1_FRAME)
buf_size += hantro_av1_mv_size(pix_mp.width,
pix_mp.height);