aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.github/CODE_OF_CONDUCT.md7
-rw-r--r--.github/actions/docker-build/action.yml2
-rw-r--r--.github/workflows/cancel-redundant-workflows.yml4
-rw-r--r--.github/workflows/compliance.yml8
-rwxr-xr-x.github/workflows/docker-images/dco-check/entrypoint.sh8
-rwxr-xr-x.github/workflows/docker-images/yocto-builder/entrypoint-build.sh5
-rw-r--r--.github/workflows/mirror.yml12
-rw-r--r--.github/workflows/yocto-builds.yml29
-rw-r--r--.github/workflows/yocto-layer.yml66
-rw-r--r--.readthedocs.yaml5
-rw-r--r--README.md25
-rw-r--r--classes/sdcard_image-rpi.bbclass7
-rw-r--r--conf/layer.conf2
-rw-r--r--conf/machine/include/rpi-base.inc62
-rw-r--r--conf/machine/include/rpi-default-providers.inc4
-rw-r--r--conf/machine/include/rpi-default-versions.inc3
-rw-r--r--conf/machine/raspberrypi-armv7.conf39
-rw-r--r--conf/machine/raspberrypi-armv8.conf45
-rw-r--r--conf/machine/raspberrypi-cm.conf2
-rw-r--r--conf/machine/raspberrypi.conf4
-rw-r--r--conf/machine/raspberrypi0-2w-64.conf4
-rw-r--r--conf/machine/raspberrypi0-2w.conf8
-rw-r--r--conf/machine/raspberrypi0.conf2
-rw-r--r--conf/machine/raspberrypi2.conf2
-rw-r--r--conf/machine/raspberrypi3-64.conf9
-rw-r--r--conf/machine/raspberrypi4-64.conf5
-rw-r--r--conf/machine/raspberrypi5.conf26
-rw-r--r--docs/conf.py6
-rw-r--r--docs/extra-build-config.md92
-rw-r--r--docs/index.rst1
-rw-r--r--docs/ipcompliance.md23
-rw-r--r--docs/layer-contents.md22
-rw-r--r--docs/requirements.txt1
-rw-r--r--dynamic-layers/meta-python/recipes-connectivity/lirc/lirc/lirc-gpio-ir-0.10.patch175
-rw-r--r--dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.%.bbappend (renamed from dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.1.bbappend)1
-rw-r--r--dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb6
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0001-utils-version.py-use-usr-bin-env-in-shebang.patch42
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0002-Revert-Support-compressed-pixel-formats-when-saving-.patch271
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps_git.bb44
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera.bbappend2
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera_%.bbappend2
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0001-configure-fix-linking-on-RISC-V-ISA.patch25
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0002-Revert-configure-Require-libmodplug-0.8.9.patch27
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0003-CVE-2022-41325.patch83
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0004-mmal_20.patch13826
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0005-mmal_exit_fix.patch19
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0006-mmal_chain.patch19
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0007-armv6.patch53
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0008-configure-Disable-incompatible-function-pointer-type.patch26
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0009-demux-dash-include-cstdint-needed-for-uint64_t.patch30
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2001-fix-luaL-checkint.patch236
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2002-use-vorbisidec.patch33
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3001-configure.ac-setup-for-OE-usage.patch124
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch61
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch43
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3004-use-GLESv2-headers-over-GL-headers.patch60
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3005-modules-remove-glspectrum-usage.patch149
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3006-codec-omxil_core.h-fix-multiple-definition-of.patch43
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3007-remove-xorg-related-link-libs.patch36
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3008-vo-Makefile.am-exclude-libgl_plugin.patch97
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch59
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch59
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/rpidistro-vlc_3.0.17.bb165
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-blinka_6.2.2.bb (renamed from dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-blinka_6.2.2.bb)0
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-busdevice_5.0.5.bb (renamed from dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-busdevice_5.0.5.bb)0
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motor_3.2.6.bb (renamed from dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-motor_3.2.6.bb)0
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motorkit_1.6.1.bb (renamed from dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-motorkit_1.6.1.bb)0
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-pca9685_3.3.4.bb (renamed from dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-pca9685_3.3.4.bb)0
-rw-r--r--img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.pngbin0 -> 277209 bytes
-rw-r--r--img/balena.pngbin7307 -> 5841 bytes
-rw-r--r--kas-poky-rpi.yml8
-rw-r--r--recipes-bsp/bootfiles/rpi-bootfiles.bb11
-rw-r--r--recipes-bsp/bootfiles/rpi-cmdline.bb10
-rw-r--r--recipes-bsp/bootfiles/rpi-config/0001-config.txt-reintroduce-start_x.patch55
-rw-r--r--recipes-bsp/bootfiles/rpi-config_git.bb40
-rw-r--r--recipes-bsp/common/raspberrypi-firmware.inc4
-rw-r--r--recipes-bsp/common/raspberrypi-tools.inc4
-rw-r--r--recipes-bsp/rpi-eeprom/rpi-eeprom_git.bb69
-rw-r--r--recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in4
-rw-r--r--recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb3
-rw-r--r--recipes-bsp/u-boot/files/0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch2
-rw-r--r--recipes-bsp/u-boot/u-boot_%.bbappend3
-rw-r--r--recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch13
-rw-r--r--recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch16
-rw-r--r--recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch13
-rw-r--r--recipes-connectivity/bluez5/bluez5/0004-Move-the-43xx-firmware-into-lib-firmware.patch25
-rw-r--r--recipes-connectivity/bluez5/bluez5/0004-Move-the-hciattach-firmware-into-lib-firmware.patch31
-rw-r--r--recipes-connectivity/bluez5/bluez5_%.bbappend2
-rw-r--r--recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch2
-rw-r--r--recipes-core/psplash/files/framebuf.conf4
-rw-r--r--recipes-core/psplash/psplash_%.bbappend10
-rw-r--r--recipes-core/udev/udev-rules-rpi.bb7
-rw-r--r--recipes-core/udev/udev-rules-rpi/99-com.rules21
-rw-r--r--recipes-devtools/bcm2835/bcm2835_1.73.bb (renamed from recipes-devtools/bcm2835/bcm2835_1.71.bb)3
-rw-r--r--recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.10.bb (renamed from recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.8.bb)3
-rw-r--r--recipes-devtools/python/python3-adafruit-platformdetect_3.27.0.bb (renamed from recipes-devtools/python/python3-adafruit-platformdetect_3.22.1.bb)3
-rw-r--r--recipes-devtools/python/rpi-gpio/0001-setup.py-Use-setuptools-instead-of-distutils.patch28
-rw-r--r--recipes-devtools/python/rpi-gpio_0.7.0.bb20
-rw-r--r--recipes-devtools/python/rpi-gpio_0.7.1.bb15
-rw-r--r--recipes-graphics/mesa/mesa-demos_%.bbappend5
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch2
-rw-r--r--recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch2
-rw-r--r--recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch2
-rw-r--r--recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch2
-rw-r--r--recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch2
-rw-r--r--recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch2
-rw-r--r--recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch2
-rw-r--r--recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch2
-rw-r--r--recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch2
-rw-r--r--recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch2
-rw-r--r--recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch2
-rw-r--r--recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch2
-rw-r--r--recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch2
-rw-r--r--recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch2
-rw-r--r--recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch2
-rw-r--r--recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch2
-rw-r--r--recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch2
-rw-r--r--recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch2
-rw-r--r--recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch2
-rw-r--r--recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch2
-rw-r--r--recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch2
-rw-r--r--recipes-graphics/userland/files/0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch725
-rw-r--r--recipes-graphics/userland/userland_git.bb5
-rw-r--r--recipes-graphics/wayland/weston_%.bbappend2
-rw-r--r--recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb20
-rw-r--r--recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro/0001-Default-43455-firmware-to-standard-variant.patch28
-rw-r--r--recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb176
-rw-r--r--recipes-kernel/linux/files/0001-Revert-selftests-bpf-Skip-perf-hw-events-test-if-the.patch35
-rw-r--r--recipes-kernel/linux/files/0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch50
-rw-r--r--recipes-kernel/linux/files/0002-Revert-selftests-bpf-Fix-perf_buffer-test-on-systems.patch94
-rw-r--r--recipes-kernel/linux/files/default-cpu-governor.cfg9
-rw-r--r--recipes-kernel/linux/files/raspberrypi4/rpi4-nvmem.cfg1
-rw-r--r--recipes-kernel/linux/files/rpi.scc1
-rw-r--r--recipes-kernel/linux/linux-raspberrypi-v7.inc13
-rw-r--r--recipes-kernel/linux/linux-raspberrypi-v7_5.15.bb6
-rw-r--r--recipes-kernel/linux/linux-raspberrypi-v7_6.1.bb6
-rw-r--r--recipes-kernel/linux/linux-raspberrypi-v7_6.6.bb6
-rw-r--r--recipes-kernel/linux/linux-raspberrypi.inc10
-rw-r--r--recipes-kernel/linux/linux-raspberrypi_5.10.bb19
-rw-r--r--recipes-kernel/linux/linux-raspberrypi_5.15.bb19
-rw-r--r--recipes-kernel/linux/linux-raspberrypi_6.1.bb31
-rw-r--r--recipes-kernel/linux/linux-raspberrypi_6.6.bb31
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch2
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch2
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch2
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch2
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend2
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_%.bbappend (renamed from recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_1.20.%.bbappend)0
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer_git.bb8
-rw-r--r--recipes-multimedia/picamera-libs/picamera-libs.bb2
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch292
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch34
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch30
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch68341
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0005-fix-flags.diff22
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch82
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch111
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch45
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch35
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb198
-rw-r--r--wic/sdimage-raspberrypi.wks2
174 files changed, 86636 insertions, 721 deletions
diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..f3e3d70
--- /dev/null
+++ b/.github/CODE_OF_CONDUCT.md
@@ -0,0 +1,7 @@
+## Code of Conduct
+
+This project has adopted the [Contributor
+Covenant](https://www.contributor-covenant.org/). For details, see the full
+text [here](https://www.contributor-covenant.org/version/2/1/code_of_conduct/).
+For more information, additional questions, or comments, contact the project's
+maintainers.
diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml
index 35fac92..b91668e 100644
--- a/.github/actions/docker-build/action.yml
+++ b/.github/actions/docker-build/action.yml
@@ -17,7 +17,7 @@ runs:
steps:
- name: Build the ${{ inputs.docker_image }} docker image
shell: bash
- # We run this unconditinally even if the change doesn't touch the
+ # We run this unconditionally even if the change doesn't touch the
# relevant docker files because there is a chance that another PR (or
# something else) rebuilt the local image. For example if the first
# version of the PR included change for the relevant docker image but a
diff --git a/.github/workflows/cancel-redundant-workflows.yml b/.github/workflows/cancel-redundant-workflows.yml
index 45a7443..556317d 100644
--- a/.github/workflows/cancel-redundant-workflows.yml
+++ b/.github/workflows/cancel-redundant-workflows.yml
@@ -15,9 +15,9 @@ on:
jobs:
cancel-redundant-workflows:
- runs-on: [self-hosted, Linux]
+ runs-on: ubuntu-latest
steps:
- - uses: styfle/cancel-workflow-action@0.9.1
+ - uses: styfle/cancel-workflow-action@0.10.0
with:
all_but_latest: true
workflow_id: ${{ github.event.workflow.id }}
diff --git a/.github/workflows/compliance.yml b/.github/workflows/compliance.yml
index 35e4731..ec489f0 100644
--- a/.github/workflows/compliance.yml
+++ b/.github/workflows/compliance.yml
@@ -10,10 +10,10 @@ on:
jobs:
dco:
name: DCO
- runs-on: [self-hosted, Linux]
+ runs-on: ubuntu-latest
steps:
- name: Checkout the code
- uses: actions/checkout@v2
+ uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Build a temporary DCO image
@@ -36,10 +36,10 @@ jobs:
if: always()
reuse:
name: reuse
- runs-on: [self-hosted, Linux]
+ runs-on: ubuntu-latest
steps:
- name: Checkout the code
- uses: actions/checkout@v2
+ uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Do reuse check
diff --git a/.github/workflows/docker-images/dco-check/entrypoint.sh b/.github/workflows/docker-images/dco-check/entrypoint.sh
index 135d410..af2c507 100755
--- a/.github/workflows/docker-images/dco-check/entrypoint.sh
+++ b/.github/workflows/docker-images/dco-check/entrypoint.sh
@@ -16,6 +16,14 @@ GIT_REPO_PATH="/work"
[ -d "$GIT_REPO_PATH/.git" ] ||
error "Can't find a git checkout under $GIT_REPO_PATH ."
cd "$GIT_REPO_PATH"
+
+# The GitHub runner user and the container user might differ, making git error
+# out with:
+# error: fatal: detected dubious ownership in repository at '/work'
+# Avoid this as the security risk is minimal here while guarding the git hooks
+# via PRs.
+git config --global --add safe.directory /work
+
dco-check \
--verbose \
--default-branch "origin/$BASE_REF"
diff --git a/.github/workflows/docker-images/yocto-builder/entrypoint-build.sh b/.github/workflows/docker-images/yocto-builder/entrypoint-build.sh
index a98fa2a..65999d0 100755
--- a/.github/workflows/docker-images/yocto-builder/entrypoint-build.sh
+++ b/.github/workflows/docker-images/yocto-builder/entrypoint-build.sh
@@ -42,10 +42,13 @@ printf "\n# ------ ci ------\n" >> conf/local.conf
cat <<EOCONF >>conf/local.conf
BB_NUMBER_THREADS = "6"
PARALLEL_MAKE = "-j 6"
-DISTRO_FEATURES:append = " systemd"
+# unmerged-usr is deprecated
+# https://lore.kernel.org/all/3f2f03085301d22854e5429019fb010f27d98bc7.camel@linuxfoundation.org/t/
+DISTRO_FEATURES:append = " systemd usrmerge"
VIRTUAL-RUNTIME_init_manager = "systemd"
DISTRO_FEATURES_BACKFILL_CONSIDERED:append = " sysvinit"
VIRTUAL-RUNTIME_initscripts = "systemd-compat-units"
+LICENSE_FLAGS_ACCEPTED = "synaptics-killswitch"
EOCONF
# Add the BSP layer
diff --git a/.github/workflows/mirror.yml b/.github/workflows/mirror.yml
index 11bb185..d9e3cde 100644
--- a/.github/workflows/mirror.yml
+++ b/.github/workflows/mirror.yml
@@ -12,11 +12,11 @@ concurrency:
jobs:
yocto-mirror:
name: Yocto Git Mirror
- runs-on: [self-hosted, Linux]
+ runs-on: ubuntu-latest
steps:
- - uses: agherzan/git-mirror-me-action@v1.0.0
+ - uses: agherzan/git-mirror-me-action@11f54c7186724daafbe5303b5075954f1a19a63e
env:
- SSH_PRIVATE_KEY: ${{ secrets.YOCTO_META_RASPBERRYPI_SSH_PRIVATE_KEY }}
- SSH_KNOWN_HOSTS: ${{ secrets.YOCTO_META_RASPBERRYPI_SSH_KNOWN_HOSTS }}
- with:
- destination-repository: "git@push.yoctoproject.org:meta-raspberrypi"
+ GMM_SSH_PRIVATE_KEY: ${{ secrets.YOCTO_META_RASPBERRYPI_SSH_PRIVATE_KEY }}
+ GMM_SSH_KNOWN_HOSTS: ${{ secrets.YOCTO_META_RASPBERRYPI_SSH_KNOWN_HOSTS }}
+ GMM_DST_REPO: "ssh://git@push.yoctoproject.org/meta-raspberrypi"
+ GMM_DEBUG: "1"
diff --git a/.github/workflows/yocto-builds.yml b/.github/workflows/yocto-builds.yml
index 3dba5e9..408d25e 100644
--- a/.github/workflows/yocto-builds.yml
+++ b/.github/workflows/yocto-builds.yml
@@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: MIT
+---
+
name: Builds
on:
@@ -23,25 +25,45 @@ jobs:
- raspberrypi3
- raspberrypi4-64
- raspberrypi4
+ - raspberrypi5
- raspberrypi-cm3
- raspberrypi-cm
+ - raspberrypi-armv7
+ - raspberrypi-armv8
image: [rpi-test-image]
distro: [poky]
runs-on: [self-hosted, Linux]
name: ${{ matrix.machine }}/${{ matrix.image }}/poky/systemd
env:
- DL_DIR: /var/lib/ci/yocto/downloads
- SSTATE_DIR: /var/lib/ci/yocto/sstate
+ DL_DIR: /var/lib/ci/yocto/downloads
+ SSTATE_DIR: /var/lib/ci/yocto/sstate
steps:
- name: Checkout the code
- uses: actions/checkout@v2
+ uses: actions/checkout@v3
with:
fetch-depth: 0
+ - name: Define Yocto build files
+ id: changed-files-specific
+ uses: tj-actions/changed-files@v24
+ with:
+ files: |
+ .github/actions/**
+ .github/workflows/docker-images/yocto-builder/**
+ .github/workflows/docker-images/*.sh
+ .github/workflows/yocto-builds.yml
+ classes/**
+ conf/**
+ dynamic-layers/**
+ files/**
+ lib/**
+ recipes-**
+ wic/**
- name: Build a temporary yocto-builder image
uses: ./.github/actions/docker-build
with:
docker_image: yocto-builder
id: ${{ github.event.number }}
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
- name: Build the image
run: |
docker run --rm \
@@ -56,6 +78,7 @@ jobs:
--env "SSTATE_DIR=$SSTATE_DIR" \
"yocto-builder-${{ github.event.number }}" \
/entrypoint-build.sh
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
- name: Cleanup temporary docker image
uses: ./.github/actions/docker-clean-image
with:
diff --git a/.github/workflows/yocto-layer.yml b/.github/workflows/yocto-layer.yml
index 3d5baf8..fa11815 100644
--- a/.github/workflows/yocto-layer.yml
+++ b/.github/workflows/yocto-layer.yml
@@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: MIT
+---
+
name: Yocto Compatible
on:
@@ -12,26 +14,44 @@ jobs:
name: Validate with yocto-check-layer
runs-on: [self-hosted, Linux]
steps:
- - name: Checkout the code
- uses: actions/checkout@v2
- with:
- fetch-depth: 0
- - name: Build a temporary yocto-builder image
- uses: ./.github/actions/docker-build
- with:
- docker_image: yocto-builder
- id: ${{ github.event.number }}
- - name: Run yocto-check-layer
- run: |
- docker run --rm -v "$GITHUB_WORKSPACE:/work:ro" \
- --env "BASE_REF=$GITHUB_BASE_REF" \
- "yocto-builder-${{ github.event.number }}" \
- /entrypoint-yocto-check-layer.sh
- - name: Cleanup temporary docker image
- uses: ./.github/actions/docker-clean-image
- with:
- docker_image: yocto-builder-${{ github.event.number }}
- if: always()
- - name: Cleanup dangling docker images
- uses: ./.github/actions/docker-clean-dangling
- if: always()
+ - name: Checkout the code
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - name: Define Yocto build files
+ id: changed-files-specific
+ uses: tj-actions/changed-files@v24
+ with:
+ files: |
+ .github/actions/**
+ .github/workflows/docker-images/yocto-builder/**
+ .github/workflows/docker-images/*.sh
+ .github/workflows/yocto-builds.yml
+ classes/**
+ conf/**
+ dynamic-layers/**
+ files/**
+ lib/**
+ recipes-**
+ wic/**
+ - name: Build a temporary yocto-builder image
+ uses: ./.github/actions/docker-build
+ with:
+ docker_image: yocto-builder
+ id: ${{ github.event.number }}
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
+ - name: Run yocto-check-layer
+ run: |
+ docker run --rm -v "$GITHUB_WORKSPACE:/work:ro" \
+ --env "BASE_REF=$GITHUB_BASE_REF" \
+ "yocto-builder-${{ github.event.number }}" \
+ /entrypoint-yocto-check-layer.sh
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
+ - name: Cleanup temporary docker image
+ uses: ./.github/actions/docker-clean-image
+ with:
+ docker_image: yocto-builder-${{ github.event.number }}
+ if: always()
+ - name: Cleanup dangling docker images
+ uses: ./.github/actions/docker-clean-dangling
+ if: always()
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 454f385..5e8dc20 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -1,6 +1,9 @@
version: 2
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.7"
python:
- version: "3.7"
install:
- requirements: docs/requirements.txt
diff --git a/README.md b/README.md
index 712b9a2..f00d151 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,22 @@ Yocto BSP layer for the Raspberry Pi boards - <http://www.raspberrypi.org/>.
[![Documentation Status](https://readthedocs.org/projects/meta-raspberrypi/badge/?version=latest)](https://meta-raspberrypi.readthedocs.io/en/latest/?badge=latest)
[![Matrix](https://img.shields.io/badge/chat-meta--raspberrypi-brightgreen)](https://matrix.to/#/#meta-raspberrypi:matrix.org)
-| | |
-|:-: | :-: |
-| Build server sponsored by | [balena.io](https://www.balena.io/) |
+<table border="0" rules="none">
+<tr border="0">
+<td width="140" height="100" align="center">
+ <br />
+ <a href="https://www.yoctoproject.org/ecosystem/branding/">
+ <img alt="Yocto Project Layer Compatible" src="img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.png">
+ </a>
+</td>
+<td width="150" height="100" align="center">
+ Sponsored by:<br />
+ <a href="https://balena.io">
+ <img alt="balena.io" src="img/balena.png">
+ </a>
+</td>
+</tr>
+</table>
## Quick links
@@ -31,6 +44,12 @@ OpenEmbedded/Yocto distributions and layer stacks, such as:
* Yoe Disto (Video and Camera Products).
* Yocto/Poky (main focus of testing).
+## Yocto Project Compatible Layer
+
+This layer is officially approved as part of the `Yocto Project Compatible
+Layers Program`. You can find details of that on the official Yocto Project
+[website](https://www.yoctoproject.org/software-overview/layers/?searchTerm=meta-raspberrypi).
+
## Dependencies
This layer depends on:
diff --git a/classes/sdcard_image-rpi.bbclass b/classes/sdcard_image-rpi.bbclass
index a7b9ac8..ddcd69d 100644
--- a/classes/sdcard_image-rpi.bbclass
+++ b/classes/sdcard_image-rpi.bbclass
@@ -25,11 +25,6 @@ inherit image_types
# This image depends on the rootfs image
IMAGE_TYPEDEP:rpi-sdimg = "${SDIMG_ROOTFS_TYPE}"
-# Kernel image name
-SDIMG_KERNELIMAGE:raspberrypi ?= "kernel.img"
-SDIMG_KERNELIMAGE:raspberrypi2 ?= "kernel7.img"
-SDIMG_KERNELIMAGE:raspberrypi3-64 ?= "kernel8.img"
-
# Boot partition volume id
# Shorten raspberrypi to just rpi to keep it under 11 characters
# now enforced by mkfs.vfat from dosfstools-4.2
@@ -65,7 +60,7 @@ do_image_rpi_sdimg[depends] = " \
do_image_rpi_sdimg[recrdeps] = "do_build"
# SD card image name
-SDIMG = "${IMGDEPLOYDIR}/${IMAGE_NAME}${IMAGE_NAME_SUFFIX}.rpi-sdimg"
+SDIMG = "${IMGDEPLOYDIR}/${IMAGE_NAME}.rpi-sdimg"
# Additional files and/or directories to be copied into the vfat partition from the IMAGE_ROOTFS.
FATPAYLOAD ?= ""
diff --git a/conf/layer.conf b/conf/layer.conf
index d7ad4ed..9488ac9 100644
--- a/conf/layer.conf
+++ b/conf/layer.conf
@@ -9,7 +9,7 @@ BBFILE_COLLECTIONS += "raspberrypi"
BBFILE_PATTERN_raspberrypi := "^${LAYERDIR}/"
BBFILE_PRIORITY_raspberrypi = "9"
-LAYERSERIES_COMPAT_raspberrypi = "kirkstone"
+LAYERSERIES_COMPAT_raspberrypi = "nanbield scarthgap"
LAYERDEPENDS_raspberrypi = "core"
# Additional license directories.
diff --git a/conf/machine/include/rpi-base.inc b/conf/machine/include/rpi-base.inc
index a2edf06..a5fd1a4 100644
--- a/conf/machine/include/rpi-base.inc
+++ b/conf/machine/include/rpi-base.inc
@@ -18,6 +18,7 @@ RPI_KERNEL_DEVICETREE_OVERLAYS ?= " \
overlays/overlay_map.dtb \
overlays/at86rf233.dtbo \
overlays/disable-bt.dtbo \
+ overlays/disable-wifi.dtbo \
overlays/dwc2.dtbo \
overlays/gpio-ir.dtbo \
overlays/gpio-ir-tx.dtbo \
@@ -25,8 +26,16 @@ RPI_KERNEL_DEVICETREE_OVERLAYS ?= " \
overlays/gpio-poweroff.dtbo \
overlays/gpio-shutdown.dtbo \
overlays/hifiberry-amp.dtbo \
+ overlays/hifiberry-amp100.dtbo \
+ overlays/hifiberry-amp3.dtbo \
+ overlays/hifiberry-amp4pro.dtbo \
overlays/hifiberry-dac.dtbo \
overlays/hifiberry-dacplus.dtbo \
+ overlays/hifiberry-dacplusadc.dtbo \
+ overlays/hifiberry-dacplusadcpro.dtbo \
+ overlays/hifiberry-dacplusdsp.dtbo \
+ overlays/hifiberry-dacplushd.dtbo \
+ overlays/hifiberry-digi-pro.dtbo \
overlays/hifiberry-digi.dtbo \
overlays/justboom-both.dtbo \
overlays/justboom-dac.dtbo \
@@ -35,6 +44,7 @@ RPI_KERNEL_DEVICETREE_OVERLAYS ?= " \
overlays/i2c-rtc.dtbo \
overlays/imx219.dtbo \
overlays/imx477.dtbo \
+ overlays/imx708.dtbo \
overlays/iqaudio-dac.dtbo \
overlays/iqaudio-dacplus.dtbo \
overlays/mcp2515-can0.dtbo \
@@ -52,6 +62,7 @@ RPI_KERNEL_DEVICETREE_OVERLAYS ?= " \
overlays/vc4-fkms-v3d-pi4.dtbo \
overlays/vc4-kms-v3d.dtbo \
overlays/vc4-kms-v3d-pi4.dtbo \
+ overlays/vc4-kms-v3d-pi5.dtbo \
overlays/vc4-kms-dsi-7inch.dtbo \
overlays/w1-gpio.dtbo \
overlays/w1-gpio-pullup.dtbo \
@@ -59,20 +70,23 @@ RPI_KERNEL_DEVICETREE_OVERLAYS ?= " \
"
RPI_KERNEL_DEVICETREE ?= " \
- bcm2708-rpi-zero.dtb \
- bcm2708-rpi-zero-w.dtb \
- bcm2708-rpi-b.dtb \
- bcm2708-rpi-b-rev1.dtb \
- bcm2708-rpi-b-plus.dtb \
- bcm2709-rpi-2-b.dtb \
- bcm2710-rpi-2-b.dtb \
- bcm2710-rpi-3-b.dtb \
- bcm2710-rpi-3-b-plus.dtb \
- bcm2711-rpi-4-b.dtb \
- bcm2711-rpi-400.dtb \
- bcm2708-rpi-cm.dtb \
- bcm2710-rpi-cm3.dtb \
- bcm2711-rpi-cm4.dtb \
+ broadcom/bcm2708-rpi-zero.dtb \
+ broadcom/bcm2708-rpi-zero-w.dtb \
+ broadcom/bcm2708-rpi-b.dtb \
+ broadcom/bcm2708-rpi-b-rev1.dtb \
+ broadcom/bcm2708-rpi-b-plus.dtb \
+ broadcom/bcm2709-rpi-2-b.dtb \
+ broadcom/bcm2710-rpi-2-b.dtb \
+ broadcom/bcm2710-rpi-3-b.dtb \
+ broadcom/bcm2710-rpi-3-b-plus.dtb \
+ broadcom/bcm2710-rpi-zero-2.dtb \
+ broadcom/bcm2711-rpi-4-b.dtb \
+ broadcom/bcm2711-rpi-400.dtb \
+ broadcom/bcm2708-rpi-cm.dtb \
+ broadcom/bcm2710-rpi-cm3.dtb \
+ broadcom/bcm2711-rpi-cm4.dtb \
+ broadcom/bcm2711-rpi-cm4s.dtb \
+ broadcom/bcm2712-rpi-5-b.dtb \
"
KERNEL_DEVICETREE ??= " \
@@ -109,14 +123,15 @@ SERIAL_CONSOLES_CHECK ??= "${SERIAL_CONSOLES}"
# This variable is referred to by recipes fetching / generating the files.
BOOTFILES_DIR_NAME ?= "bootfiles"
-# Set Raspberrypi splash image
-SPLASH ?= "psplash-raspberrypi"
-
def make_dtb_boot_files(d):
# Generate IMAGE_BOOT_FILES entries for device tree files listed in
# KERNEL_DEVICETREE.
alldtbs = d.getVar('KERNEL_DEVICETREE')
+ # DTBs may be built out of kernel with devicetree.bbclass
+ if not alldtbs:
+ return ''
+
def transform(dtb):
base = os.path.basename(dtb)
if dtb.endswith('dtbo') or base == 'overlay_map.dtb':
@@ -133,14 +148,21 @@ def make_dtb_boot_files(d):
return ' '.join([transform(dtb) for dtb in alldtbs.split(' ') if dtb])
+RPI_EXTRA_IMAGE_BOOT_FILES ?= " \
+ ${@bb.utils.contains('RPI_USE_U_BOOT', '1', \
+ '${KERNEL_IMAGETYPE} u-boot.bin;${SDIMG_KERNELIMAGE} boot.scr', \
+ '${KERNEL_IMAGETYPE};${SDIMG_KERNELIMAGE}', d)} \
+ "
IMAGE_BOOT_FILES ?= "${BOOTFILES_DIR_NAME}/* \
${@make_dtb_boot_files(d)} \
- ${@bb.utils.contains('RPI_USE_U_BOOT', '1', \
- '${KERNEL_IMAGETYPE} u-boot.bin;${SDIMG_KERNELIMAGE} boot.scr', \
- '${KERNEL_IMAGETYPE};${SDIMG_KERNELIMAGE}', d)} \
+ ${RPI_EXTRA_IMAGE_BOOT_FILES} \
"
+
+EXTRA_IMAGEDEPENDS += "rpi-bootfiles"
+
do_image_wic[depends] += " \
+ virtual/kernel:do_deploy \
rpi-bootfiles:do_deploy \
${@bb.utils.contains('RPI_USE_U_BOOT', '1', 'u-boot:do_deploy', '',d)} \
"
diff --git a/conf/machine/include/rpi-default-providers.inc b/conf/machine/include/rpi-default-providers.inc
index c02d248..3f81026 100644
--- a/conf/machine/include/rpi-default-providers.inc
+++ b/conf/machine/include/rpi-default-providers.inc
@@ -7,6 +7,10 @@ PREFERRED_PROVIDER_virtual/libgles2 ?= "${@bb.utils.contains("MACHINE_FEATURES",
PREFERRED_PROVIDER_virtual/libgl ?= "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "mesa", "mesa-gl", d)}"
PREFERRED_PROVIDER_virtual/mesa ?= "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "mesa", "mesa-gl", d)}"
PREFERRED_PROVIDER_virtual/libgbm ?= "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "mesa", "mesa-gl", d)}"
+PREFERRED_PROVIDER_vlc ?= "rpidistro-vlc"
+PREFERRED_PROVIDER_ffmpeg ?= "rpidistro-ffmpeg"
+PREFERRED_PROVIDER_libav ?= "rpidistro-ffmpeg"
+PREFERRED_PROVIDER_libpostproc ?= "rpidistro-ffmpeg"
PREFERRED_PROVIDER_jpeg ?= "jpeg"
PREFERRED_PROVIDER_virtual/libomxil ?= "userland"
diff --git a/conf/machine/include/rpi-default-versions.inc b/conf/machine/include/rpi-default-versions.inc
index 8ff2839..6def274 100644
--- a/conf/machine/include/rpi-default-versions.inc
+++ b/conf/machine/include/rpi-default-versions.inc
@@ -1,3 +1,4 @@
# RaspberryPi BSP default versions
-PREFERRED_VERSION_linux-raspberrypi ??= "5.15.%"
+PREFERRED_VERSION_linux-raspberrypi ??= "6.6.%"
+PREFERRED_VERSION_linux-raspberrypi-v7 ??= "${PREFERRED_VERSION_linux-raspberrypi}"
diff --git a/conf/machine/raspberrypi-armv7.conf b/conf/machine/raspberrypi-armv7.conf
new file mode 100644
index 0000000..cb2e5a2
--- /dev/null
+++ b/conf/machine/raspberrypi-armv7.conf
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+#@TYPE: Machine
+#@NAME: RaspberryPi Development Boards (32bit)
+#@DESCRIPTION: Machine configuration for the RaspberryPi boards in 32 bit mode
+
+DEFAULTTUNE ?= "cortexa7thf-neon-vfpv4"
+require conf/machine/include/arm/armv7a/tune-cortexa7.inc
+include conf/machine/include/rpi-base.inc
+
+# This machine includes by default the kernel for v7l. We hook in support for
+# v7.
+RASPBERRYPI_v7_KERNEL = "linux-raspberrypi-v7"
+RASPBERRYPI_v7_KERNEL_PACKAGE_NAME = "kernel-v7"
+RASPBERRYPI_v7_KERNEL_FILE ?= "kernel7.img"
+# We don't need a lot for v7l because it is the default provider,
+# virtual/kernel.
+RASPBERRYPI_v7l_KERNEL_FILE ?= "kernel7l.img"
+
+MACHINE_FEATURES += "pci"
+MACHINE_EXTRA_RRECOMMENDS += "\
+ linux-firmware-rpidistro-bcm43430 \
+ linux-firmware-rpidistro-bcm43436 \
+ linux-firmware-rpidistro-bcm43436s \
+ linux-firmware-rpidistro-bcm43455 \
+ linux-firmware-rpidistro-bcm43456 \
+ bluez-firmware-rpidistro-bcm43430a1-hcd \
+ bluez-firmware-rpidistro-bcm43430b0-hcd \
+ bluez-firmware-rpidistro-bcm4345c0-hcd \
+ bluez-firmware-rpidistro-bcm4345c5-hcd \
+"
+
+# FIXME: This machine doesn't support u-boot (yet)
+RPI_EXTRA_IMAGE_BOOT_FILES = " \
+ ${KERNEL_IMAGETYPE};${RASPBERRYPI_v7l_KERNEL_FILE} \
+ ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}/${KERNEL_IMAGETYPE};${RASPBERRYPI_v7_KERNEL_FILE} \
+"
diff --git a/conf/machine/raspberrypi-armv8.conf b/conf/machine/raspberrypi-armv8.conf
new file mode 100644
index 0000000..0128bdc
--- /dev/null
+++ b/conf/machine/raspberrypi-armv8.conf
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+#@TYPE: Machine
+#@NAME: RaspberryPi Development Boards (64bit)
+#@DESCRIPTION: Machine configuration for the RaspberryPi boards in 64 bit mode
+
+require conf/machine/include/arm/armv8a/tune-cortexa53.inc
+include conf/machine/include/rpi-base.inc
+
+MACHINE_FEATURES += "pci"
+MACHINE_EXTRA_RRECOMMENDS += "\
+ linux-firmware-rpidistro-bcm43430 \
+ linux-firmware-rpidistro-bcm43455 \
+ linux-firmware-rpidistro-bcm43456 \
+ linux-firmware-rpidistro-bcm43436 \
+ linux-firmware-rpidistro-bcm43436s \
+ bluez-firmware-rpidistro-bcm43430a1-hcd \
+ bluez-firmware-rpidistro-bcm43430b0-hcd \
+ bluez-firmware-rpidistro-bcm4345c0-hcd \
+ bluez-firmware-rpidistro-bcm4345c5-hcd \
+"
+
+RPI_KERNEL_DEVICETREE = " \
+ broadcom/bcm2710-rpi-3-b.dtb \
+ broadcom/bcm2710-rpi-3-b-plus.dtb \
+ broadcom/bcm2837-rpi-3-b.dtb \
+ broadcom/bcm2710-rpi-cm3.dtb \
+ broadcom/bcm2710-rpi-zero-2.dtb \
+ broadcom/bcm2711-rpi-4-b.dtb \
+ broadcom/bcm2711-rpi-400.dtb \
+ broadcom/bcm2711-rpi-cm4.dtb \
+ broadcom/bcm2711-rpi-cm4s.dtb \
+ broadcom/bcm2712-rpi-5-b.dtb \
+"
+
+SDIMG_KERNELIMAGE ?= "kernel8.img"
+KERNEL_IMAGETYPE_UBOOT ?= "Image"
+KERNEL_IMAGETYPE_DIRECT ?= "Image"
+KERNEL_BOOTCMD ?= "booti"
+UBOOT_MACHINE = "rpi_arm64_config"
+SERIAL_CONSOLES ?= "115200;ttyS0"
+
+VC4DTBO ?= "vc4-fkms-v3d"
diff --git a/conf/machine/raspberrypi-cm.conf b/conf/machine/raspberrypi-cm.conf
index f9371df..365d030 100644
--- a/conf/machine/raspberrypi-cm.conf
+++ b/conf/machine/raspberrypi-cm.conf
@@ -2,7 +2,7 @@
#@NAME: RaspberryPi Compute Module (CM1)
#@DESCRIPTION: Machine configuration for the RaspberryPi Compute Module (CM1)
-MACHINEOVERRIDES = "raspberrypi:${MACHINE}"
+MACHINEOVERRIDES =. "raspberrypi:"
include conf/machine/raspberrypi.conf
ARMSTUB ?= "armstub.bin"
diff --git a/conf/machine/raspberrypi.conf b/conf/machine/raspberrypi.conf
index b23687b..05263d7 100644
--- a/conf/machine/raspberrypi.conf
+++ b/conf/machine/raspberrypi.conf
@@ -7,8 +7,8 @@ DEFAULTTUNE ?= "arm1176jzfshf"
require conf/machine/include/tune-arm1176jzf-s.inc
include conf/machine/include/rpi-base.inc
-SERIAL_CONSOLES ?= "115200;ttyAMA0"
-
+SDIMG_KERNELIMAGE ?= "kernel.img"
UBOOT_MACHINE = "rpi_config"
+SERIAL_CONSOLES ?= "115200;ttyAMA0"
ARMSTUB ?= "armstub.bin"
diff --git a/conf/machine/raspberrypi0-2w-64.conf b/conf/machine/raspberrypi0-2w-64.conf
index 8e4729a..0264107 100644
--- a/conf/machine/raspberrypi0-2w-64.conf
+++ b/conf/machine/raspberrypi0-2w-64.conf
@@ -2,9 +2,9 @@
#@NAME: RaspberryPi0 2 Wifi Development Board
#@DESCRIPTION: Machine configuration for the RaspberryPi0 2 Wifi in 64 bits mode
-include conf/machine/raspberrypi3-64.conf
+MACHINEOVERRIDES =. "raspberrypi3-64:"
-MACHINEOVERRIDES := "${@'${MACHINEOVERRIDES}'.replace(':${MACHINE}',':raspberrypi3-64:${MACHINE}')}"
+include conf/machine/raspberrypi3-64.conf
MACHINE_EXTRA_RRECOMMENDS += "\
linux-firmware-rpidistro-bcm43436 \
diff --git a/conf/machine/raspberrypi0-2w.conf b/conf/machine/raspberrypi0-2w.conf
index c360d90..f3a4c4d 100644
--- a/conf/machine/raspberrypi0-2w.conf
+++ b/conf/machine/raspberrypi0-2w.conf
@@ -2,16 +2,12 @@
#@NAME: RaspberryPi0 2 Wifi Development Board
#@DESCRIPTION: Machine configuration for the RaspberryPi0 2 Wifi in 32 bits mode
-include conf/machine/raspberrypi3.conf
+MACHINEOVERRIDES =. "raspberrypi3:"
-MACHINEOVERRIDES := "${@'${MACHINEOVERRIDES}'.replace(':${MACHINE}',':raspberrypi3:${MACHINE}')}"
+include conf/machine/raspberrypi3.conf
MACHINE_EXTRA_RRECOMMENDS += "\
linux-firmware-rpidistro-bcm43436 \
linux-firmware-rpidistro-bcm43436s \
bluez-firmware-rpidistro-bcm43430b0-hcd \
"
-
-RPI_KERNEL_DEVICETREE = " \
- bcm2710-rpi-zero-2.dtb \
- "
diff --git a/conf/machine/raspberrypi0.conf b/conf/machine/raspberrypi0.conf
index 80297b5..597918a 100644
--- a/conf/machine/raspberrypi0.conf
+++ b/conf/machine/raspberrypi0.conf
@@ -2,7 +2,7 @@
#@NAME: RaspberryPi Zero Development Board
#@DESCRIPTION: Machine configuration for the RaspberryPi Zero board (https://www.raspberrypi.org/blog/raspberry-pi-zero)
-MACHINEOVERRIDES = "raspberrypi:${MACHINE}"
+MACHINEOVERRIDES =. "raspberrypi:"
include conf/machine/raspberrypi.conf
SERIAL_CONSOLES ?= "115200;ttyAMA0"
diff --git a/conf/machine/raspberrypi2.conf b/conf/machine/raspberrypi2.conf
index 403d15e..8cb859e 100644
--- a/conf/machine/raspberrypi2.conf
+++ b/conf/machine/raspberrypi2.conf
@@ -7,8 +7,8 @@ DEFAULTTUNE ?= "cortexa7thf-neon-vfpv4"
require conf/machine/include/arm/armv7a/tune-cortexa7.inc
include conf/machine/include/rpi-base.inc
+SDIMG_KERNELIMAGE ?= "kernel7.img"
SERIAL_CONSOLES ?= "115200;ttyAMA0"
-
UBOOT_MACHINE = "rpi_2_config"
ARMSTUB ?= "armstub7.bin"
diff --git a/conf/machine/raspberrypi3-64.conf b/conf/machine/raspberrypi3-64.conf
index 95475f3..50dd533 100644
--- a/conf/machine/raspberrypi3-64.conf
+++ b/conf/machine/raspberrypi3-64.conf
@@ -2,7 +2,7 @@
#@NAME: RaspberryPi 3 Development Board
#@DESCRIPTION: Machine configuration for the RaspberryPi 3 in 64 bits mode
-MACHINEOVERRIDES = "raspberrypi3:${MACHINE}"
+MACHINEOVERRIDES =. "raspberrypi3:"
MACHINE_EXTRA_RRECOMMENDS += "\
linux-firmware-rpidistro-bcm43430 \
@@ -21,16 +21,15 @@ RPI_KERNEL_DEVICETREE = " \
broadcom/bcm2710-rpi-cm3.dtb \
"
-SERIAL_CONSOLES ?= "115200;ttyS0"
-
-UBOOT_MACHINE = "rpi_arm64_config"
-
+SDIMG_KERNELIMAGE ?= "kernel8.img"
# When u-boot is enabled we need to use the "Image" format and the "booti"
# command to load the kernel
KERNEL_IMAGETYPE_UBOOT ?= "Image"
# "zImage" not supported on arm64 and ".gz" images not supported by bootloader yet
KERNEL_IMAGETYPE_DIRECT ?= "Image"
KERNEL_BOOTCMD ?= "booti"
+UBOOT_MACHINE = "rpi_arm64_config"
+SERIAL_CONSOLES ?= "115200;ttyS0"
VC4DTBO ?= "vc4-fkms-v3d"
ARMSTUB ?= "armstub8.bin"
diff --git a/conf/machine/raspberrypi4-64.conf b/conf/machine/raspberrypi4-64.conf
index 0cf7d51..42ed4be 100644
--- a/conf/machine/raspberrypi4-64.conf
+++ b/conf/machine/raspberrypi4-64.conf
@@ -2,7 +2,7 @@
#@NAME: RaspberryPi 4 Development Board (64bit)
#@DESCRIPTION: Machine configuration for the RaspberryPi 4 in 64 bits mode
-MACHINEOVERRIDES = "raspberrypi4:${MACHINE}"
+MACHINEOVERRIDES =. "raspberrypi4:"
MACHINE_FEATURES += "pci"
MACHINE_EXTRA_RRECOMMENDS += "\
@@ -12,8 +12,6 @@ MACHINE_EXTRA_RRECOMMENDS += "\
bluez-firmware-rpidistro-bcm4345c5-hcd \
"
-DEFAULTTUNE = "cortexa72"
-
require conf/machine/include/arm/armv8a/tune-cortexa72.inc
include conf/machine/include/rpi-base.inc
@@ -21,6 +19,7 @@ RPI_KERNEL_DEVICETREE = " \
broadcom/bcm2711-rpi-4-b.dtb \
broadcom/bcm2711-rpi-400.dtb \
broadcom/bcm2711-rpi-cm4.dtb \
+ broadcom/bcm2711-rpi-cm4s.dtb \
"
SDIMG_KERNELIMAGE ?= "kernel8.img"
diff --git a/conf/machine/raspberrypi5.conf b/conf/machine/raspberrypi5.conf
new file mode 100644
index 0000000..8c38637
--- /dev/null
+++ b/conf/machine/raspberrypi5.conf
@@ -0,0 +1,26 @@
+#@TYPE: Machine
+#@NAME: RaspberryPi 5 Development Board (64bit)
+#@DESCRIPTION: Machine configuration for the RaspberryPi 5 in 64 bits mode
+
+require conf/machine/include/arm/armv8-2a/tune-cortexa76.inc
+include conf/machine/include/rpi-base.inc
+
+MACHINE_FEATURES += "pci"
+MACHINE_EXTRA_RRECOMMENDS += "\
+ linux-firmware-rpidistro-bcm43455 \
+ bluez-firmware-rpidistro-bcm4345c0-hcd \
+ linux-firmware-rpidistro-bcm43456 \
+ bluez-firmware-rpidistro-bcm4345c5-hcd \
+"
+
+RPI_KERNEL_DEVICETREE = " \
+ broadcom/bcm2712-rpi-5-b.dtb \
+"
+
+SDIMG_KERNELIMAGE ?= "kernel_2712.img"
+SERIAL_CONSOLES ?= "115200;ttyAMA10"
+
+VC4DTBO ?= "vc4-kms-v3d"
+
+# "zImage" not supported on arm64 and ".gz" images not supported by bootloader yet
+KERNEL_IMAGETYPE_DIRECT ?= "Image"
diff --git a/docs/conf.py b/docs/conf.py
index e7a2491..39e7223 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -30,7 +30,10 @@
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
-extensions = ['myst_parser']
+extensions = [
+ 'myst_parser',
+ 'sphinx_rtd_theme'
+]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
@@ -121,6 +124,7 @@ todo_include_todos = False
# a list of builtin themes.
#
# html_theme = 'alabaster'
+html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
diff --git a/docs/extra-build-config.md b/docs/extra-build-config.md
index 9f14996..dfb86f1 100644
--- a/docs/extra-build-config.md
+++ b/docs/extra-build-config.md
@@ -74,6 +74,22 @@ To remove (or adjust) this delay set these variables in local.conf:
BOOT_DELAY = "0"
BOOT_DELAY_MS = "0"
+## Boot media
+
+The Raspberry Pi 4 board can load the boot image files from SD card and USB memory.
+By default SD card media is used as boot media.
+
+To switch the boot media from SD card to USB memory, the following variables are supported
+in local.conf: `CMDLINE_ROOT_PARTITION` and `BOOT_MEDIA`.
+The default value of `CMDLINE_ROOT_PARTITION` is "/dev/mmcblk0p2", which mounts the SD card partition. To mount a USB memory partition instead, set `CMDLINE_ROOT_PARTITION` to "/dev/sda2".
+`BOOT_MEDIA` accepts `mmc` and `usb`. The value "mmc" is required to load an image from the SD card, following the U-Boot specification. Similarly, to load a boot image file from USB memory, set `BOOT_MEDIA` to "usb".
+
+For example, if you want to use USB boot, please define
+the following parameters in your local.conf file.
+
+ CMDLINE_ROOT_PARTITION = "/dev/sda2"
+ BOOT_MEDIA = "usb"
+
## Set overclocking options
The Raspberry Pi can be overclocked. As of now overclocking up to the "Turbo
@@ -155,6 +171,16 @@ For further customisation the KERNEL_IMAGETYPE and KERNEL_BOOTCMD variables can
be overridden to select the exact kernel image type (eg. zImage) and u-boot
command (eg. bootz) to be used.
+To operate correctly, U-Boot requires `enable_uart=1` in `config.txt` file for
+the following boards:
+* Raspberry Pi Zero W
+* Raspberry Pi 3 32-bit
+* Raspberry Pi 3 64-bit
+* Raspberry Pi 4 32-bit
+* Raspberry Pi 4 64-bit
+It means that, for those boards, `RPI_USE_U_BOOT = "1"` is not compatible with
+`ENABLE_UART = "0"`.
+
## Image with Initramfs
To build an initramfs image:
@@ -170,7 +196,7 @@ To build an initramfs image:
- `INITRAMFS_IMAGE_BUNDLE = "1"`
- `BOOT_SPACE = "1073741"`
- `INITRAMFS_MAXSIZE = "315400"`
- - `IMAGE_FSTYPES_pn-${INITRAMFS_IMAGE} = "${INITRAMFS_FSTYPES}"`
+ - `IMAGE_FSTYPES:pn-${INITRAMFS_IMAGE} = "${INITRAMFS_FSTYPES}"`
## Including additional files in the SD card image boot partition
@@ -314,6 +340,13 @@ Some modules may require setting the frequency of the crystal oscillator used on
CAN_OSCILLATOR="8000000"
+Configure the interrupt pin to the one connected to the CAN module. By default,
+the pins are set to 25 for can0 and 24 for can1. To change them to 12 and 16,
+the following variables also have to be set:
+
+ CAN0_INTERRUPT_PIN = "12"
+ CAN1_INTERRUPT_PIN = "16"
+
Tested modules:
* PiCAN2 (16 MHz crystal): <http://skpang.co.uk/catalog/pican2-canbus-board-for-raspberry-pi-23-p-1475.html>
@@ -379,39 +412,43 @@ option:
# Raspberry Pi 7\" display/touch screen \n \
lcd_rotate=2 \n \
'
-## Enable Raspberrypi Camera V2
+## Enable Raspberry Pi Camera Module
-RaspberryPi does not have the unicam device ( RaspberryPi Camera ) enabled by default.
+Raspberry Pi does not have the unicam device ( Raspberry Pi Camera ) enabled by default.
Because this unicam device ( bcm2835-unicam ) as of now is used by libcamera opensource.
-So we have to explicitly set in local.conf.
+So we have to explicitly enable it in local.conf.
RASPBERRYPI_CAMERA_V2 = "1"
-This will add the device tree overlays imx219 ( RaspberryPi Camera sensor V2 driver ) to config.txt.
-Also, this will enable adding Contiguous Memory Allocation value in the cmdline.txt.
+This will add the device tree overlay imx219 ( Raspberry Pi Camera Module V2 sensor driver
+) to config.txt. Also, this will enable adding Contiguous Memory Allocation value in the
+cmdline.txt.
-Ref.:
-* <https://github.com/raspberrypi/documentation/blob/master/linux/software/libcamera/README.md>
+Similarly, the Raspberry Pi Camera Module v3 also has to be explicitly enabled in local.conf.
+
+ RASPBERRYPI_CAMERA_V3 = "1"
+
+This will add the device tree overlay imx708 ( Raspberry Pi Camera Module V3 sensor driver )
+to config.txt.
+
+See:
+* <https://www.raspberrypi.com/documentation/computers/camera_software.html>
* <https://www.raspberrypi.org/blog/an-open-source-camera-stack-for-raspberry-pi-using-libcamera/>
## WM8960 soundcard support
Support for WM8960 based sound cards such as the WM8960 Hi-Fi Sound Card HAT for Raspberry Pi from Waveshare, and ReSpeaker 2 / 4 / 6 Mics Pi HAT from Seeed Studio, can be enabled in `local.conf`
- ```conf
MACHINE_FEATURES += "wm8960"
- ```
You may need to adjust volume and toggle switches that are off by default
- ```bash
amixer -c1 sset 'Headphone',0 80%,80%
amixer -c1 sset 'Speaker',0 80%,80%
amixer -c1 sset 'Left Input Mixer Boost' toggle
amixer -c1 sset 'Left Output Mixer PCM' toggle
amixer -c1 sset 'Right Input Mixer Boost' toggle
amixer -c1 sset 'Right Output Mixer PCM' toggle
- ```
Audio capture on ReSpeaker 2 / 4 / 6 Mics Pi HAT from Seeed Studio is very noisy.
@@ -434,3 +471,34 @@ the device tree is properly tweaked. Also, mind the runtime components that
take advantage of your RTC device. You can do that by checking what is
included/configured in the build system based on the inclusion of `rtc` in
`MACHINE_FEATURES`.
+
+## Raspberry Pi Distro VLC
+
+To enable Raspberry Pi Distro VLC, the `meta-openembedded/meta-multimedia` layer must be
+included in your `bblayers.conf`.
+
+VLC does not support HW accelerated video decode through MMAL on a 64-bit OS.
+
+See:
+* <https://forums.raspberrypi.com/viewtopic.php?t=275370>
+* <https://forums.raspberrypi.com/viewtopic.php?t=325218#p1946169>
+
+MMAL is not enabled by default. To enable it add
+
+ DISABLE_VC4GRAPHICS = "1"
+
+to `local.conf`. Adding `vlc` to `IMAGE_INSTALL` will then default to building the Raspberry
+Pi's Distro implementation of VLC with HW accelerated video decode through MMAL into the system
+image. It also defaults to building VLC with Raspberry Pi's Distro implementation of ffmpeg. The
+oe-core implementation of ffmpeg and the meta-openembedded/meta-multimedia implementation of VLC
+can however be selected via:
+
+ PREFERRED_PROVIDER_ffmpeg = "ffmpeg"
+ PREFERRED_PROVIDER_vlc = "vlc"
+
+Usage example: Start VLC with mmal_vout plugin and without an active display server.
+
+ DISPLAYNUM=$(tvservice -l | tail -c 2)
+ MMAL_DISPLAY=$(expr $DISPLAYNUM + 1)
+ VLC_SETTINGS="-I dummy --vout=mmal_vout --mmal-resize --mmal-display hdmi-$MMAL_DISPLAY --no-dbus"
+ cvlc $VLC_SETTINGS <video/playlist>
diff --git a/docs/index.rst b/docs/index.rst
index 0d7ee07..3f8a088 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -15,6 +15,7 @@ Contents:
layer-contents
extra-build-config
extra-apps
+ ipcompliance
contributing
Indices and tables
diff --git a/docs/ipcompliance.md b/docs/ipcompliance.md
new file mode 100644
index 0000000..01540a8
--- /dev/null
+++ b/docs/ipcompliance.md
@@ -0,0 +1,23 @@
+# IP Compliance
+
+## linux-firmware-rpidistro
+
+By default, some of the machine configurations recommend packages for the
+WiFi/BT firmware, provided by
+[linux-firmware-rpidistro](https://github.com/RPi-Distro/firmware-nonfree).
+This package includes some firmware blobs under the `Synaptics` license which
+could carry a legal risk: one of the clauses can be (at least theoretically)
+used as a `killswitch`. This was
+[reported](https://github.com/RPi-Distro/firmware-nonfree/issues/29) in the
+upstream repository.
+
+You can find the full license text body in the content of the above mentioned
+package.
+
+Due to the above, the build system will only allow this recipe to be built if
+the user acknowledges this risk by adding the following configuration:
+
+ LICENSE_FLAGS_ACCEPTED = "synaptics-killswitch"
+
+You can provide this configuration as part of your `local.conf`, `distro.conf`,
+etc.
diff --git a/docs/layer-contents.md b/docs/layer-contents.md
index d12cb88..3882339 100644
--- a/docs/layer-contents.md
+++ b/docs/layer-contents.md
@@ -16,6 +16,28 @@
Note: The raspberrypi3 machines include support for Raspberry Pi 3B+.
+## Multi-board Machines
+
+This layer generally provides support for machines that are targeting a single
+Raspberry Pi board (or a small subset of them). This is so that the build
+infrastructure can tune and tweak the configuration with the flexibility to
+optimise for both runtime performance and disk storage.
+
+For use cases where compatibility with more boards is required, the layer provides
+machines that are targeting a wider range of Raspberry Pi boards.
+
+### raspberrypi-armv7
+
+This machine targets support for all the ARMv7-based Raspberry Pi boards. It
+will pull in the firmware and deploy the kernel image and kernel modules for
+all the relevant boards.
+
+### raspberrypi-armv8
+
+This machine targets support for all the ARMv8-based Raspberry Pi boards. It
+will pull in the firmware and deploy the kernel image and kernel modules for
+all the relevant boards.
+
## Images
* rpi-test-image
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 9e4694f..51eebd0 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1 +1,2 @@
myst_parser
+sphinx_rtd_theme
diff --git a/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc/lirc-gpio-ir-0.10.patch b/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc/lirc-gpio-ir-0.10.patch
deleted file mode 100644
index c0fdd18..0000000
--- a/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc/lirc-gpio-ir-0.10.patch
+++ /dev/null
@@ -1,175 +0,0 @@
-diff -ruN lirc-0.10.1.orig/lib/config_file.c lirc-0.10.1/lib/config_file.c
---- lirc-0.10.1.orig/lib/config_file.c 2017-09-10 17:52:19.000000000 +0900
-+++ lirc-0.10.1/lib/config_file.c 2019-06-26 00:39:45.734320696 +0900
-@@ -71,7 +71,7 @@
- typedef void* (*array_guest_func)(void* item, void* arg);
-
-
--#define LINE_LEN 1024
-+#define LINE_LEN 4096
- #define MAX_INCLUDES 10
-
- const char* whitespace = " \t";
-diff -ruN lirc-0.10.1.orig/lib/ir_remote.h lirc-0.10.1/lib/ir_remote.h
---- lirc-0.10.1.orig/lib/ir_remote.h 2017-09-10 17:52:19.000000000 +0900
-+++ lirc-0.10.1/lib/ir_remote.h 2019-06-26 00:39:45.714321224 +0900
-@@ -110,12 +110,17 @@
-
- static inline int is_pulse(lirc_t data)
- {
-- return data & PULSE_BIT ? 1 : 0;
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_PULSE) ? 1 : 0;
- }
-
- static inline int is_space(lirc_t data)
- {
-- return !is_pulse(data);
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_SPACE) ? 1 : 0;
-+}
-+
-+static inline int is_timeout(lirc_t data)
-+{
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_TIMEOUT) ? 1 : 0;
- }
-
- static inline int has_repeat(const struct ir_remote* remote)
-diff -ruN lirc-0.10.1.orig/lib/irrecord.c lirc-0.10.1/lib/irrecord.c
---- lirc-0.10.1.orig/lib/irrecord.c 2017-09-10 17:52:19.000000000 +0900
-+++ lirc-0.10.1/lib/irrecord.c 2019-06-26 00:39:45.724320960 +0900
-@@ -1398,9 +1398,16 @@
- state->retval = 0;
- return STS_LEN_TIMEOUT;
- }
-+ if (is_timeout(state->data)) {
-+ return STS_LEN_AGAIN;
-+ }
- state->count++;
- if (state->mode == MODE_GET_GAP) {
-- state->sum += state->data & PULSE_MASK;
-+ if (state->sum != 0 || is_pulse(state->data)) {
-+ state->sum += state->data & PULSE_MASK;
-+ }else{
-+ return STS_LEN_AGAIN;
-+ }
- if (state->average == 0 && is_space(state->data)) {
- if (state->data > 100000) {
- state->sum = 0;
-@@ -1472,6 +1479,10 @@
- state->keypresses = lastmaxcount;
- return STS_LEN_AGAIN;
- } else if (state->mode == MODE_HAVE_GAP) {
-+ if (state->count==1 && is_space(state->data)) {
-+ state->count = 0;
-+ return STS_LEN_AGAIN;
-+ }
- if (state->count <= MAX_SIGNALS) {
- signals[state->count - 1] = state->data & PULSE_MASK;
- } else {
-@@ -1510,7 +1521,7 @@
- /* such long pulses may appear with
- * crappy hardware (receiver? / remote?)
- */
-- else {
-+ else if(is_pulse(state->data)) {
- remote->gap = 0;
- return STS_LEN_NO_GAP_FOUND;
- }
-@@ -1811,22 +1822,24 @@
-
- static int raw_data_ok(struct button_state* btn_state)
- {
-- int r;
-+ int r = 0;
- int ref;
-
-- if (!is_space(btn_state->data)) {
-+ if (is_pulse(btn_state->data)) {
- r = 0;
-- } else if (is_const(&remote)) {
-- if (remote.gap > btn_state->sum) {
-- ref = (remote.gap - btn_state->sum);
-- ref *= (100 - remote.eps);
-- ref /= 100;
-+ } else if (is_space(btn_state->data)) {
-+ if (is_const(&remote)) {
-+ if (remote.gap > btn_state->sum) {
-+ ref = (remote.gap - btn_state->sum);
-+ ref *= (100 - remote.eps);
-+ ref /= 100;
-+ } else {
-+ ref = 0;
-+ }
-+ r = btn_state->data > ref;
- } else {
-- ref = 0;
-+ r = btn_state->data > (remote.gap * (100 - remote.eps)) / 100;
- }
-- r = btn_state->data > ref;
-- } else {
-- r = btn_state->data > (remote.gap * (100 - remote.eps)) / 100;
- }
- return r;
- }
-@@ -1970,7 +1983,7 @@
- btn_state->data = remote.gap;
- }
- if (btn_state->count == 0) {
-- if (!is_space(btn_state->data)
-+ if (is_pulse(btn_state->data)
- || btn_state->data <
- remote.gap - remote.gap * remote.eps /
- 100) {
-diff -ruN lirc-0.10.1.orig/lib/lirc/ir_remote.h lirc-0.10.1/lib/lirc/ir_remote.h
---- lirc-0.10.1.orig/lib/lirc/ir_remote.h 2017-09-10 17:52:58.000000000 +0900
-+++ lirc-0.10.1/lib/lirc/ir_remote.h 2019-06-26 00:39:45.724320960 +0900
-@@ -110,12 +110,17 @@
-
- static inline int is_pulse(lirc_t data)
- {
-- return data & PULSE_BIT ? 1 : 0;
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_PULSE) ? 1 : 0;
- }
-
- static inline int is_space(lirc_t data)
- {
-- return !is_pulse(data);
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_SPACE) ? 1 : 0;
-+}
-+
-+static inline int is_timeout(lirc_t data)
-+{
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_TIMEOUT) ? 1 : 0;
- }
-
- static inline int has_repeat(const struct ir_remote* remote)
-diff -ruN lirc-0.10.1.orig/tools/mode2.cpp lirc-0.10.1/tools/mode2.cpp
---- lirc-0.10.1.orig/tools/mode2.cpp 2017-09-10 17:52:19.000000000 +0900
-+++ lirc-0.10.1/tools/mode2.cpp 2019-06-26 00:45:38.840404976 +0900
-@@ -326,12 +326,24 @@
- void print_mode2_data(unsigned int data)
- {
- static int bitno = 1;
-+ static bool leading_space = true;
-+ unsigned int msg = data & LIRC_MODE2_MASK;
-
- switch (opt_dmode) {
- case 0:
-- printf("%s %u\n", (
-- data & PULSE_BIT) ? "pulse" : "space",
-- (uint32_t)(data & PULSE_MASK));
-+ if (leading_space && msg == LIRC_MODE2_SPACE ) {
-+ break;
-+ } else {
-+ leading_space = false;
-+ }
-+ if (msg == LIRC_MODE2_PULSE) {
-+ printf("pulse %u\n", (__u32)(data & PULSE_MASK));
-+ } else if (msg == LIRC_MODE2_SPACE) {
-+ printf("space %u\n", (__u32)(data & PULSE_MASK));
-+ } else if (msg == LIRC_MODE2_TIMEOUT) {
-+ printf("timeout %u\n", (__u32)(data & PULSE_MASK));
-+ leading_space = true;
-+ }
- break;
- case 1: {
- /* print output like irrecord raw config file data */
diff --git a/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.1.bbappend b/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.%.bbappend
index 22f8ce4..0ccd4f7 100644
--- a/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.1.bbappend
+++ b/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.%.bbappend
@@ -1,6 +1,5 @@
FILESEXTRAPATHS:prepend := "${THISDIR}/${PN}:"
SRC_URI:append:rpi = " \
- file://lirc-gpio-ir-0.10.patch \
file://lircd.service \
"
diff --git a/dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb b/dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb
index fd67580..cf745fc 100644
--- a/dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb
+++ b/dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb
@@ -18,7 +18,7 @@ DEPENDS += " \
"
RDEPENDS:${PN} += " \
- ${PYTHON_PN}-numpy \
- ${PYTHON_PN}-rtimu \
- ${PYTHON_PN}-pillow \
+ python3-numpy \
+ python3-rtimu \
+ python3-pillow \
"
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0001-utils-version.py-use-usr-bin-env-in-shebang.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0001-utils-version.py-use-usr-bin-env-in-shebang.patch
new file mode 100644
index 0000000..15f6bf4
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0001-utils-version.py-use-usr-bin-env-in-shebang.patch
@@ -0,0 +1,42 @@
+From bbc1ea3e4119c665723cfd1c5a364bc8c7cbb464 Mon Sep 17 00:00:00 2001
+From: Martin Jansa <Martin.Jansa@gmail.com>
+Date: Thu, 4 May 2023 18:07:16 +0000
+Subject: [PATCH] utils/version.py: use /usr/bin/env in shebang
+
+* it uses subprocess text=True which is available only since python-3.7
+ when running on host with python-3.6 it fails with:
+Traceback (most recent call last):
+ File "TOPDIR/BUILD/work/raspberrypi4_64-oe-linux/rpi-libcamera-apps/git-r0/git/utils/version.py", line 19, in generate_version
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, text=True)
+ File "/usr/lib/python3.6/subprocess.py", line 423, in run
+ with Popen(*popenargs, **kwargs) as process:
+TypeError: __init__() got an unexpected keyword argument 'text'
+
+During handling of the above exception, another exception occurred:
+
+Traceback (most recent call last):
+ File "TOPDIR/BUILD/work/raspberrypi4_64-oe-linux/rpi-libcamera-apps/git-r0/git/utils/version.py", line 52, in <module>
+ generate_version()
+ File "TOPDIR/BUILD/work/raspberrypi4_64-oe-linux/rpi-libcamera-apps/git-r0/git/utils/version.py", line 48, in generate_version
+ print(f'{commit} {datetime.now().strftime("%d-%m-%Y (%H:%M:%S)")}', end="")
+UnboundLocalError: local variable 'commit' referenced before assignment
+Generating version string:
+
+ even when newer python3 is in PATH (either from buildtools or from python3native)
+
+Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
+Upstream-Status: Pending
+---
+ utils/version.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/utils/version.py b/utils/version.py
+index 48d7e05..4a5e35c 100755
+--- a/utils/version.py
++++ b/utils/version.py
+@@ -1,4 +1,4 @@
+-#!/usr/bin/python3
++#!/usr/bin/env python3
+
+ # Copyright (C) 2021, Raspberry Pi (Trading) Limited
+ # Generate version information for rpicam-apps
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0002-Revert-Support-compressed-pixel-formats-when-saving-.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0002-Revert-Support-compressed-pixel-formats-when-saving-.patch
new file mode 100644
index 0000000..c965b2c
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0002-Revert-Support-compressed-pixel-formats-when-saving-.patch
@@ -0,0 +1,271 @@
+From 500f1e9eaeca29b255d0364e1383d70ade1d1177 Mon Sep 17 00:00:00 2001
+From: Martin Jansa <martin.jansa@gmail.com>
+Date: Tue, 30 Jan 2024 12:02:09 +0000
+Subject: [PATCH] Revert "Support compressed pixel formats when saving DNGs"
+
+This reverts commit a85aed7603a0b69a6685d3f81ee860246d5b1621.
+
+The reverted commit requires the RPi-specific fork of libcamera to provide e.g.
+formats::RGGB16_PISP_COMP1
+added in:
+https://github.com/raspberrypi/libcamera/commit/fb3cb844f2117f30d3eeece99d6ce4d02624e492
+but not included in libcamera from meta-oe:
+https://git.openembedded.org/meta-openembedded/commit/?id=711c6fbce39df685225bca081c5f42bae2de658b
+
+See https://github.com/raspberrypi/rpicam-apps/issues/627
+
+Upstream-Status: Pending
+---
+ image/dng.cpp | 205 ++++++++------------------------------------------
+ 1 file changed, 33 insertions(+), 172 deletions(-)
+
+diff --git a/image/dng.cpp b/image/dng.cpp
+index 7692f92..fc10439 100644
+--- a/image/dng.cpp
++++ b/image/dng.cpp
+@@ -33,47 +33,40 @@ struct BayerFormat
+ int bits;
+ char const *order;
+ bool packed;
+- bool compressed;
+ };
+
+ static const std::map<PixelFormat, BayerFormat> bayer_formats =
+ {
+- { formats::SRGGB10_CSI2P, { "RGGB-10", 10, TIFF_RGGB, true, false } },
+- { formats::SGRBG10_CSI2P, { "GRBG-10", 10, TIFF_GRBG, true, false } },
+- { formats::SBGGR10_CSI2P, { "BGGR-10", 10, TIFF_BGGR, true, false } },
+- { formats::SGBRG10_CSI2P, { "GBRG-10", 10, TIFF_GBRG, true, false } },
+-
+- { formats::SRGGB10, { "RGGB-10", 10, TIFF_RGGB, false, false } },
+- { formats::SGRBG10, { "GRBG-10", 10, TIFF_GRBG, false, false } },
+- { formats::SBGGR10, { "BGGR-10", 10, TIFF_BGGR, false, false } },
+- { formats::SGBRG10, { "GBRG-10", 10, TIFF_GBRG, false, false } },
+-
+- { formats::SRGGB12_CSI2P, { "RGGB-12", 12, TIFF_RGGB, true, false } },
+- { formats::SGRBG12_CSI2P, { "GRBG-12", 12, TIFF_GRBG, true, false } },
+- { formats::SBGGR12_CSI2P, { "BGGR-12", 12, TIFF_BGGR, true, false } },
+- { formats::SGBRG12_CSI2P, { "GBRG-12", 12, TIFF_GBRG, true, false } },
+-
+- { formats::SRGGB12, { "RGGB-12", 12, TIFF_RGGB, false, false } },
+- { formats::SGRBG12, { "GRBG-12", 12, TIFF_GRBG, false, false } },
+- { formats::SBGGR12, { "BGGR-12", 12, TIFF_BGGR, false, false } },
+- { formats::SGBRG12, { "GBRG-12", 12, TIFF_GBRG, false, false } },
+-
+- { formats::SRGGB16, { "RGGB-16", 16, TIFF_RGGB, false, false } },
+- { formats::SGRBG16, { "GRBG-16", 16, TIFF_GRBG, false, false } },
+- { formats::SBGGR16, { "BGGR-16", 16, TIFF_BGGR, false, false } },
+- { formats::SGBRG16, { "GBRG-16", 16, TIFF_GBRG, false, false } },
+-
+- { formats::R10_CSI2P, { "BGGR-10", 10, TIFF_BGGR, true, false } },
+- { formats::R10, { "BGGR-10", 10, TIFF_BGGR, false, false } },
++ { formats::SRGGB10_CSI2P, { "RGGB-10", 10, TIFF_RGGB, true } },
++ { formats::SGRBG10_CSI2P, { "GRBG-10", 10, TIFF_GRBG, true } },
++ { formats::SBGGR10_CSI2P, { "BGGR-10", 10, TIFF_BGGR, true } },
++ { formats::SGBRG10_CSI2P, { "GBRG-10", 10, TIFF_GBRG, true } },
++
++ { formats::SRGGB10, { "RGGB-10", 10, TIFF_RGGB, false } },
++ { formats::SGRBG10, { "GRBG-10", 10, TIFF_GRBG, false } },
++ { formats::SBGGR10, { "BGGR-10", 10, TIFF_BGGR, false } },
++ { formats::SGBRG10, { "GBRG-10", 10, TIFF_GBRG, false } },
++
++ { formats::SRGGB12_CSI2P, { "RGGB-12", 12, TIFF_RGGB, true } },
++ { formats::SGRBG12_CSI2P, { "GRBG-12", 12, TIFF_GRBG, true } },
++ { formats::SBGGR12_CSI2P, { "BGGR-12", 12, TIFF_BGGR, true } },
++ { formats::SGBRG12_CSI2P, { "GBRG-12", 12, TIFF_GBRG, true } },
++
++ { formats::SRGGB12, { "RGGB-12", 12, TIFF_RGGB, false } },
++ { formats::SGRBG12, { "GRBG-12", 12, TIFF_GRBG, false } },
++ { formats::SBGGR12, { "BGGR-12", 12, TIFF_BGGR, false } },
++ { formats::SGBRG12, { "GBRG-12", 12, TIFF_GBRG, false } },
++
++ { formats::SRGGB16, { "RGGB-16", 16, TIFF_RGGB, false } },
++ { formats::SGRBG16, { "GRBG-16", 16, TIFF_GRBG, false } },
++ { formats::SBGGR16, { "BGGR-16", 16, TIFF_BGGR, false } },
++ { formats::SGBRG16, { "GBRG-16", 16, TIFF_GBRG, false } },
++
++ { formats::R10_CSI2P, { "BGGR-10", 10, TIFF_BGGR, true } },
++ { formats::R10, { "BGGR-10", 10, TIFF_BGGR, false } },
+ // Currently not in the main libcamera branch
+ //{ formats::R12_CSI2P, { "BGGR-12", 12, TIFF_BGGR, true } },
+- { formats::R12, { "BGGR-12", 12, TIFF_BGGR, false, false } },
+-
+- /* PiSP compressed formats. */
+- { formats::RGGB16_PISP_COMP1, { "RGGB-16-PISP", 16, TIFF_RGGB, false, true } },
+- { formats::GRBG16_PISP_COMP1, { "GRBG-16-PISP", 16, TIFF_GRBG, false, true } },
+- { formats::GBRG16_PISP_COMP1, { "GBRG-16-PISP", 16, TIFF_GBRG, false, true } },
+- { formats::BGGR16_PISP_COMP1, { "BGGR-16-PISP", 16, TIFF_BGGR, false, true } },
++ { formats::R12, { "BGGR-12", 12, TIFF_BGGR, false } },
+ };
+
+ static void unpack_10bit(uint8_t const *src, StreamInfo const &info, uint16_t *dest)
+@@ -124,129 +117,6 @@ static void unpack_16bit(uint8_t const *src, StreamInfo const &info, uint16_t *d
+ }
+ }
+
+-// We always use these compression parameters.
+-#define COMPRESS_OFFSET 2048
+-#define COMPRESS_MODE 1
+-
+-static uint16_t postprocess(uint16_t a)
+-{
+- if (COMPRESS_MODE & 2)
+- {
+- if (COMPRESS_MODE == 3 && a < 0x4000)
+- a = a >> 2;
+- else if (a < 0x1000)
+- a = a >> 4;
+- else if (a < 0x1800)
+- a = (a - 0x800) >> 3;
+- else if (a < 0x3000)
+- a = (a - 0x1000) >> 2;
+- else if (a < 0x6000)
+- a = (a - 0x2000) >> 1;
+- else if (a < 0xC000)
+- a = (a - 0x4000);
+- else
+- a = 2 * (a - 0x8000);
+- }
+-
+- return std::min(0xFFFF, a + COMPRESS_OFFSET);
+-}
+-
+-static uint16_t dequantize(uint16_t q, int qmode)
+-{
+- switch (qmode)
+- {
+- case 0:
+- return (q < 320) ? 16 * q : 32 * (q - 160);
+-
+- case 1:
+- return 64 * q;
+-
+- case 2:
+- return 128 * q;
+-
+- default:
+- return (q < 94) ? 256 * q : std::min(0xFFFF, 512 * (q - 47));
+- }
+-}
+-
+-static void subBlockFunction(uint16_t *d, uint32_t w)
+-{
+- int q[4];
+-
+- int qmode = (w & 3);
+- if (qmode < 3)
+- {
+- int field0 = (w >> 2) & 511;
+- int field1 = (w >> 11) & 127;
+- int field2 = (w >> 18) & 127;
+- int field3 = (w >> 25) & 127;
+- if (qmode == 2 && field0 >= 384)
+- {
+- q[1] = field0;
+- q[2] = field1 + 384;
+- }
+- else
+- {
+- q[1] = (field1 >= 64) ? field0 : field0 + 64 - field1;
+- q[2] = (field1 >= 64) ? field0 + field1 - 64 : field0;
+- }
+- int p1 = std::max(0, q[1] - 64);
+- if (qmode == 2)
+- p1 = std::min(384, p1);
+- int p2 = std::max(0, q[2] - 64);
+- if (qmode == 2)
+- p2 = std::min(384, p2);
+- q[0] = p1 + field2;
+- q[3] = p2 + field3;
+- }
+- else
+- {
+- int pack0 = (w >> 2) & 32767;
+- int pack1 = (w >> 17) & 32767;
+- q[0] = (pack0 & 15) + 16 * ((pack0 >> 8) / 11);
+- q[1] = (pack0 >> 4) % 176;
+- q[2] = (pack1 & 15) + 16 * ((pack1 >> 8) / 11);
+- q[3] = (pack1 >> 4) % 176;
+- }
+-
+- d[0] = dequantize(q[0], qmode);
+- d[2] = dequantize(q[1], qmode);
+- d[4] = dequantize(q[2], qmode);
+- d[6] = dequantize(q[3], qmode);
+-}
+-
+-static void uncompress(uint8_t const *src, StreamInfo const &info, uint16_t *dest)
+-{
+- // In all cases, the *decompressed* image must be a multiple of 8 columns wide.
+- unsigned int buf_stride_pixels = (info.width + 7) & ~7;
+- for (unsigned int y = 0; y < info.height; ++y)
+- {
+- uint16_t *dp = dest + y * buf_stride_pixels;
+- uint8_t const *sp = src + y * info.stride;
+-
+- for (unsigned int x = 0; x < info.width; x+=8)
+- {
+- if (COMPRESS_MODE & 1)
+- {
+- uint32_t w0 = 0, w1 = 0;
+- for (int b = 0; b < 4; ++b)
+- w0 |= (*sp++) << (b * 8);
+- for (int b = 0; b < 4; ++b)
+- w1 |= (*sp++) << (b * 8);
+- subBlockFunction(dp, w0);
+- subBlockFunction(dp + 1, w1);
+- for (int i = 0; i < 8; ++i, ++dp)
+- *dp = postprocess(*dp);
+- }
+- else
+- {
+- for (int i = 0; i < 8; ++i)
+- *dp++ = postprocess((*sp++) << 8);
+- }
+- }
+- }
+-}
+-
+ struct Matrix
+ {
+ Matrix(float m0, float m1, float m2,
+@@ -307,16 +177,8 @@ void dng_save(std::vector<libcamera::Span<uint8_t>> const &mem, StreamInfo const
+ BayerFormat const &bayer_format = it->second;
+ LOG(1, "Bayer format is " << bayer_format.name);
+
+- // Decompression will require a buffer that's 8 pixels aligned.
+- unsigned int buf_stride_pixels = info.width;
+- unsigned int buf_stride_pixels_padded = (buf_stride_pixels + 7) & ~7;
+- std::vector<uint16_t> buf(buf_stride_pixels_padded * info.height);
+- if (bayer_format.compressed)
+- {
+- uncompress(mem[0].data(), info, &buf[0]);
+- buf_stride_pixels = buf_stride_pixels_padded;
+- }
+- else if (bayer_format.packed)
++ std::vector<uint16_t> buf(info.width * info.height);
++ if (bayer_format.packed)
+ {
+ switch (bayer_format.bits)
+ {
+@@ -444,9 +306,8 @@ void dng_save(std::vector<libcamera::Span<uint8_t>> const &mem, StreamInfo const
+ {
+ for (unsigned int x = 0; x < (info.width >> 4); x++)
+ {
+- unsigned int off = (y * buf_stride_pixels + x) << 4;
+- uint32_t grey =
+- buf[off] + buf[off + 1] + buf[off + buf_stride_pixels] + buf[off + buf_stride_pixels + 1];
++ unsigned int off = (y * info.width + x) << 4;
++ uint32_t grey = buf[off] + buf[off + 1] + buf[off + info.width] + buf[off + info.width + 1];
+ grey = (grey << 14) >> bayer_format.bits;
+ grey = sqrt((double)grey); // simple "gamma correction"
+ thumb_buf[3 * x] = thumb_buf[3 * x + 1] = thumb_buf[3 * x + 2] = grey;
+@@ -478,7 +339,7 @@ void dng_save(std::vector<libcamera::Span<uint8_t>> const &mem, StreamInfo const
+
+ for (unsigned int y = 0; y < info.height; y++)
+ {
+- if (TIFFWriteScanline(tif, &buf[buf_stride_pixels * y], y, 0) != 1)
++ if (TIFFWriteScanline(tif, &buf[info.width * y], y, 0) != 1)
+ throw std::runtime_error("error writing DNG image data");
+ }
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps_git.bb b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps_git.bb
new file mode 100644
index 0000000..dc07145
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps_git.bb
@@ -0,0 +1,44 @@
+SUMMARY = "A suite of libcamera-based apps"
+DESCRIPTION = "This is a small suite of libcamera-based apps that aim to \
+copy the functionality of the existing \"raspicam\" apps."
+HOMEPAGE = "https://github.com/raspberrypi/libcamera-apps"
+SECTION = "console/utils"
+
+LICENSE = "BSD-2-Clause"
+LIC_FILES_CHKSUM = "file://license.txt;md5=a0013d1b383d72ba4bdc5b750e7d1d77"
+
+SRC_URI = "\
+ git://github.com/raspberrypi/libcamera-apps.git;protocol=https;branch=main \
+ file://0001-utils-version.py-use-usr-bin-env-in-shebang.patch \
+ file://0002-Revert-Support-compressed-pixel-formats-when-saving-.patch \
+"
+PV = "1.4.2+git${SRCPV}"
+SRCREV = "9ae39f85ae6bee9761c36b9b5b80d675bc1fa369"
+
+S = "${WORKDIR}/git"
+
+DEPENDS = "libcamera libexif jpeg tiff libpng boost"
+
+PACKAGECONFIG ??= "drm"
+PACKAGECONFIG[libav] = "-Denable_libav=true, -Denable_libav=false, libav"
+PACKAGECONFIG[drm] = "-Denable_drm=true, -Denable_drm=false, libdrm"
+PACKAGECONFIG[egl] = "-Denable_egl=true, -Denable_egl=false, virtual/egl"
+PACKAGECONFIG[qt] = "-Denable_qt=true, -Denable_qt=false, qtbase"
+PACKAGECONFIG[opencv] = "-Denable_opencv=true, -Denable_opencv=false, opencv"
+PACKAGECONFIG[tflite] = "-Denable_tflite=true, -Denable_tflite=false, tensorflow-lite"
+
+inherit meson pkgconfig
+
+NEON_FLAGS = ""
+NEON_FLAGS:aarch64 = "-Dneon_flags=arm64"
+NEON_FLAGS:arm:raspberrypi3 = "-Dneon_flags=armv8-neon"
+NEON_FLAGS:arm:raspberrypi4 = "-Dneon_flags=armv8-neon"
+EXTRA_OEMESON += "${NEON_FLAGS}"
+
+# QA Issue: /usr/bin/camera-bug-report contained in package libcamera-apps requires /usr/bin/python3
+do_install:append() {
+ rm -v ${D}/${bindir}/camera-bug-report
+}
+
+# rpicam_app.so is not picked up automatically, because it is missing the common 'lib' prefix
+FILES:${PN}-dev += "${libdir}/rpicam_app.so"
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera.bbappend b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera.bbappend
deleted file mode 100644
index 2721cfe..0000000
--- a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera.bbappend
+++ /dev/null
@@ -1,2 +0,0 @@
-PACKAGECONFIG[raspberrypi] = "-Dpipelines=raspberrypi"
-PACKAGECONFIG:append:rpi = " raspberrypi"
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera_%.bbappend b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera_%.bbappend
new file mode 100644
index 0000000..541c49c
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera_%.bbappend
@@ -0,0 +1,2 @@
+PACKAGECONFIG[raspberrypi] = "-Dpipelines=rpi/vc4 -Dipas=rpi/vc4 -Dcpp_args=-Wno-unaligned-access"
+PACKAGECONFIG:append:rpi = " raspberrypi"
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0001-configure-fix-linking-on-RISC-V-ISA.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0001-configure-fix-linking-on-RISC-V-ISA.patch
new file mode 100644
index 0000000..3be8f1e
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0001-configure-fix-linking-on-RISC-V-ISA.patch
@@ -0,0 +1,25 @@
+From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= <remi@remlab.net>
+Date: Sat, 16 Jun 2018 21:31:45 +0300
+Subject: configure: fix linking on RISC-V ISA
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+---
+ configure.ac | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/configure.ac b/configure.ac
+index 2037a9e..df26367 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -113,6 +113,7 @@ case "${host_os}" in
+ ;;
+ linux*)
+ SYS=linux
++ test "${host_cpu}" = "riscv64" && CFLAGS="${CFLAGS} -pthread"
+ ;;
+ bsdi*)
+ SYS=bsdi
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0002-Revert-configure-Require-libmodplug-0.8.9.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0002-Revert-configure-Require-libmodplug-0.8.9.patch
new file mode 100644
index 0000000..61807b3
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0002-Revert-configure-Require-libmodplug-0.8.9.patch
@@ -0,0 +1,27 @@
+From: Sebastian Ramacher <sramacher@debian.org>
+Date: Mon, 19 Aug 2019 21:08:26 +0200
+Subject: Revert "configure: Require libmodplug >= 0.8.9"
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+This reverts commit 48f014768dc22ecad23d0e9f53c38805a3aff832.
+---
+ configure.ac | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/configure.ac b/configure.ac
+index df26367..b8580ec 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -2207,7 +2207,7 @@ AC_ARG_ENABLE(mod,
+ [AS_HELP_STRING([--disable-mod],
+ [do not use libmodplug (default auto)])])
+ if test "${enable_mod}" != "no" ; then
+- PKG_CHECK_MODULES(LIBMODPLUG, [libmodplug >= 0.8.9.0], [
++ PKG_CHECK_MODULES(LIBMODPLUG, [libmodplug >= 0.8.4 libmodplug != 0.8.8], [
+ VLC_ADD_PLUGIN([mod])
+ VLC_ADD_CXXFLAGS([mod],[$LIBMODPLUG_CFLAGS])
+ VLC_ADD_CFLAGS([mod],[$LIBMODPLUG_CFLAGS]) #modules/demux/mod.c needs CFLAGS_mod, not CXXFLAGS_mod
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0003-CVE-2022-41325.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0003-CVE-2022-41325.patch
new file mode 100644
index 0000000..41f7109
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0003-CVE-2022-41325.patch
@@ -0,0 +1,83 @@
+From 4fcace61801f418786c42487c6b06b693ee87666 Mon Sep 17 00:00:00 2001
+From: Romain Vimont <rom1v@videolabs.io>
+Date: Mon, 19 Sep 2022 17:17:01 +0200
+Subject: [PATCH] vnc: fix possible buffer overflow
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+Thanks to 0xMitsurugi [1] from Synacktiv [2] for the bug report and fix.
+
+[1] https://twitter.com/0xMitsurugi
+[2] https://www.synacktiv.com/
+
+Fixes #27335
+
+(cherry picked from commit 5eb783fd44ed6298db3e38f7765f21c42e4405f9)
+---
+ modules/access/vnc.c | 23 ++++++++++++++++-------
+ 1 file changed, 16 insertions(+), 7 deletions(-)
+
+--- a/modules/access/vnc.c
++++ b/modules/access/vnc.c
+@@ -33,6 +33,7 @@
+ #ifdef HAVE_CONFIG_H
+ # include "config.h"
+ #endif
++#include <assert.h>
+
+ #include <vlc_common.h>
+ #include <vlc_plugin.h>
+@@ -115,7 +116,7 @@
+ int i_cancel_state;
+
+ rfbClient* p_client;
+- int i_framebuffersize;
++ size_t i_framebuffersize;
+ block_t *p_block;
+
+ float f_fps;
+@@ -143,11 +144,16 @@
+ p_sys->es = NULL;
+ }
+
+- int i_width = p_client->width;
+- int i_height = p_client->height;
+- int i_depth = p_client->format.bitsPerPixel;
++ assert(!(p_client->width & ~0xffff)); // fits in 16 bits
++ uint16_t i_width = p_client->width;
+
+- switch( i_depth )
++ assert(!(p_client->height & ~0xffff)); // fits in 16 bits
++ uint16_t i_height = p_client->height;
++
++ uint8_t i_bits_per_pixel = p_client->format.bitsPerPixel;
++ assert((i_bits_per_pixel & 0x7) == 0); // multiple of 8
++
++ switch( i_bits_per_pixel )
+ {
+ case 8:
+ i_chroma = VLC_CODEC_RGB8;
+@@ -180,7 +186,10 @@
+ }
+
+ /* Set up framebuffer */
+- p_sys->i_framebuffersize = i_width * i_height * i_depth / 8;
++ if (mul_overflow(i_width, i_height * (i_bits_per_pixel / 8), &p_sys->i_framebuffersize)) {
++ msg_Err(p_demux, "VNC framebuffersize overflow");
++ return FALSE;
++ }
+
+ /* Reuse unsent block */
+ if ( p_sys->p_block )
+@@ -211,7 +220,7 @@
+ fmt.video.i_frame_rate_base = 1000;
+ fmt.video.i_frame_rate = 1000 * p_sys->f_fps;
+
+- fmt.video.i_bits_per_pixel = i_depth;
++ fmt.video.i_bits_per_pixel = i_bits_per_pixel;
+ fmt.video.i_rmask = p_client->format.redMax << p_client->format.redShift;
+ fmt.video.i_gmask = p_client->format.greenMax << p_client->format.greenShift;
+ fmt.video.i_bmask = p_client->format.blueMax << p_client->format.blueShift;
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0004-mmal_20.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0004-mmal_20.patch
new file mode 100644
index 0000000..ab31730
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0004-mmal_20.patch
@@ -0,0 +1,13826 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+--- a/configure.ac
++++ b/configure.ac
+@@ -3478,6 +3478,9 @@ dnl
+ AC_ARG_ENABLE(mmal,
+ AS_HELP_STRING([--enable-mmal],
+ [Multi-Media Abstraction Layer (MMAL) hardware plugin (default enable)]))
++AC_ARG_ENABLE(mmal_avcodec,
++ AS_HELP_STRING([--enable-mmal-avcodec],
++ [Use MMAL enabled avcodec libs (default disable)]))
+ if test "${enable_mmal}" != "no"; then
+ VLC_SAVE_FLAGS
+ LDFLAGS="${LDFLAGS} -L/opt/vc/lib -lvchostif"
+@@ -3488,7 +3491,7 @@ if test "${enable_mmal}" != "no"; then
+ VLC_ADD_PLUGIN([mmal])
+ VLC_ADD_LDFLAGS([mmal],[ -L/opt/vc/lib ])
+ VLC_ADD_CFLAGS([mmal],[ -isystem /opt/vc/include -isystem /opt/vc/include/interface/vcos/pthreads -isystem /opt/vc/include/interface/vmcs_host/linux ])
+- VLC_ADD_LIBS([mmal],[ -lbcm_host -lmmal -lmmal_core -lmmal_components -lmmal_util -lvchostif ]) ], [
++ VLC_ADD_LIBS([mmal],[ -lbcm_host -lmmal -lmmal_core -lmmal_components -lmmal_util -lvchostif -lvchiq_arm -lvcsm ]) ], [
+ AS_IF([test "${enable_mmal}" = "yes"],
+ [ AC_MSG_ERROR([Cannot find bcm library...]) ],
+ [ AC_MSG_WARN([Cannot find bcm library...]) ])
+@@ -3500,6 +3503,7 @@ if test "${enable_mmal}" != "no"; then
+ VLC_RESTORE_FLAGS
+ fi
+ AM_CONDITIONAL([HAVE_MMAL], [test "${have_mmal}" = "yes"])
++AM_CONDITIONAL([HAVE_MMAL_AVCODEC], [test "${enable_mmal_avcodec}" = "yes"])
+
+ dnl
+ dnl evas plugin
+--- a/include/vlc_fourcc.h
++++ b/include/vlc_fourcc.h
+@@ -365,6 +365,11 @@
+
+ /* Broadcom MMAL opaque buffer type */
+ #define VLC_CODEC_MMAL_OPAQUE VLC_FOURCC('M','M','A','L')
++#define VLC_CODEC_MMAL_ZC_SAND8 VLC_FOURCC('Z','S','D','8')
++#define VLC_CODEC_MMAL_ZC_SAND10 VLC_FOURCC('Z','S','D','0')
++#define VLC_CODEC_MMAL_ZC_SAND30 VLC_FOURCC('Z','S','D','3')
++#define VLC_CODEC_MMAL_ZC_I420 VLC_FOURCC('Z','4','2','0')
++#define VLC_CODEC_MMAL_ZC_RGB32 VLC_FOURCC('Z','R','G','B')
+
+ /* DXVA2 opaque video surface for use with D3D9 */
+ #define VLC_CODEC_D3D9_OPAQUE VLC_FOURCC('D','X','A','9') /* 4:2:0 8 bpc */
+--- a/modules/hw/mmal/Makefile.am
++++ b/modules/hw/mmal/Makefile.am
+@@ -1,23 +1,57 @@
+ include $(top_srcdir)/modules/common.am
+ mmaldir = $(pluginsdir)/mmal
+
+-AM_CFLAGS += $(CFLAGS_mmal)
+-AM_LDFLAGS += -rpath '$(mmaldir)' $(LDFLAGS_mmal)
++AM_CFLAGS += -pthread $(CFLAGS_mmal)
++AM_LDFLAGS += -pthread -rpath '$(mmaldir)' $(LDFLAGS_mmal)
+
+-libmmal_vout_plugin_la_SOURCES = vout.c mmal_picture.c mmal_picture.h
++libmmal_vout_plugin_la_SOURCES = vout.c mmal_cma.c mmal_picture.c subpic.c\
++ mmal_cma.h mmal_picture.h subpic.h transform_ops.h\
++ mmal_piccpy_neon.S
+ libmmal_vout_plugin_la_CFLAGS = $(AM_CFLAGS)
+-libmmal_vout_plugin_la_LDFLAGS = $(AM_LDFLAGS) -lm
++libmmal_vout_plugin_la_LDFLAGS = $(AM_LDFLAGS) -lm -lX11 -lXrandr
+ libmmal_vout_plugin_la_LIBADD = $(LIBS_mmal)
+ mmal_LTLIBRARIES = libmmal_vout_plugin.la
+
+-libmmal_codec_plugin_la_SOURCES = codec.c
++libmmal_codec_plugin_la_SOURCES = codec.c mmal_cma.c mmal_picture.c subpic.c\
++ mmal_cma.h mmal_picture.h subpic.h transform_ops.h\
++ blend_rgba_neon.S mmal_piccpy_neon.S
+ libmmal_codec_plugin_la_CFLAGS = $(AM_CFLAGS)
+ libmmal_codec_plugin_la_LDFLAGS = $(AM_LDFLAGS)
+ libmmal_codec_plugin_la_LIBADD = $(LIBS_mmal)
+ mmal_LTLIBRARIES += libmmal_codec_plugin.la
+
+-libmmal_deinterlace_plugin_la_SOURCES = deinterlace.c mmal_picture.c
++libmmal_deinterlace_plugin_la_SOURCES = deinterlace.c mmal_picture.c mmal_cma.c\
++ mmal_cma.h mmal_picture.h transform_ops.h\
++ mmal_piccpy_neon.S
+ libmmal_deinterlace_plugin_la_CFLAGS = $(AM_CFLAGS)
+ libmmal_deinterlace_plugin_la_LDFLAGS = $(AM_LDFLAGS)
+ libmmal_deinterlace_plugin_la_LIBADD = $(LIBS_mmal)
+ mmal_LTLIBRARIES += libmmal_deinterlace_plugin.la
++
++libmmal_xsplitter_plugin_la_SOURCES = xsplitter.c mmal_picture.c mmal_cma.c\
++ mmal_cma.h mmal_picture.h transform_ops.h\
++ mmal_piccpy_neon.S
++libmmal_xsplitter_plugin_la_CFLAGS = $(AM_CFLAGS)
++libmmal_xsplitter_plugin_la_LDFLAGS = $(AM_LDFLAGS)
++libmmal_xsplitter_plugin_la_LIBADD = $(LIBS_mmal)
++mmal_LTLIBRARIES += libmmal_xsplitter_plugin.la
++
++libmmal_converter_plugin_la_SOURCES = converter_mmal.c mmal_cma.c mmal_picture.c\
++ mmal_cma.h mmal_picture.h transform_ops.h\
++ mmal_piccpy_neon.S
++libmmal_converter_plugin_la_CFLAGS = $(AM_CFLAGS)
++libmmal_converter_plugin_la_LDFLAGS = $(AM_LDFLAGS)
++libmmal_converter_plugin_la_LIBADD = $(LIBS_mmal)
++mmal_LTLIBRARIES += libmmal_converter_plugin.la
++
++if HAVE_MMAL_AVCODEC
++libmmal_avcodec_plugin_la_SOURCES = mmal_avcodec.c mmal_cma.c mmal_picture.c\
++ mmal_cma.h mmal_picture.h transform_ops.h\
++ mmal_piccpy_neon.S
++libmmal_avcodec_plugin_la_CFLAGS = $(AM_CFLAGS)
++libmmal_avcodec_plugin_la_LDFLAGS = $(AM_LDFLAGS)
++libmmal_avcodec_plugin_la_LIBADD = $(AVFORMAT_LIBS) $(AVUTIL_LIBS) $(LIBS_mmal)
++mmal_LTLIBRARIES += libmmal_avcodec_plugin.la
++endif
++
++
+--- /dev/null
++++ b/modules/hw/mmal/blend_rgba_neon.S
+@@ -0,0 +1,197 @@
++ .syntax unified
++ .arm
++// .thumb
++ .text
++ .align 16
++ .arch armv7-a
++ .fpu neon-vfpv4
++
++@ blend_rgbx_rgba_neon
++
++@ Implements /255 as ((x * 257) + 0x8000) >> 16
++@ This generates something in the range [(x+126)/255, (x+127)/255] which is good enough
++
++@ There is advantage to aligning src and/or dest - dest gives a bit more due to being used twice
++
++
++
++@ [r0] RGBx dest loaded into d20-d23
++@ [r1] RGBA src merge loaded into d16-d19
++@ r2 plane alpha
++@ r3 count (pixels)
++
++.macro blend_main sR, sG, sB, sA, dR, dG, dB, dA
++
++ push { r4, lr }
++
++ vdup.u8 d7, r2
++
++ subs r3, #8
++ vmov.u8 d6, #0xff
++
++ blt 2f
++
++ @ If <= 16 pixels to process then don't bother trying to align
++ @ (a) This means the align doesn't need to worry about r3 underflow
++ @ (b) The overhead would be greater than any gain
++ cmp r3, #8
++ mov r4, r3
++ ble 1f
++
++ @ Align r0 (dest) on a 32 byte boundary
++ neg r3, r0
++ ubfx r3, r3, #2, #3
++
++ cmp r3, #0
++ blne 10f
++
++ sub r3, r4, r3
++
++1:
++ vld4.8 {d16, d17, d18, d19}, [r1]
++
++1:
++ vmull.u8 q15, \sA, d7
++
++ vld4.8 {d20, d21, d22, d23}, [r0]
++
++ vsra.u16 q15, q15, #8
++ subs r3, #8
++ vrshrn.u16 d31, q15, #8
++ vsub.u8 d30, d6, d31
++
++ vmull.u8 q12, \sR, d31
++ vmull.u8 q13, \sG, d31
++ vmull.u8 q14, \sB, d31
++ addge r1, #32
++
++ vmlal.u8 q12, \dR, d30
++ vmlal.u8 q13, \dG, d30
++ vmlal.u8 q14, \dB, d30
++ vld4.8 {d16, d17, d18, d19}, [r1]
++
++ vsra.u16 q12, q12, #8 @ * 257/256
++ vsra.u16 q13, q13, #8
++ vsra.u16 q14, q14, #8
++
++ vrshrn.u16 \dR, q12, #8
++ vrshrn.u16 \dG, q13, #8
++ vrshrn.u16 \dB, q14, #8
++ vmov.u8 \dA, #0xff
++
++ vst4.8 {d20, d21, d22, d23}, [r0]!
++ bge 1b
++ add r1, #32
++
++2:
++ cmp r3, #-8
++ blgt 10f
++
++ pop { r4, pc }
++
++
++// Partial version
++// Align @ start & deal with tail
++10:
++ lsls r2, r3, #30 @ b2 -> C, b1 -> N
++ mov r2, r0
++ bcc 1f
++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r1]!
++ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r2]!
++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1]!
++ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r2]!
++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r1]!
++ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r2]!
++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r1]!
++ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r2]!
++1:
++ bpl 1f
++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r1]!
++ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r2]!
++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r1]!
++ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r2]!
++1:
++ tst r3, #1
++ beq 1f
++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r1]!
++ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r2]!
++1:
++ @ Set conditions for later
++ lsls r2, r3, #30 @ b2 -> C, b1 -> N
++
++ vmull.u8 q15, \sA, d7
++ vsra.u16 q15, q15, #8
++ vrshrn.u16 d31, q15, #8
++ vsub.u8 d30, d6, d31
++
++ vmull.u8 q12, \sR, d31
++ vmull.u8 q13, \sG, d31
++ vmull.u8 q14, \sB, d31
++
++ vmlal.u8 q12, \dR, d30
++ vmlal.u8 q13, \dG, d30
++ vmlal.u8 q14, \dB, d30
++
++ vsra.u16 q12, q12, #8
++ vsra.u16 q13, q13, #8
++ vsra.u16 q14, q14, #8
++
++ vrshrn.u16 \dR, q12, #8
++ vrshrn.u16 \dG, q13, #8
++ vrshrn.u16 \dB, q14, #8
++ vmov.u8 \dA, #0xff
++
++ bcc 1f
++ vst4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0]!
++ vst4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0]!
++ vst4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0]!
++ vst4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0]!
++1:
++ bpl 1f
++ vst4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0]!
++ vst4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0]!
++1:
++ tst r3, #1
++ bxeq lr
++ vst4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0]!
++
++ bx lr
++
++.endm
++
++
++@ [r0] RGBx dest (Byte order: R, G, B, x)
++@ [r1] RGBA src merge (Byte order: R, G, B, A)
++@ r2 plane alpha
++@ r3 count (pixels)
++
++@ Whilst specified as RGBx+RGBA the only important part is the position of
++@ alpha, the other components are all treated the same
++
++@ [r0] RGBx dest (Byte order: R, G, B, x)
++@ [r1] RGBA src merge (Byte order: R, G, B, A) - same as above
++@ r2 plane alpha
++@ r3 count (pixels)
++ .align 16
++ .global blend_rgbx_rgba_neon
++#ifdef __ELF__
++ .type blend_rgbx_rgba_neon, %function
++#endif
++blend_rgbx_rgba_neon:
++ blend_main d16, d17, d18, d19, d20, d21, d22, d23
++
++
++@ [r0] RGBx dest (Byte order: R, G, B, x)
++@ [r1] RGBA src merge (Byte order: B, G, R, A) - B / R swapped
++@ r2 plane alpha
++@ r3 count (pixels)
++ .align 16
++ .global blend_bgrx_rgba_neon
++#ifdef __ELF__
++ .type blend_bgrx_rgba_neon, %function
++#endif
++blend_bgrx_rgba_neon:
++ blend_main d18, d17, d16, d19, d20, d21, d22, d23
++
++
++
+--- /dev/null
++++ b/modules/hw/mmal/blend_rgba_neon.h
+@@ -0,0 +1,17 @@
++#ifndef HW_MMAL_BLEND_RGBA_NEON_H
++#define HW_MMAL_BLEND_RGBA_NEON_H
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++typedef void blend_neon_fn(void * dest, const void * src, int alpha, unsigned int n);
++extern blend_neon_fn blend_rgbx_rgba_neon;
++extern blend_neon_fn blend_bgrx_rgba_neon;
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif
++
+--- /dev/null
++++ b/modules/hw/mmal/blend_test.c
+@@ -0,0 +1,180 @@
++#include <stdio.h>
++#include <stdint.h>
++#include <memory.h>
++
++#include "blend_rgba_neon.h"
++
++#define RPI_PROFILE 1
++#define RPI_PROC_ALLOC 1
++#include "rpi_prof.h"
++
++static inline unsigned div255(unsigned v)
++{
++ // This models what we do in the asm for / 255
++ // It generates something in the range [(v+126)/255, (v+127)/255] which is good enough
++ return ((v * 257) + 0x8000) >> 16;
++}
++
++static inline unsigned int a_merge(unsigned int dst, unsigned src, unsigned f)
++{
++ return div255((255 - f) * (dst) + src * f);
++}
++
++
++static void merge_line(void * dest, const void * src, int alpha, unsigned int n)
++{
++ unsigned int i;
++ const uint8_t * s_data = src;
++ uint8_t * d_data = dest;
++
++ for (i = 0; i != n; ++i) {
++ const uint32_t s_pel = ((const uint32_t *)s_data)[i];
++ const uint32_t d_pel = ((const uint32_t *)d_data)[i];
++ const unsigned int a = div255(alpha * (s_pel >> 24));
++ ((uint32_t *)d_data)[i] = 0xff000000 |
++ (a_merge((d_pel >> 16) & 0xff, (s_pel >> 16) & 0xff, a) << 16) |
++ (a_merge((d_pel >> 8) & 0xff, (s_pel >> 8) & 0xff, a) << 8 ) |
++ (a_merge((d_pel >> 0) & 0xff, (s_pel >> 0) & 0xff, a) << 0 );
++ }
++}
++
++
++// Merge RGBA with BGRA
++static void merge_line2(void * dest, const void * src, int alpha, unsigned int n)
++{
++ unsigned int i;
++ const uint8_t * s_data = src;
++ uint8_t * d_data = dest;
++
++ for (i = 0; i != n; ++i) {
++ const uint32_t s_pel = ((const uint32_t *)s_data)[i];
++ const uint32_t d_pel = ((const uint32_t *)d_data)[i];
++ const unsigned int a = div255(alpha * (s_pel >> 24));
++ ((uint32_t *)d_data)[i] = 0xff000000 |
++ (a_merge((d_pel >> 0) & 0xff, (s_pel >> 16) & 0xff, a) << 0 ) |
++ (a_merge((d_pel >> 8) & 0xff, (s_pel >> 8) & 0xff, a) << 8 ) |
++ (a_merge((d_pel >> 16) & 0xff, (s_pel >> 0) & 0xff, a) << 16);
++ }
++}
++
++#define BUF_SIZE 256
++#define BUF_SLACK 16
++#define BUF_ALIGN 64
++#define BUF_ALLOC (BUF_SIZE + 2*BUF_SLACK + BUF_ALIGN)
++
++static void test_line(const uint32_t * const dx, const unsigned int d_off,
++ const uint32_t * const sx, const unsigned int s_off,
++ const unsigned int alpha, const unsigned int len, const int prof_no)
++{
++ uint32_t d0_buf[BUF_ALLOC];
++ uint32_t d1_buf[BUF_ALLOC];
++ const uint32_t * const s0 = sx + s_off;
++
++ uint32_t * const d0 = (uint32_t *)(((uintptr_t)d0_buf + (BUF_ALIGN - 1)) & ~(BUF_ALIGN - 1)) + d_off;
++ uint32_t * const d1 = (uint32_t *)(((uintptr_t)d1_buf + (BUF_ALIGN - 1)) & ~(BUF_ALIGN - 1)) + d_off;
++ unsigned int i;
++
++ memcpy(d0, dx, (BUF_SIZE + BUF_SLACK*2)*4);
++ memcpy(d1, dx, (BUF_SIZE + BUF_SLACK*2)*4);
++
++ merge_line(d0 + BUF_SLACK, s0 + BUF_SLACK, alpha, len);
++
++ PROFILE_START();
++ blend_rgbx_rgba_neon(d1 + BUF_SLACK, s0 + BUF_SLACK, alpha, len);
++ PROFILE_ACC_N(prof_no);
++
++ for (i = 0; i != BUF_SIZE + BUF_SLACK*2; ++i) {
++ if (d0[i] != d1[i]) {
++ printf("%3d: %08x + %08x * %02x: %08x / %08x: len=%d\n", (int)(i - BUF_SLACK), dx[i], s0[i], alpha, d0[i], d1[i], len);
++ }
++ }
++}
++
++static void test_line2(const uint32_t * const dx, const unsigned int d_off,
++ const uint32_t * const sx, const unsigned int s_off,
++ const unsigned int alpha, const unsigned int len, const int prof_no)
++{ // Run merge_line2() and blend_bgrx_rgba_neon() on identical copies of dx and print every word that differs
++ uint32_t d0_buf[BUF_ALLOC];
++ uint32_t d1_buf[BUF_ALLOC];
++ const uint32_t * const s0 = sx + s_off;
++ // Round each scratch buffer up to a BUF_ALIGN-byte boundary, then skew it by d_off words
++ uint32_t * const d0 = (uint32_t *)(((uintptr_t)d0_buf + (BUF_ALIGN - 1)) & ~(BUF_ALIGN - 1)) + d_off;
++ uint32_t * const d1 = (uint32_t *)(((uintptr_t)d1_buf + (BUF_ALIGN - 1)) & ~(BUF_ALIGN - 1)) + d_off;
++ unsigned int i;
++ // Seed both destinations with the same BUF_SIZE + 2*BUF_SLACK words taken from dx
++ memcpy(d0, dx, (BUF_SIZE + BUF_SLACK*2)*4);
++ memcpy(d1, dx, (BUF_SIZE + BUF_SLACK*2)*4);
++ // Result to compare against (merge_line2 starts outside this view - presumably the plain C reference)
++ merge_line2(d0 + BUF_SLACK, s0 + BUF_SLACK, alpha, len);
++ // Only this call sits between PROFILE_START and PROFILE_ACC_N(prof_no); macros defined elsewhere - TODO confirm semantics
++ PROFILE_START();
++ blend_bgrx_rgba_neon(d1 + BUF_SLACK, s0 + BUF_SLACK, alpha, len);
++ PROFILE_ACC_N(prof_no);
++ // Compare the whole copied range, slack words included; the printed index is relative to the first blended word
++ for (i = 0; i != BUF_SIZE + BUF_SLACK*2; ++i) {
++ if (d0[i] != d1[i]) {
++ printf("%3d: %08x + %08x * %02x: %08x / %08x: len=%d\n", (int)(i - BUF_SLACK), dx[i], s0[i], alpha, d0[i], d1[i], len);
++ }
++ }
++}
++
++
++
++int main(int argc, char *argv[])
++{ // Test driver: checks div255() rounding, then exercises test_line() and test_line2(); always returns 0
++ unsigned int i, j;
++ uint32_t d0_buf[BUF_ALLOC];
++ uint32_t s0_buf[BUF_ALLOC];
++ // Views of the two arrays rounded up to a 64-byte boundary
++ uint32_t * const d0 = (uint32_t *)(((uintptr_t)d0_buf + 63) & ~63) + 0;
++ uint32_t * const s0 = (uint32_t *)(((uintptr_t)s0_buf + 63) & ~63) + 0;
++
++ PROFILE_INIT();
++ // Report any i below 255*255 where div255(i) matches neither (i + 127)/255 nor (i + 126)/255
++ for (i = 0; i != 255*255; ++i) {
++ unsigned int a = div255(i);
++ unsigned int b = (i + 127)/255;
++ unsigned int c = (i + 126)/255;
++ if (a != b && a != c)
++ printf("%d/255: %d != %d/%d\n", i, a, b, c);
++ }
++ // Fill the raw arrays with a deterministic pattern; the top byte of each source word varies with i
++ for (i = 0; i != BUF_ALLOC; ++i) {
++ d0_buf[i] = 0xff00 | i;
++ s0_buf[i] = (i << 24) | 0x40ffc0;
++ }
++ // Sweep alpha 0..255 at length 256, then length 0..255 at alpha 128 (both pass prof_no -1)
++ for (i = 0; i != 256; ++i) {
++ test_line(d0, 0, s0, 0, i, 256, -1);
++ }
++ for (i = 0; i != 256; ++i) {
++ test_line(d0, 0, s0, 0, 128, i, -1);
++ }
++ // All 16 combinations of destination offset (j & 3) and source offset (j >> 2), using profile slot j
++ for (j = 0; j != 16; ++j) {
++ for (i = 0; i != 256; ++i) {
++ test_line(d0, j & 3, s0, j >> 2, i, 256, j);
++ }
++ PROFILE_PRINTF_N(j);
++ PROFILE_CLEAR_N(j);
++ }
++ printf("Done 1\n");
++ // Same three passes again for test_line2()
++ for (i = 0; i != 256; ++i) {
++ test_line2(d0, 0, s0, 0, i, 256, -1);
++ }
++ for (i = 0; i != 256; ++i) {
++ test_line2(d0, 0, s0, 0, 128, i, -1);
++ }
++ // NOTE(review): unlike the first set, slot j is printed here but not cleared afterwards
++ for (j = 0; j != 16; ++j) {
++ for (i = 0; i != 256; ++i) {
++ test_line2(d0, j & 3, s0, j >> 2, i, 256, j);
++ }
++ PROFILE_PRINTF_N(j);
++ }
++ printf("Done 2\n");
++
++ return 0;
++}
++
+--- a/modules/hw/mmal/codec.c
++++ b/modules/hw/mmal/codec.c
+@@ -26,267 +26,443 @@
+ #include "config.h"
+ #endif
+
++#include <stdatomic.h>
++
+ #include <vlc_common.h>
+-#include <vlc_atomic.h>
+ #include <vlc_plugin.h>
+ #include <vlc_codec.h>
++#include <vlc_filter.h>
+ #include <vlc_threads.h>
+
+-#include <bcm_host.h>
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/util/mmal_util.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+
++#include <interface/vcsm/user-vcsm.h>
++
++#include "mmal_cma.h"
+ #include "mmal_picture.h"
+
++#include "subpic.h"
++#include "blend_rgba_neon.h"
++
++#define TRACE_ALL 0
++
++#define OPT_TO_FROM_ZC 0
++
+ /*
+ * This seems to be a bit high, but reducing it causes instabilities
+ */
+ #define NUM_EXTRA_BUFFERS 5
++//#define NUM_EXTRA_BUFFERS 10
+ #define NUM_DECODER_BUFFER_HEADERS 30
+
+-#define MIN_NUM_BUFFERS_IN_TRANSIT 2
++#define CONVERTER_BUFFERS 4 // Buffers on the output of the converter
++
++#define MMAL_SLICE_HEIGHT 16
++#define MMAL_ALIGN_W 32
++#define MMAL_ALIGN_H 16
+
+ #define MMAL_OPAQUE_NAME "mmal-opaque"
+ #define MMAL_OPAQUE_TEXT N_("Decode frames directly into RPI VideoCore instead of host memory.")
+ #define MMAL_OPAQUE_LONGTEXT N_("Decode frames directly into RPI VideoCore instead of host memory. This option must only be used with the MMAL video output plugin.")
+
+-static int OpenDecoder(decoder_t *dec);
+-static void CloseDecoder(decoder_t *dec);
+-
+-vlc_module_begin()
+- set_shortname(N_("MMAL decoder"))
+- set_description(N_("MMAL-based decoder plugin for Raspberry Pi"))
+- set_capability("video decoder", 90)
+- add_shortcut("mmal_decoder")
+- add_bool(MMAL_OPAQUE_NAME, true, MMAL_OPAQUE_TEXT, MMAL_OPAQUE_LONGTEXT, false)
+- set_callbacks(OpenDecoder, CloseDecoder)
+-vlc_module_end()
++#define MMAL_RESIZE_NAME "mmal-resize"
++#define MMAL_RESIZE_TEXT N_("Use mmal resizer rather than hvs.")
++#define MMAL_RESIZE_LONGTEXT N_("Use mmal resizer rather than isp. This uses less gpu memory than the ISP but is slower.")
++
++#define MMAL_ISP_NAME "mmal-isp"
++#define MMAL_ISP_TEXT N_("Use mmal isp rather than hvs.")
++#define MMAL_ISP_LONGTEXT N_("Use mmal isp rather than hvs. This may be faster but has no blend.")
+
+-struct decoder_sys_t {
+- bool opaque;
++typedef struct decoder_sys_t
++{
+ MMAL_COMPONENT_T *component;
+ MMAL_PORT_T *input;
+ MMAL_POOL_T *input_pool;
+ MMAL_PORT_T *output;
+- MMAL_POOL_T *output_pool; /* only used for non-opaque mode */
++ hw_mmal_port_pool_ref_t *ppr;
+ MMAL_ES_FORMAT_T *output_format;
+- vlc_sem_t sem;
+
++ MMAL_STATUS_T err_stream;
+ bool b_top_field_first;
+ bool b_progressive;
+
++ bool b_flushed;
++
++ vcsm_init_type_t vcsm_init_type;
++
++ // Lock to avoid pic update & allocate happening simultaneously
++ // * We should be able to arrange life s.t. this isn't needed
++ // but while we are confused apply belt & braces
++ vlc_mutex_t pic_lock;
++
+ /* statistics */
+- int output_in_transit;
+- int input_in_transit;
+ atomic_bool started;
+-};
++} decoder_sys_t;
+
+-/* Utilities */
+-static int change_output_format(decoder_t *dec);
+-static int send_output_buffer(decoder_t *dec);
+-static void fill_output_port(decoder_t *dec);
+-
+-/* VLC decoder callback */
+-static int decode(decoder_t *dec, block_t *block);
+-static void flush_decoder(decoder_t *dec);
+-
+-/* MMAL callbacks */
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+
+-static int OpenDecoder(decoder_t *dec)
+-{
+- int ret = VLC_SUCCESS;
+- decoder_sys_t *sys;
+- MMAL_PARAMETER_UINT32_T extra_buffers;
+- MMAL_STATUS_T status;
++typedef struct supported_mmal_enc_s {
++ struct {
++ MMAL_PARAMETER_HEADER_T header;
++ MMAL_FOURCC_T encodings[64];
++ } supported;
++ int n;
++} supported_mmal_enc_t;
++
++#define SUPPORTED_MMAL_ENC_INIT \
++{ \
++ {{MMAL_PARAMETER_SUPPORTED_ENCODINGS, sizeof(((supported_mmal_enc_t *)0)->supported)}, {0}}, \
++ -1 \
++}
+
+- if (dec->fmt_in.i_codec != VLC_CODEC_MPGV &&
+- dec->fmt_in.i_codec != VLC_CODEC_H264)
+- return VLC_EGENERIC;
++static supported_mmal_enc_t supported_decode_in_enc = SUPPORTED_MMAL_ENC_INIT;
+
+- sys = calloc(1, sizeof(decoder_sys_t));
+- if (!sys) {
+- ret = VLC_ENOMEM;
+- goto out;
++static bool is_enc_supported(supported_mmal_enc_t * const support, const MMAL_FOURCC_T fcc)
++{
++ int i;
++
++ if (fcc == 0)
++ return false;
++ if (support->n == -1)
++ return true; // Unknown - say OK
++ for (i = 0; i < support->n; ++i) {
++ if (support->supported.encodings[i] == fcc)
++ return true;
+ }
+- dec->p_sys = sys;
++ return false;
++}
+
+- sys->opaque = var_InheritBool(dec, MMAL_OPAQUE_NAME);
+- bcm_host_init();
++static bool set_and_test_enc_supported(supported_mmal_enc_t * const support, MMAL_PORT_T * port, const MMAL_FOURCC_T fcc)
++{
++ if (support->n >= 0)
++ /* already done */;
++ else if (mmal_port_parameter_get(port, (MMAL_PARAMETER_HEADER_T *)&support->supported) != MMAL_SUCCESS)
++ support->n = 0;
++ else
++ support->n = (support->supported.header.size - sizeof(support->supported.header)) /
++ sizeof(support->supported.encodings[0]);
+
+- status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_DECODER, &sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
+- MMAL_COMPONENT_DEFAULT_VIDEO_DECODER, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++ return is_enc_supported(support, fcc);
++}
+
+- sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
+- status = mmal_port_enable(sys->component->control, control_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to enable control port %s (status=%"PRIx32" %s)",
+- sys->component->control->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++static MMAL_FOURCC_T vlc_to_mmal_es_fourcc(const unsigned int fcc)
++{
++ switch (fcc){
++ case VLC_CODEC_MJPG:
++ return MMAL_ENCODING_MJPEG;
++ case VLC_CODEC_MP1V:
++ return MMAL_ENCODING_MP1V;
++ case VLC_CODEC_MPGV:
++ case VLC_CODEC_MP2V:
++ return MMAL_ENCODING_MP2V;
++ case VLC_CODEC_H263:
++ return MMAL_ENCODING_H263;
++ case VLC_CODEC_MP4V:
++ return MMAL_ENCODING_MP4V;
++ case VLC_CODEC_H264:
++ return MMAL_ENCODING_H264;
++ case VLC_CODEC_VP6:
++ return MMAL_ENCODING_VP6;
++ case VLC_CODEC_VP8:
++ return MMAL_ENCODING_VP8;
++ case VLC_CODEC_WMV1:
++ return MMAL_ENCODING_WMV1;
++ case VLC_CODEC_WMV2:
++ return MMAL_ENCODING_WMV2;
++ case VLC_CODEC_WMV3:
++ return MMAL_ENCODING_WMV3;
++ case VLC_CODEC_VC1:
++ return MMAL_ENCODING_WVC1;
++ case VLC_CODEC_THEORA:
++ return MMAL_ENCODING_THEORA;
++ default:
++ break;
+ }
++ return 0;
++}
+
+- sys->input = sys->component->input[0];
+- sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
+- if (dec->fmt_in.i_codec == VLC_CODEC_MPGV)
+- sys->input->format->encoding = MMAL_ENCODING_MP2V;
+- else
+- sys->input->format->encoding = MMAL_ENCODING_H264;
++static MMAL_FOURCC_T pic_to_slice_mmal_fourcc(const MMAL_FOURCC_T fcc)
++{ // Map an MMAL encoding to the matching *_SLICE encoding constant
++ switch (fcc){
++ case MMAL_ENCODING_I420:
++ return MMAL_ENCODING_I420_SLICE;
++ case MMAL_ENCODING_I422:
++ return MMAL_ENCODING_I422_SLICE;
++ case MMAL_ENCODING_ARGB:
++ return MMAL_ENCODING_ARGB_SLICE;
++ case MMAL_ENCODING_RGBA:
++ return MMAL_ENCODING_RGBA_SLICE;
++ case MMAL_ENCODING_ABGR:
++ return MMAL_ENCODING_ABGR_SLICE;
++ case MMAL_ENCODING_BGRA:
++ return MMAL_ENCODING_BGRA_SLICE;
++ case MMAL_ENCODING_RGB16:
++ return MMAL_ENCODING_RGB16_SLICE;
++ case MMAL_ENCODING_RGB24:
++ return MMAL_ENCODING_RGB24_SLICE;
++ case MMAL_ENCODING_RGB32:
++ return MMAL_ENCODING_RGB32_SLICE;
++ case MMAL_ENCODING_BGR16:
++ return MMAL_ENCODING_BGR16_SLICE;
++ case MMAL_ENCODING_BGR24:
++ return MMAL_ENCODING_BGR24_SLICE;
++ case MMAL_ENCODING_BGR32:
++ return MMAL_ENCODING_BGR32_SLICE;
++ default:
++ break;
++ }
++ return 0; // Encoding not listed above: 0 is returned instead of a slice fourcc
++}
+
+- if (dec->fmt_in.i_codec == VLC_CODEC_H264) {
+- if (dec->fmt_in.i_extra > 0) {
+- status = mmal_format_extradata_alloc(sys->input->format,
+- dec->fmt_in.i_extra);
+- if (status == MMAL_SUCCESS) {
+- memcpy(sys->input->format->extradata, dec->fmt_in.p_extra,
+- dec->fmt_in.i_extra);
+- sys->input->format->extradata_size = dec->fmt_in.i_extra;
+- } else {
+- msg_Err(dec, "Failed to allocate extra format data on input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- }
++#define DEBUG_SQUARES 0
++#if DEBUG_SQUARES
++static void draw_square(void * pic_buf, size_t pic_stride, unsigned int x, unsigned int y, unsigned int w, unsigned int h, uint32_t val)
++{
++ uint32_t * p = (uint32_t *)pic_buf + y * pic_stride + x;
++ unsigned int i;
++ for (i = 0; i != h; ++i) {
++ unsigned int j;
++ for (j = 0; j != w; ++j) {
++ p[j] = val;
+ }
++ p += pic_stride;
+ }
++}
++#endif
+
+- status = mmal_port_format_commit(sys->input);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++#if 0
++static inline void draw_line(void * pic_buf, size_t pic_stride, unsigned int x, unsigned int y, unsigned int len, int inc)
++{
++ uint32_t * p = (uint32_t *)pic_buf + y * pic_stride + x;
++ while (len-- != 0) {
++ *p = ~0U;
++ p += inc;
+ }
+- sys->input->buffer_size = sys->input->buffer_size_recommended;
+- sys->input->buffer_num = sys->input->buffer_num_recommended;
++}
+
+- status = mmal_port_enable(sys->input, input_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to enable input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
+
+- sys->output = sys->component->output[0];
+- sys->output->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
++static void draw_corners(void * pic_buf, size_t pic_stride, unsigned int x, unsigned int y, unsigned int w, unsigned int h)
++{
++ const unsigned int len = 20;
++ draw_line(pic_buf, pic_stride, x, y, len, 1);
++ draw_line(pic_buf, pic_stride, x, y, len, pic_stride);
++ draw_line(pic_buf, pic_stride, x + w - 1, y, len, -1);
++ draw_line(pic_buf, pic_stride, x + w - 1, y, len, pic_stride);
++ draw_line(pic_buf, pic_stride, x + w - 1, y + h - 1, len, -1);
++ draw_line(pic_buf, pic_stride, x + w - 1, y + h - 1, len, -(int)pic_stride);
++ draw_line(pic_buf, pic_stride, x, y + h - 1, len, 1);
++ draw_line(pic_buf, pic_stride, x, y + h - 1, len, -(int)pic_stride);
++}
++#endif
+
+- if (sys->opaque) {
+- extra_buffers.hdr.id = MMAL_PARAMETER_EXTRA_BUFFERS;
+- extra_buffers.hdr.size = sizeof(MMAL_PARAMETER_UINT32_T);
+- extra_buffers.value = NUM_EXTRA_BUFFERS;
+- status = mmal_port_parameter_set(sys->output, &extra_buffers.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to set MMAL_PARAMETER_EXTRA_BUFFERS on output port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++static MMAL_RATIONAL_T
++rationalize_sar(unsigned int num, unsigned int den)
++{
++ static const unsigned int primes[] = {2, 3, 5, 7, 11, 13, 17, 19, 23, 0};
++ const unsigned int * p = primes;
+
+- msg_Dbg(dec, "Activate zero-copy for output port");
+- MMAL_PARAMETER_BOOLEAN_T zero_copy = {
+- { MMAL_PARAMETER_ZERO_COPY, sizeof(MMAL_PARAMETER_BOOLEAN_T) },
+- 1
+- };
++ // If either num or den is 0 then return a well formed "unknown"
++ if (num == 0 || den == 0) {
++ return (MMAL_RATIONAL_T){.num = 0, .den = 0};
++ }
+
+- status = mmal_port_parameter_set(sys->output, &zero_copy.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
+- sys->output->name, status, mmal_status_to_string(status));
+- goto out;
++ while (*p != 0 && num >= *p && den >= *p) {
++ if (num % *p != 0 || den % *p != 0)
++ ++p;
++ else {
++ num /= *p;
++ den /= *p;
+ }
+ }
++ return (MMAL_RATIONAL_T){.num = num, .den = den};
++}
+
+- status = mmal_port_enable(sys->output, output_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to enable output port %s (status=%"PRIx32" %s)",
+- sys->output->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++// Buffer either attached to pic or released
++static picture_t * alloc_opaque_pic(decoder_t * const dec, MMAL_BUFFER_HEADER_T * const buf)
++{
++ decoder_sys_t *const dec_sys = dec->p_sys;
+
+- status = mmal_component_enable(sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to enable component %s (status=%"PRIx32" %s)",
+- sys->component->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ vlc_mutex_lock(&dec_sys->pic_lock);
++ picture_t * const pic = decoder_NewPicture(dec);
++ vlc_mutex_unlock(&dec_sys->pic_lock);
++
++ if (pic == NULL)
++ goto fail1;
++
++ if (buf->length == 0) {
++ msg_Err(dec, "%s: Empty buffer", __func__);
++ goto fail2;
+ }
+
+- sys->input_pool = mmal_pool_create(sys->input->buffer_num, 0);
++ if ((pic->context = hw_mmal_gen_context(buf, dec_sys->ppr)) == NULL)
++ goto fail2;
+
+- if (sys->opaque) {
+- dec->fmt_out.i_codec = VLC_CODEC_MMAL_OPAQUE;
+- dec->fmt_out.video.i_chroma = VLC_CODEC_MMAL_OPAQUE;
+- } else {
+- dec->fmt_out.i_codec = VLC_CODEC_I420;
+- dec->fmt_out.video.i_chroma = VLC_CODEC_I420;
++ buf_to_pic_copy_props(pic, buf);
++
++#if TRACE_ALL
++ msg_Dbg(dec, "pic: prog=%d, tff=%d, date=%lld", pic->b_progressive, pic->b_top_field_first, (long long)pic->date);
++#endif
++
++ return pic;
++
++fail2:
++ picture_Release(pic);
++fail1:
++ // Recycle rather than release to avoid buffer starvation if NewPic fails
++ hw_mmal_port_pool_ref_recycle(dec_sys->ppr, buf);
++ return NULL;
++}
++
++static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{
++ decoder_t *dec = (decoder_t *)port->userdata;
++ MMAL_STATUS_T status;
++
++#if TRACE_ALL
++ msg_Dbg(dec, "<<< %s: cmd=%d, data=%p", __func__, buffer->cmd, buffer->data);
++#endif
++
++ if (buffer->cmd == MMAL_EVENT_ERROR) {
++ status = *(uint32_t *)buffer->data;
++ dec->p_sys->err_stream = status;
++ msg_Err(dec, "MMAL error %"PRIx32" \"%s\"", status,
++ mmal_status_to_string(status));
+ }
+
+- dec->pf_decode = decode;
+- dec->pf_flush = flush_decoder;
++ mmal_buffer_header_release(buffer);
++}
+
+- vlc_sem_init(&sys->sem, 0);
++static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{
++ block_t * const block = (block_t *)buffer->user_data;
+
+-out:
+- if (ret != VLC_SUCCESS)
+- CloseDecoder(dec);
++ (void)port; // Unused
+
+- return ret;
++#if TRACE_ALL
++ msg_Dbg((decoder_t *)port->userdata, "<<< %s: cmd=%d, data=%p, len=%d/%d, pts=%lld", __func__,
++ buffer->cmd, buffer->data, buffer->length, buffer->alloc_size, (long long)buffer->pts);
++#endif
++
++ mmal_buffer_header_reset(buffer);
++ mmal_buffer_header_release(buffer);
++
++ if (block != NULL)
++ block_Release(block);
+ }
+
+-static void CloseDecoder(decoder_t *dec)
++static void decoder_output_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
+ {
+- decoder_sys_t *sys = dec->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
++ decoder_t * const dec = (decoder_t *)port->userdata;
+
+- if (!sys)
++ if (buffer->cmd == 0 && buffer->length != 0)
++ {
++#if TRACE_ALL
++ msg_Dbg(dec, "<<< %s: cmd=%d, data=%p, len=%d/%d, pts=%lld", __func__,
++ buffer->cmd, buffer->data, buffer->length, buffer->alloc_size, (long long)buffer->pts);
++#endif
++
++ picture_t *pic = alloc_opaque_pic(dec, buffer);
++#if TRACE_ALL
++ msg_Dbg(dec, "flags=%#x, video flags=%#x", buffer->flags, buffer->type->video.flags);
++#endif
++ if (pic == NULL)
++ msg_Err(dec, "Failed to allocate new picture");
++ else
++ decoder_QueueVideo(dec, pic);
++ // Buffer released or attached to pic - do not release again
+ return;
++ }
+
+- if (sys->component && sys->component->control->is_enabled)
+- mmal_port_disable(sys->component->control);
++ if (buffer->cmd == MMAL_EVENT_FORMAT_CHANGED)
++ {
++ decoder_sys_t * const sys = dec->p_sys;
++ MMAL_EVENT_FORMAT_CHANGED_T * const fmt = mmal_event_format_changed_get(buffer);
++ MMAL_ES_FORMAT_T * const format = mmal_format_alloc();
+
+- if (sys->input && sys->input->is_enabled)
+- mmal_port_disable(sys->input);
++ if (format == NULL)
++ msg_Err(dec, "Failed to allocate new format");
++ else
++ {
++ mmal_format_full_copy(format, fmt->format);
++ format->encoding = MMAL_ENCODING_OPAQUE;
+
+- if (sys->output && sys->output->is_enabled)
+- mmal_port_disable(sys->output);
++ // If no PAR in the stream - see if we've got one from the demux
++ if (format->es->video.par.den <= 0 || format->es->video.par.num <= 0) {
++ unsigned int n = dec->fmt_in.video.i_sar_num;
++ unsigned int d = dec->fmt_in.video.i_sar_den;
++
++ if (n == 0 || d == 0) {
++ // Guesswork required
++ const unsigned int w = format->es->video.width;
++ const unsigned int h = format->es->video.height;
++ if ((w == 704 || w == 720) && (h == 480 || h == 576)) {
++ // Very likely SD 4:3
++ n = w * 3;
++ d = h * 4;
++ }
++ else
++ {
++ // Otherwise guess SAR 1:1
++ n = 1;
++ d = 1;
++ }
++ }
+
+- if (sys->component && sys->component->is_enabled)
+- mmal_component_disable(sys->component);
++ format->es->video.par = rationalize_sar(n, d);
++ }
+
+- if (sys->input_pool)
+- mmal_pool_destroy(sys->input_pool);
++ if (sys->output_format != NULL)
++ mmal_format_free(sys->output_format);
+
+- if (sys->output_format)
+- mmal_format_free(sys->output_format);
++ sys->output_format = format;
++ }
++ }
++ else if (buffer->cmd != 0) {
++ char buf0[5];
++ msg_Warn(dec, "Unexpected output cb event: %s", str_fourcc(buf0, buffer->cmd));
++ }
+
+- if (sys->output_pool)
+- mmal_pool_destroy(sys->output_pool);
++ // If we get here then we were flushing (cmd == 0 && len == 0) or
++ // that was an EVENT - in either case we want to release the buffer
++ // back to its pool rather than recycle it.
++ mmal_buffer_header_reset(buffer);
++ buffer->user_data = NULL;
++ mmal_buffer_header_release(buffer);
++}
+
+- if (sys->component)
+- mmal_component_release(sys->component);
+
+- vlc_sem_destroy(&sys->sem);
+- free(sys);
+
+- bcm_host_deinit();
++static void fill_output_port(decoder_t *dec)
++{
++ decoder_sys_t *sys = dec->p_sys;
++
++ if (decoder_UpdateVideoFormat(dec) != 0)
++ {
++ // If we have a new format don't bother stuffing the buffer
++ // We should get a reset RSN
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: Updated", __func__);
++#endif
++
++ return;
++ }
++
++ hw_mmal_port_pool_ref_fill(sys->ppr);
++ return;
+ }
+
+ static int change_output_format(decoder_t *dec)
+ {
+ MMAL_PARAMETER_VIDEO_INTERLACE_TYPE_T interlace_type;
+- decoder_sys_t *sys = dec->p_sys;
++ decoder_sys_t * const sys = dec->p_sys;
+ MMAL_STATUS_T status;
+- int pool_size;
+ int ret = 0;
+
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: <<<", __func__);
++#endif
++
+ if (atomic_load(&sys->started)) {
+ mmal_format_full_copy(sys->output->format, sys->output_format);
+ status = mmal_port_format_commit(sys->output);
+@@ -300,7 +476,9 @@ static int change_output_format(decoder_
+ }
+
+ port_reset:
++#if TRACE_ALL
+ msg_Dbg(dec, "%s: Do full port reset", __func__);
++#endif
+ status = mmal_port_disable(sys->output);
+ if (status != MMAL_SUCCESS) {
+ msg_Err(dec, "Failed to disable output port (status=%"PRIx32" %s)",
+@@ -310,6 +488,7 @@ port_reset:
+ }
+
+ mmal_format_full_copy(sys->output->format, sys->output_format);
++
+ status = mmal_port_format_commit(sys->output);
+ if (status != MMAL_SUCCESS) {
+ msg_Err(dec, "Failed to commit output format (status=%"PRIx32" %s)",
+@@ -318,18 +497,10 @@ port_reset:
+ goto out;
+ }
+
+- if (sys->opaque) {
+- sys->output->buffer_num = NUM_DECODER_BUFFER_HEADERS;
+- pool_size = NUM_DECODER_BUFFER_HEADERS;
+- } else {
+- sys->output->buffer_num = __MAX(sys->output->buffer_num_recommended,
+- MIN_NUM_BUFFERS_IN_TRANSIT);
+- pool_size = sys->output->buffer_num;
+- }
+-
++ sys->output->buffer_num = NUM_DECODER_BUFFER_HEADERS;
+ sys->output->buffer_size = sys->output->buffer_size_recommended;
+
+- status = mmal_port_enable(sys->output, output_port_cb);
++ status = mmal_port_enable(sys->output, decoder_output_cb);
+ if (status != MMAL_SUCCESS) {
+ msg_Err(dec, "Failed to enable output port (status=%"PRIx32" %s)",
+ status, mmal_status_to_string(status));
+@@ -338,25 +509,14 @@ port_reset:
+ }
+
+ if (!atomic_load(&sys->started)) {
+- if (!sys->opaque) {
+- sys->output_pool = mmal_port_pool_create(sys->output, pool_size, 0);
+- msg_Dbg(dec, "Created output pool with %d pictures", sys->output_pool->headers_num);
+- }
+-
+ atomic_store(&sys->started, true);
+
+ /* we need one picture from vout for each buffer header on the output
+ * port */
+- dec->i_extra_picture_buffers = pool_size;
+-
+- /* remove what VLC core reserves as it is part of the pool_size
+- * already */
+- if (dec->fmt_in.i_codec == VLC_CODEC_H264)
+- dec->i_extra_picture_buffers -= 19;
+- else
+- dec->i_extra_picture_buffers -= 3;
+-
++ dec->i_extra_picture_buffers = 10;
++#if TRACE_ALL
+ msg_Dbg(dec, "Request %d extra pictures", dec->i_extra_picture_buffers);
++#endif
+ }
+
+ apply_fmt:
+@@ -366,8 +526,8 @@ apply_fmt:
+ dec->fmt_out.video.i_y_offset = sys->output->format->es->video.crop.y;
+ dec->fmt_out.video.i_visible_width = sys->output->format->es->video.crop.width;
+ dec->fmt_out.video.i_visible_height = sys->output->format->es->video.crop.height;
+- dec->fmt_out.video.i_sar_num = sys->output->format->es->video.par.num;
+- dec->fmt_out.video.i_sar_den = sys->output->format->es->video.par.den;
++ dec->fmt_out.video.i_sar_num = sys->output_format->es->video.par.num; // SAR can be killed by commit
++ dec->fmt_out.video.i_sar_den = sys->output_format->es->video.par.den;
+ dec->fmt_out.video.i_frame_rate = sys->output->format->es->video.frame_rate.num;
+ dec->fmt_out.video.i_frame_rate_base = sys->output->format->es->video.frame_rate.den;
+
+@@ -382,12 +542,19 @@ apply_fmt:
+ sys->b_progressive = (interlace_type.eMode == MMAL_InterlaceProgressive);
+ sys->b_top_field_first = sys->b_progressive ? true :
+ (interlace_type.eMode == MMAL_InterlaceFieldsInterleavedUpperFirst);
++#if TRACE_ALL
+ msg_Dbg(dec, "Detected %s%s video (%d)",
+ sys->b_progressive ? "progressive" : "interlaced",
+ sys->b_progressive ? "" : (sys->b_top_field_first ? " tff" : " bff"),
+ interlace_type.eMode);
++#endif
+ }
+
++ // Tell the rest of the world we have changed format
++ vlc_mutex_lock(&sys->pic_lock);
++ ret = decoder_UpdateVideoFormat(dec);
++ vlc_mutex_unlock(&sys->pic_lock);
++
+ out:
+ mmal_format_free(sys->output_format);
+ sys->output_format = NULL;
+@@ -395,144 +562,85 @@ out:
+ return ret;
+ }
+
+-static int send_output_buffer(decoder_t *dec)
++static MMAL_STATUS_T
++set_extradata_and_commit(decoder_t * const dec, decoder_sys_t * const sys)
+ {
+- decoder_sys_t *sys = dec->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
+- picture_sys_t *p_sys;
+- picture_t *picture = NULL;
+ MMAL_STATUS_T status;
+- unsigned buffer_size = 0;
+- int ret = 0;
+
+- if (!sys->output->is_enabled)
+- return VLC_EGENERIC;
+-
+- /* If local output pool is allocated, use it - this is only the case for
+- * non-opaque modes */
+- if (sys->output_pool) {
+- buffer = mmal_queue_get(sys->output_pool->queue);
+- if (!buffer) {
+- msg_Warn(dec, "Failed to get new buffer");
+- return VLC_EGENERIC;
+- }
+- }
+-
+- if (!decoder_UpdateVideoFormat(dec))
+- picture = decoder_NewPicture(dec);
+- if (!picture) {
+- msg_Warn(dec, "Failed to get new picture");
+- ret = -1;
+- goto err;
+- }
+-
+- p_sys = picture->p_sys;
+- for (int i = 0; i < picture->i_planes; i++)
+- buffer_size += picture->p[i].i_lines * picture->p[i].i_pitch;
+-
+- if (sys->output_pool) {
+- mmal_buffer_header_reset(buffer);
+- buffer->alloc_size = sys->output->buffer_size;
+- if (buffer_size < sys->output->buffer_size) {
+- msg_Err(dec, "Retrieved picture with too small data block (%d < %d)",
+- buffer_size, sys->output->buffer_size);
+- ret = VLC_EGENERIC;
+- goto err;
+- }
+-
+- if (!sys->opaque)
+- buffer->data = picture->p[0].p_pixels;
+- } else {
+- buffer = p_sys->buffer;
+- if (!buffer) {
+- msg_Warn(dec, "Picture has no buffer attached");
+- picture_Release(picture);
+- return VLC_EGENERIC;
+- }
+- buffer->data = p_sys->buffer->data;
+- }
+- buffer->user_data = picture;
+- buffer->cmd = 0;
+-
+- status = mmal_port_send_buffer(sys->output, buffer);
++ status = mmal_port_format_commit(sys->input);
+ if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to send buffer to output port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- ret = -1;
+- goto err;
+- }
+- atomic_fetch_add(&sys->output_in_transit, 1);
+-
+- return ret;
+-
+-err:
+- if (picture)
+- picture_Release(picture);
+- if (sys->output_pool && buffer) {
+- buffer->data = NULL;
+- mmal_buffer_header_release(buffer);
++ msg_Err(dec, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
+ }
+- return ret;
++ return status;
+ }
+
+-static void fill_output_port(decoder_t *dec)
++static MMAL_STATUS_T decoder_send_extradata(decoder_t * const dec, decoder_sys_t *const sys)
+ {
+- decoder_sys_t *sys = dec->p_sys;
+-
+- unsigned max_buffers_in_transit = 0;
+- int buffers_available = 0;
+- int buffers_to_send = 0;
+- int i;
++ if (dec->fmt_in.i_codec == VLC_CODEC_H264 &&
++ dec->fmt_in.i_extra > 0)
++ {
++ MMAL_BUFFER_HEADER_T * const buf = mmal_queue_wait(sys->input_pool->queue);
++ MMAL_STATUS_T status;
++
++ mmal_buffer_header_reset(buf);
++ buf->cmd = 0;
++ buf->user_data = NULL;
++ buf->alloc_size = sys->input->buffer_size;
++ buf->length = dec->fmt_in.i_extra;
++ buf->data = dec->fmt_in.p_extra;
++ buf->flags = MMAL_BUFFER_HEADER_FLAG_CONFIG;
+
+- if (sys->output_pool) {
+- max_buffers_in_transit = __MAX(sys->output_pool->headers_num,
+- MIN_NUM_BUFFERS_IN_TRANSIT);
+- buffers_available = mmal_queue_length(sys->output_pool->queue);
+- } else {
+- max_buffers_in_transit = NUM_DECODER_BUFFER_HEADERS;
+- buffers_available = NUM_DECODER_BUFFER_HEADERS - atomic_load(&sys->output_in_transit);
++ status = mmal_port_send_buffer(sys->input, buf);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to send extradata buffer to input port (status=%"PRIx32" %s)",
++ status, mmal_status_to_string(status));
++ return status;
++ }
+ }
+- buffers_to_send = max_buffers_in_transit - atomic_load(&sys->output_in_transit);
+
+- if (buffers_to_send > buffers_available)
+- buffers_to_send = buffers_available;
+-
+-#ifndef NDEBUG
+- msg_Dbg(dec, "Send %d buffers to output port (available: %d, "
+- "in_transit: %d, buffer_num: %d)",
+- buffers_to_send, buffers_available,
+- atomic_load(&sys->output_in_transit),
+- sys->output->buffer_num);
+-#endif
+- for (i = 0; i < buffers_to_send; ++i)
+- if (send_output_buffer(dec) < 0)
+- break;
++ return MMAL_SUCCESS;
+ }
+
+ static void flush_decoder(decoder_t *dec)
+ {
+- decoder_sys_t *sys = dec->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
+- MMAL_STATUS_T status;
++ decoder_sys_t *const sys = dec->p_sys;
+
+- msg_Dbg(dec, "Flushing decoder ports...");
+- mmal_port_flush(sys->output);
+- mmal_port_flush(sys->input);
+-
+- while (atomic_load(&sys->output_in_transit) ||
+- atomic_load(&sys->input_in_transit))
+- vlc_sem_wait(&sys->sem);
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: <<<", __func__);
++#endif
++
++ if (!sys->b_flushed) {
++ mmal_port_disable(sys->input);
++ mmal_port_disable(sys->output);
++ // We can leave the input disabled, but we want the output enabled
++ // in order to sink any buffers returning from other modules
++ mmal_port_enable(sys->output, decoder_output_cb);
++ sys->b_flushed = true;
++ }
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: >>>", __func__);
++#endif
+ }
+
+ static int decode(decoder_t *dec, block_t *block)
+ {
+ decoder_sys_t *sys = dec->p_sys;
+ MMAL_BUFFER_HEADER_T *buffer;
+- bool need_flush = false;
+ uint32_t len;
+- uint32_t flags = 0;
++ uint32_t flags = MMAL_BUFFER_HEADER_FLAG_FRAME_START;
+ MMAL_STATUS_T status;
+
++#if TRACE_ALL
++ msg_Dbg(dec, "<<< %s: %lld/%lld", __func__, block == NULL ? -1LL : block->i_dts, block == NULL ? -1LL : block->i_pts);
++#endif
++
++ if (sys->err_stream != MMAL_SUCCESS) {
++ msg_Err(dec, "MMAL error reported by ctrl");
++ flush_decoder(dec);
++ return VLCDEC_ECRITICAL; /// I think they are all fatal
++ }
++
+ /*
+ * Configure output port if necessary
+ */
+@@ -541,18 +649,50 @@ static int decode(decoder_t *dec, block_
+ msg_Err(dec, "Failed to change output port format");
+ }
+
+- if (!block)
+- goto out;
++ if (block == NULL)
++ return VLCDEC_SUCCESS;
+
+ /*
+ * Check whether full flush is required
+ */
+- if (block && block->i_flags & BLOCK_FLAG_DISCONTINUITY) {
++ if (block->i_flags & BLOCK_FLAG_DISCONTINUITY) {
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: >>> Discontinuity", __func__);
++#endif
+ flush_decoder(dec);
++ }
++
++ if (block->i_buffer == 0)
++ {
+ block_Release(block);
+ return VLCDEC_SUCCESS;
+ }
+
++ // Reenable stuff if the last thing we did was flush
++ if (!sys->output->is_enabled &&
++ (status = mmal_port_enable(sys->output, decoder_output_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(dec, "Output port enable failed");
++ goto fail;
++ }
++
++ if (!sys->input->is_enabled)
++ {
++ if ((status = set_extradata_and_commit(dec, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ if ((status = mmal_port_enable(sys->input, input_port_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(dec, "Input port enable failed");
++ goto fail;
++ }
++
++ if ((status = decoder_send_extradata(dec, sys)) != MMAL_SUCCESS)
++ goto fail;
++ }
++
++ // *** We cannot get a picture to put the result in 'till we have
++ // reported the size & the output stages have been set up
+ if (atomic_load(&sys->started))
+ fill_output_port(dec);
+
+@@ -563,18 +703,21 @@ static int decode(decoder_t *dec, block_
+ if (block->i_flags & BLOCK_FLAG_CORRUPTED)
+ flags |= MMAL_BUFFER_HEADER_FLAG_CORRUPTED;
+
+- while (block && block->i_buffer > 0) {
+- buffer = mmal_queue_timedwait(sys->input_pool->queue, 100);
++ while (block != NULL)
++ {
++ buffer = mmal_queue_wait(sys->input_pool->queue);
+ if (!buffer) {
+ msg_Err(dec, "Failed to retrieve buffer header for input data");
+- need_flush = true;
+- break;
++ goto fail;
+ }
++
+ mmal_buffer_header_reset(buffer);
+ buffer->cmd = 0;
+- buffer->pts = block->i_pts != 0 ? block->i_pts : block->i_dts;
++ buffer->pts = block->i_pts != VLC_TICK_INVALID ? block->i_pts :
++ block->i_dts != VLC_TICK_INVALID ? block->i_dts : MMAL_TIME_UNKNOWN;
+ buffer->dts = block->i_dts;
+ buffer->alloc_size = sys->input->buffer_size;
++ buffer->user_data = NULL;
+
+ len = block->i_buffer;
+ if (len > buffer->alloc_size)
+@@ -585,94 +728,1808 @@ static int decode(decoder_t *dec, block_
+ block->i_buffer -= len;
+ buffer->length = len;
+ if (block->i_buffer == 0) {
++ flags |= MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++ if (block->i_flags & BLOCK_FLAG_END_OF_SEQUENCE) {
++ msg_Dbg(dec, "EOS sent");
++ flags |= MMAL_BUFFER_HEADER_FLAG_EOS;
++ }
+ buffer->user_data = block;
+ block = NULL;
+ }
+ buffer->flags = flags;
+
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: -- Send buffer: cmd=%d, data=%p, size=%d, len=%d, offset=%d, flags=%#x, pts=%lld, dts=%lld", __func__,\
++ buffer->cmd, buffer->data, buffer->alloc_size, buffer->length, buffer->offset,
++ buffer->flags, (long long)buffer->pts, (long long)buffer->dts);
++#endif
+ status = mmal_port_send_buffer(sys->input, buffer);
+ if (status != MMAL_SUCCESS) {
+ msg_Err(dec, "Failed to send buffer to input port (status=%"PRIx32" %s)",
+ status, mmal_status_to_string(status));
+- break;
++ goto fail;
+ }
+- atomic_fetch_add(&sys->input_in_transit, 1);
++
++ // Reset flushed flag once we have sent a buf
++ sys->b_flushed = false;
++ flags &= ~MMAL_BUFFER_HEADER_FLAG_FRAME_START;
+ }
++ return VLCDEC_SUCCESS;
+
+-out:
+- if (need_flush)
+- flush_decoder(dec);
++fail:
++ flush_decoder(dec);
++ return VLCDEC_ECRITICAL;
+
+- return VLCDEC_SUCCESS;
+ }
+
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++
++// Tear down everything OpenDecoder() may have set up and free the decoder state.
++// Safe to call on a partially constructed decoder: returns early when
++// dec->p_sys is NULL and guards each resource that may not exist yet.
++// OpenDecoder() uses this as its failure path.
++static void CloseDecoder(decoder_t *dec)
+ {
+- decoder_t *dec = (decoder_t *)port->userdata;
++ decoder_sys_t *sys = dec->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: <<<", __func__);
++#endif
++
++ if (!sys)
++ return;
++
++ // sys->input / sys->output are assigned in OpenDecoder() straight after the
++ // component is created, so they are only dereferenced under this guard
++ if (sys->component != NULL) {
++ if (sys->input->is_enabled)
++ mmal_port_disable(sys->input);
++
++ if (sys->output->is_enabled)
++ mmal_port_disable(sys->output);
++
++ if (sys->component->control->is_enabled)
++ mmal_port_disable(sys->component->control);
++
++ if (sys->component->is_enabled)
++ mmal_component_disable(sys->component);
++
++ mmal_component_release(sys->component);
++ }
++
++ if (sys->input_pool != NULL)
++ mmal_pool_destroy(sys->input_pool);
++
++ if (sys->output_format != NULL)
++ mmal_format_free(sys->output_format);
++
++ // NOTE(review): called unconditionally, so this presumably tolerates a NULL
++ // sys->ppr (e.g. when OpenDecoder failed early) - confirm in the helper
++ hw_mmal_port_pool_ref_release(sys->ppr, false);
++
++ cma_vcsm_exit(sys->vcsm_init_type);
++
++ vlc_mutex_destroy(&sys->pic_lock);
++ free(sys);
++}
++
++// Create and configure the MMAL video decoder component for dec->fmt_in.
++//
++// Returns 0 on success. Returns VLC_EGENERIC if the input codec is not in
++// supported_decode_in_enc (checked before anything is allocated) or if any
++// later setup step fails, and VLC_ENOMEM if the decoder_sys_t allocation
++// fails. Every failure after the codec check unwinds through CloseDecoder().
++// On success dec->pf_decode / dec->pf_flush are installed and dec->fmt_out
++// is set to the MMAL opaque format.
++static int OpenDecoder(decoder_t *dec)
++{
++ int ret = VLC_EGENERIC;
++ decoder_sys_t *sys;
+ MMAL_STATUS_T status;
++ const MMAL_FOURCC_T in_fcc = vlc_to_mmal_es_fourcc(dec->fmt_in.i_codec);
++
++ // NOTE: the "|| 1" makes this format dump unconditional whatever TRACE_ALL is
++#if TRACE_ALL || 1
++ {
++ char buf1[5], buf2[5], buf2a[5];
++ char buf3[5], buf4[5];
++ MMAL_RATIONAL_T r = rationalize_sar(dec->fmt_in.video.i_sar_num, dec->fmt_in.video.i_sar_den);
++
++ msg_Dbg(dec, "%s: <<< (%s/%s)[%s] %dx%d %d/%d=%d/%d o:%#x -> (%s/%s) %dx%d %d/%d o:%#x", __func__,
++ str_fourcc(buf1, dec->fmt_in.i_codec),
++ str_fourcc(buf2, dec->fmt_in.video.i_chroma),
++ str_fourcc(buf2a, in_fcc),
++ dec->fmt_in.video.i_width, dec->fmt_in.video.i_height,
++ dec->fmt_in.video.i_sar_num, dec->fmt_in.video.i_sar_den,
++ r.num, r.den,
++ (int)dec->fmt_in.video.orientation,
++ str_fourcc(buf3, dec->fmt_out.i_codec),
++ str_fourcc(buf4, dec->fmt_out.video.i_chroma),
++ dec->fmt_out.video.i_width, dec->fmt_out.video.i_height,
++ dec->fmt_out.video.i_sar_num, dec->fmt_out.video.i_sar_den,
++ (int)dec->fmt_out.video.orientation);
++ }
++#endif
++
++ // Cheap table check first; nothing has been allocated so return directly
++ if (!is_enc_supported(&supported_decode_in_enc, in_fcc))
++ return VLC_EGENERIC;
++
++ sys = calloc(1, sizeof(decoder_sys_t));
++ if (!sys) {
++ ret = VLC_ENOMEM;
++ goto fail;
++ }
++ dec->p_sys = sys;
++ vlc_mutex_init(&sys->pic_lock);
++
++ if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++ msg_Err(dec, "VCSM init failed");
++ goto fail;
++ }
++ msg_Info(dec, "VCSM init succeeded: %s", cma_vcsm_init_str(sys->vcsm_init_type));
++
++ sys->err_stream = MMAL_SUCCESS;
++
++ status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_DECODER, &sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
++ MMAL_COMPONENT_DEFAULT_VIDEO_DECODER, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ sys->input = sys->component->input[0];
++ sys->output = sys->component->output[0];
++
++ sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
++ sys->input->format->encoding = in_fcc;
++
++ // Second, port-level check of the encoding against the live component
++ if (!set_and_test_enc_supported(&supported_decode_in_enc, sys->input, in_fcc)) {
++#if TRACE_ALL
++ char cbuf[5];
++ msg_Dbg(dec, "Format not supported: %s", str_fourcc(cbuf, in_fcc));
++#endif
++ goto fail;
++ }
++
++ sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
++ status = mmal_port_enable(sys->component->control, control_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to enable control port %s (status=%"PRIx32" %s)",
++ sys->component->control->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ if ((status = set_extradata_and_commit(dec, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ sys->input->buffer_size = sys->input->buffer_size_recommended;
++ sys->input->buffer_num = sys->input->buffer_num_recommended;
++
++ status = mmal_port_enable(sys->input, input_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to enable input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ // Set vanishingly unlikely shape (or at least crop)
++ // to ensure that we get a resolution changed event
++ // Small wxh are rejected (128x128 is rejected) so pick a
++ // plausible size.
++ // Crop doesn't seem to be checked for being constrained by wxh
++ // so we could place it outside the pic to be sure that it is
++ // never matched but stick with something legal in case it is ever
++ // actually checked
++ sys->output->format->es->video.height = 256;
++ sys->output->format->es->video.width = 256;
++ sys->output->format->es->video.crop.height = 4;
++ sys->output->format->es->video.crop.width = 2;
++ sys->output->format->es->video.crop.x = 66;
++ sys->output->format->es->video.crop.y = 88;
++
++ if ((status = hw_mmal_opaque_output(VLC_OBJECT(dec), &sys->ppr,
++ sys->output, NUM_EXTRA_BUFFERS, decoder_output_cb)) != MMAL_SUCCESS)
++ goto fail;
++
++ status = mmal_component_enable(sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to enable component %s (status=%"PRIx32" %s)",
++ sys->component->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ // Pool of headers only (payload size 0)
++ if ((sys->input_pool = mmal_pool_create(sys->input->buffer_num, 0)) == NULL)
++ {
++ msg_Err(dec, "Failed to create input pool");
++ goto fail;
++ }
++
++ // Nothing has been sent to the decoder yet
++ sys->b_flushed = true;
++
++ if ((status = decoder_send_extradata(dec, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ // Given no better ideas at this point copy input format to output
++ // This also copies container stuff (such as orientation) that we do not
++ // decode from the ES but may be important to display
++ video_format_Copy(&dec->fmt_out.video, &dec->fmt_in.video);
++ dec->fmt_out.i_codec = VLC_CODEC_MMAL_OPAQUE;
++ dec->fmt_out.video.i_chroma = VLC_CODEC_MMAL_OPAQUE;
++
++
++ dec->pf_decode = decode;
++ dec->pf_flush = flush_decoder;
++
++#if TRACE_ALL
++ msg_Dbg(dec, ">>> %s: ok", __func__);
++#endif
++ return 0;
++
++fail:
++ // CloseDecoder copes with whatever subset of the setup above completed
++ CloseDecoder(dec);
++#if TRACE_ALL
++msg_Dbg(dec, ">>> %s: FAIL: ret=%d", __func__, ret);
++#endif
++ return ret;
++}
++
++// ----------------------------
++
++#define CONV_MAX_LATENCY 1 // In frames
++
++// Singly linked FIFO of pictures chained through picture_t::p_next.
++// An empty fifo is head == NULL; tail is only meaningful while head != NULL
++// (the get functions never update it).
++typedef struct pic_fifo_s {
++ picture_t * head;
++ picture_t * tail;
++} pic_fifo_t;
++
++// Pop the oldest picture from the fifo.
++// Returns NULL if the fifo is empty; otherwise the returned picture is
++// unlinked (its p_next is cleared). No locking is done here.
++static inline picture_t * pic_fifo_get(pic_fifo_t * const pf)
++{
++ picture_t * const pic = pf->head;;
++ if (pic != NULL) {
++ pf->head = pic->p_next;
++ pic->p_next = NULL;
++ }
++ return pic;
++}
++
++// Detach and return the whole chain of queued pictures, leaving the fifo empty.
++// The returned pictures are still linked to each other through p_next;
++// returns NULL if the fifo was empty.
++static inline picture_t * pic_fifo_get_all(pic_fifo_t * const pf)
++{
++ picture_t * const pic = pf->head;;
++ pf->head = NULL;
++ return pic;
++}
++
++// Empty the fifo, calling picture_Release() on every queued picture.
++static inline void pic_fifo_release_all(pic_fifo_t * const pf)
++{
++ picture_t * pic;
++ while ((pic = pic_fifo_get(pf)) != NULL) {
++ picture_Release(pic);
++ }
++}
++
++// Put the fifo into the empty state (head == NULL).
++static inline void pic_fifo_init(pic_fifo_t * const pf)
++{
++ pf->head = NULL;
++ pf->tail = NULL; // Not strictly needed
++}
++
++// Append pic to the end of the fifo. pic->p_next is overwritten, so pic must
++// be a single picture, not a chain. No locking is done here.
++static inline void pic_fifo_put(pic_fifo_t * const pf, picture_t * pic)
++{
++ pic->p_next = NULL;
++ // tail is stale when the fifo is empty, so only follow it when head is set
++ if (pf->head == NULL)
++ pf->head = pic;
++ else
++ pf->tail->p_next = pic;
++ pf->tail = pic;
++}
++
++#define SUBS_MAX 3
++
++// Selects the converter variant stored in filter_sys_t::resizer_type.
++// In the code visible here only FILTER_RESIZER_HVS is special-cased: it takes
++// the hw_mmal_subpic_* path and accepts input format changes without the
++// input port being disabled first.
++typedef enum filter_resizer_e {
++ FILTER_RESIZER_RESIZER,
++ FILTER_RESIZER_ISP,
++ FILTER_RESIZER_HVS
++} filter_resizer_t;
++
++// Per-frame side data parked while a frame is inside the converter.
++// conv_filter() overwrites the input buffer pts with its frame sequence
++// number, so the real pts is kept here and restored by conv_stash_fixup().
++typedef struct conv_frame_stash_s
++{
++ mtime_t pts; // Original picture date
++ MMAL_BUFFER_HEADER_T * sub_bufs[SUBS_MAX]; // Acquired subpicture buffers (NULL = unused slot)
++} conv_frame_stash_t;
++
++// Private state of the MMAL converter/resizer filter.
++typedef struct filter_sys_t {
++ filter_resizer_t resizer_type;
++ MMAL_COMPONENT_T *component;
++ MMAL_PORT_T *input;
++ MMAL_PORT_T *output;
++ MMAL_POOL_T *out_pool; // Free output buffers
++ MMAL_POOL_T *in_pool; // Input pool to get BH for replication
++
++ cma_buf_pool_t * cma_in_pool;
++ cma_buf_pool_t * cma_out_pool; // Created on demand by conv_enable_out() when is_cma
++
++ subpic_reg_stash_t subs[SUBS_MAX];
++
++ // Converted pictures queued by the output callbacks and drained by
++ // conv_get_out_pics(); accessed under lock
++ pic_fifo_t ret_pics;
++
++ unsigned int pic_n; // Pictures submitted by conv_filter() since the last flush
++ vlc_sem_t sem; // Posted per picture queued on ret_pics (and on slice error)
++ vlc_mutex_t lock; // Guards ret_pics and slice.pics
++
++ // Set by the port callbacks on error, checked by conv_filter(),
++ // reset to MMAL_SUCCESS by conv_flush()
++ MMAL_STATUS_T err_stream;
++
++ bool needs_copy_in;
++ bool is_cma;
++ bool is_sliced;
++ bool out_fmt_set; // Output format & out_pool are set up on the first conv_filter() call
++ const char * component_name;
++ MMAL_PORT_BH_CB_T in_port_cb_fn;
++ MMAL_PORT_BH_CB_T out_port_cb_fn;
++
++ uint64_t frame_seq; // Incremented per conv_filter() call; used as the buffer pts
++ conv_frame_stash_t stash[16]; // Indexed by (frame_seq & 0xf)
++
++ // Slice specific tracking stuff
++ struct {
++ pic_fifo_t pics; // Destination pictures waiting for slice data
++ unsigned int line; // Lines filled
++ } slice;
++
++ vcsm_init_type_t vcsm_init_type;
++} filter_sys_t;
++
++
++// Fill es_fmt with a video format describing pic.
++// Encoding and crop come from pic->format; width/height are then replaced by
++// the allocated geometry of plane 0 (pitch in pixels, line count) when the
++// format has a non-zero bits-per-pixel.
++// As written this always returns MMAL_SUCCESS.
++static MMAL_STATUS_T pic_to_format(MMAL_ES_FORMAT_T * const es_fmt, const picture_t * const pic)
++{
++ // Bytes per pixel, rounded up
++ unsigned int bpp = (pic->format.i_bits_per_pixel + 7) >> 3;
++ MMAL_VIDEO_FORMAT_T * const v_fmt = &es_fmt->es->video;
++
++ es_fmt->type = MMAL_ES_TYPE_VIDEO;
++ es_fmt->encoding = vlc_to_mmal_video_fourcc(&pic->format);
++ es_fmt->encoding_variant = 0;
++
++ // Fill in crop etc.
++ hw_mmal_vlc_fmt_to_mmal_fmt(es_fmt, &pic->format);
++ // Override width / height with strides if appropriate
++ // (bpp == 0 would otherwise divide by zero)
++ if (bpp != 0) {
++ v_fmt->width = pic->p[0].i_pitch / bpp;
++ v_fmt->height = pic->p[0].i_lines;
++ }
++ return MMAL_SUCCESS;
++}
++
++
++// Enable the converter input port with sys->in_port_cb_fn if it is not
++// already enabled. Returns MMAL_SUCCESS when the port was already enabled or
++// the enable succeeded; otherwise logs and returns the MMAL error.
++static MMAL_STATUS_T conv_enable_in(filter_t * const p_filter, filter_sys_t * const sys)
++{
++ MMAL_STATUS_T err = MMAL_SUCCESS;
++
++ if (!sys->input->is_enabled &&
++ (err = mmal_port_enable(sys->input, sys->in_port_cb_fn)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Failed to enable input port %s (status=%"PRIx32" %s)",
++ sys->input->name, err, mmal_status_to_string(err));
++ }
++ return err;
++}
++
++// Make the output side ready: reconcile the CMA output pool with sys->is_cma
++// (create it if needed, delete it if not) and enable the output port with
++// sys->out_port_cb_fn if it is not already enabled.
++// Returns MMAL_ENOMEM if the CMA pool cannot be created, the MMAL error if
++// the port enable fails, MMAL_SUCCESS otherwise.
++static MMAL_STATUS_T conv_enable_out(filter_t * const p_filter, filter_sys_t * const sys)
++{
++ MMAL_STATUS_T err = MMAL_SUCCESS;
++
++ if (sys->is_cma)
++ {
++ // Pool survives flushes; only allocate it the first time through
++ if (sys->cma_out_pool == NULL &&
++ (sys->cma_out_pool = cma_buf_pool_new(CONVERTER_BUFFERS, CONVERTER_BUFFERS, true, "mmal_resizer")) == NULL)
++ {
++ msg_Err(p_filter, "Failed to alloc cma buf pool");
++ return MMAL_ENOMEM;
++ }
++ }
++ else
++ {
++ cma_buf_pool_deletez(&sys->cma_out_pool);
++ }
++
++ if (!sys->output->is_enabled &&
++ (err = mmal_port_enable(sys->output, sys->out_port_cb_fn)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Failed to enable output port %s (status=%"PRIx32" %s)",
++ sys->output->name, err, mmal_status_to_string(err));
++ }
++ return err;
++}
++
++// Control-port callback for the converter component.
++// On MMAL_EVENT_ERROR the reported status is latched in p_sys->err_stream
++// (picked up later by conv_filter()) and logged; all other events are
++// ignored. The event buffer is always released.
++static void conv_control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{
++ filter_t * const p_filter = (filter_t *)port->userdata;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "%s: <<< cmd=%d, data=%p, pic=%p", __func__, buffer->cmd, buffer->data, buffer->user_data);
++#endif
+
+ if (buffer->cmd == MMAL_EVENT_ERROR) {
+- status = *(uint32_t *)buffer->data;
+- msg_Err(dec, "MMAL error %"PRIx32" \"%s\"", status,
++ // The event payload is read as the 32-bit status code
++ MMAL_STATUS_T status = *(uint32_t *)buffer->data;
++
++ p_filter->p_sys->err_stream = status;
++
++ msg_Err(p_filter, "MMAL error %"PRIx32" \"%s\"", status,
+ mmal_status_to_string(status));
+ }
+
+ mmal_buffer_header_release(buffer);
+ }
+
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++// Input-port callback for the converter: the component has finished with an
++// input buffer, so just release it. Everything else here is trace output.
++static void conv_input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
+ {
+- block_t *block = (block_t *)buffer->user_data;
+- decoder_t *dec = (decoder_t *)port->userdata;
+- decoder_sys_t *sys = dec->p_sys;
+- buffer->user_data = NULL;
++#if TRACE_ALL
++ picture_context_t * ctx = buf->user_data;
++// filter_sys_t *const sys = ((filter_t *)port->userdata)->p_sys;
++
++ msg_Dbg((filter_t *)port->userdata, "<<< %s cmd=%d, ctx=%p, buf=%p, flags=%#x, len=%d/%d, pts=%lld",
++ __func__, buf->cmd, ctx, buf, buf->flags, buf->length, buf->alloc_size, (long long)buf->pts);
++#else
++ // port is only used by the trace output above
++ VLC_UNUSED(port);
++#endif
++
++ mmal_buffer_header_release(buf);
++
++#if TRACE_ALL
++ msg_Dbg((filter_t *)port->userdata, ">>> %s", __func__);
++#endif
++}
++
++// Queue a converted picture on sys->ret_pics (under sys->lock) and post
++// sys->sem so that a waiter in conv_get_out_pics() can collect it.
++static void conv_out_q_pic(filter_sys_t * const sys, picture_t * const pic)
++{
++ pic->p_next = NULL;
++
++ vlc_mutex_lock(&sys->lock);
++ pic_fifo_put(&sys->ret_pics, pic);
++ vlc_mutex_unlock(&sys->lock);
+
+- mmal_buffer_header_release(buffer);
+- if (block)
+- block_Release(block);
+- atomic_fetch_sub(&sys->input_in_transit, 1);
+ vlc_sem_post(&sys->sem);
+ }
+
+-static void output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++// Output-port callback for the non-sliced converter path.
++// For a data buffer (cmd == 0) that carries both an attached picture and
++// payload, the buffer properties are copied to the picture, the picture is
++// detached from the buffer and queued for conv_filter(). In every case the
++// buffer header is released at the end; a picture still attached at that
++// point is left in buf->user_data (conv_filter() installs
++// out_buffer_pre_release_cb on these buffers to free it).
++static void conv_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
+ {
+- decoder_t *dec = (decoder_t *)port->userdata;
+- decoder_sys_t *sys = dec->p_sys;
+- picture_t *picture;
+- MMAL_EVENT_FORMAT_CHANGED_T *fmt;
+- MMAL_ES_FORMAT_T *format;
+-
+- if (buffer->cmd == 0) {
+- picture = (picture_t *)buffer->user_data;
+- if (buffer->length > 0) {
+- picture->date = buffer->pts;
+- picture->b_progressive = sys->b_progressive;
+- picture->b_top_field_first = sys->b_top_field_first;
+- decoder_QueueVideo(dec, picture);
+- } else {
+- picture_Release(picture);
+- if (sys->output_pool) {
+- buffer->user_data = NULL;
+- buffer->alloc_size = 0;
+- buffer->data = NULL;
+- mmal_buffer_header_release(buffer);
+- }
+- }
+- atomic_fetch_sub(&sys->output_in_transit, 1);
+- vlc_sem_post(&sys->sem);
+- } else if (buffer->cmd == MMAL_EVENT_FORMAT_CHANGED) {
+- fmt = mmal_event_format_changed_get(buffer);
++ filter_t * const p_filter = (filter_t *)port->userdata;
++ filter_sys_t * const sys = p_filter->p_sys;
+
+- format = mmal_format_alloc();
+- mmal_format_full_copy(format, fmt->format);
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s: cmd=%d, flags=%#x, pic=%p, data=%p, len=%d/%d, pts=%lld/%lld", __func__,
++ buf->cmd, buf->flags, buf->user_data, buf->data, buf->length, buf->alloc_size,
++ (long long)buf->pts, (long long)sys->stash[(unsigned int)(buf->pts & 0xf)].pts);
++#endif
++ if (buf->cmd == 0) {
++ picture_t * const pic = (picture_t *)buf->user_data;
+
+- if (sys->opaque)
+- format->encoding = MMAL_ENCODING_OPAQUE;
++ if (pic == NULL) {
++ msg_Err(p_filter, "%s: Buffer has no attached picture", __func__);
++ }
++ else if (buf->data == NULL || buf->length == 0)
++ {
++ // Empty buffer: nothing to queue, pic stays attached to the buffer
++#if TRACE_ALL
++ msg_Dbg(p_filter, "%s: Buffer has no data", __func__);
++#endif
++ }
++ else
++ {
++ buf_to_pic_copy_props(pic, buf);
++
++ // Set pic data pointers from buf aux info now it has it
++ if (sys->is_cma) {
++ if (cma_pic_set_data(pic, sys->output->format, buf) != VLC_SUCCESS)
++ msg_Err(p_filter, "Failed to set data");
++ }
++
++// draw_corners(pic->p[0].p_pixels, pic->p[0].i_pitch / 4, 0, 0, pic->p[0].i_visible_pitch / 4, pic->p[0].i_visible_lines);
++#if DEBUG_SQUARES
++ draw_square(pic->p[0].p_pixels, pic->p[0].i_pitch / 4, 0, 0, 32, 32, 0xffff0000);
++ draw_square(pic->p[0].p_pixels, pic->p[0].i_pitch / 4, 32, 0, 32, 32, 0xff00ff00);
++ draw_square(pic->p[0].p_pixels, pic->p[0].i_pitch / 4, 64, 0, 32, 32, 0xff0000ff);
++#endif
++
++ buf->user_data = NULL; // Responsibility for this pic no longer with buffer
++ conv_out_q_pic(sys, pic);
++ }
++ }
++
++ mmal_buffer_header_release(buf);
++}
++
++
++// Output-port callback for the sliced converter path.
++// Each data buffer carries a slice of at most MMAL_SLICE_HEIGHT lines, which
++// is copied into plane 0 of the picture at the head of sys->slice.pics,
++// starting at line sys->slice.line. When the frame is complete the picture is
++// removed from slice.pics and queued for conv_filter() via conv_out_q_pic().
++// The buffer is then reset and handed straight back to the output port.
++// On an inconsistency (no destination picture, or line count and FRAME_END
++// disagree) sys->err_stream is set to MMAL_EIO, the buffer is released and
++// sys->sem is posted to wake any waiter.
++static void slice_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
++{
++ filter_t * const p_filter = (filter_t *)port->userdata;
++ filter_sys_t * const sys = p_filter->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s: cmd=%d, flags=%#x, pic=%p, data=%p, len=%d/%d, pts=%lld", __func__,
++ buf->cmd, buf->flags, buf->user_data, buf->data, buf->length, buf->alloc_size, (long long)buf->pts);
++#endif
++
++ // Events are not interesting here - just give the buffer back
++ if (buf->cmd != 0)
++ {
++ mmal_buffer_header_release(buf);
++ return;
++ }
++
++ if (buf->data == NULL || buf->length == 0)
++ {
++#if TRACE_ALL
++ msg_Dbg(p_filter, "%s: Buffer has no data", __func__);
++#endif
++ }
++ else
++ {
++ // Got slice
++ picture_t *pic = sys->slice.pics.head;
++ const unsigned int scale_lines = sys->output->format->es->video.height; // Expected lines of callback
++
++ if (pic == NULL) {
++ msg_Err(p_filter, "No output picture");
++ goto fail;
++ }
++
++ // Copy lines
++ // * single plane only - fix for I420
++ {
++ // scale_n: lines this slice represents; copy_n: how many of those
++ // actually fit in the destination picture (may be fewer, or 0)
++ const unsigned int scale_n = __MIN(scale_lines - sys->slice.line, MMAL_SLICE_HEIGHT);
++ const unsigned int pic_lines = pic->p[0].i_lines;
++ const unsigned int copy_n = sys->slice.line + scale_n <= pic_lines ? scale_n :
++ sys->slice.line >= pic_lines ? 0 :
++ pic_lines - sys->slice.line;
++
++ const unsigned int src_stride = buf->type->video.pitch[0];
++ const unsigned int dst_stride = pic->p[0].i_pitch;
++ uint8_t *dst = pic->p[0].p_pixels + sys->slice.line * dst_stride;
++ const uint8_t *src = buf->data + buf->type->video.offset[0];
++
++ if (src_stride == dst_stride) {
++ // Same pitch: one contiguous copy
++ if (copy_n != 0)
++ memcpy(dst, src, src_stride * copy_n);
++ }
++ else {
++ // Pitches differ: copy line by line, clipped to the narrower one
++ unsigned int i;
++ for (i = 0; i != copy_n; ++i) {
++ memcpy(dst, src, __MIN(dst_stride, src_stride));
++ dst += dst_stride;
++ src += src_stride;
++ }
++ }
++ sys->slice.line += scale_n;
++ }
++
++ if ((buf->flags & MMAL_BUFFER_HEADER_FLAG_FRAME_END) != 0 || sys->slice.line >= scale_lines) {
++
++ if ((buf->flags & MMAL_BUFFER_HEADER_FLAG_FRAME_END) == 0 || sys->slice.line != scale_lines) {
++ // Stuff doesn't add up...
++ msg_Err(p_filter, "Line count (%d/%d) & EOF disagree (flags=%#x)", sys->slice.line, scale_lines, buf->flags);
++ goto fail;
++ }
++ else {
++ sys->slice.line = 0;
++
++ vlc_mutex_lock(&sys->lock);
++ pic_fifo_get(&sys->slice.pics); // Remove head from Q
++ vlc_mutex_unlock(&sys->lock);
++
++ buf_to_pic_copy_props(pic, buf);
++ conv_out_q_pic(sys, pic);
++ }
++ }
++ }
++
++ // Put back
++ buf->user_data = NULL; // Zap here to make sure we can't reuse later
++ mmal_buffer_header_reset(buf);
++
++ if (mmal_port_send_buffer(sys->output, buf) != MMAL_SUCCESS) {
++ mmal_buffer_header_release(buf);
++ }
++ return;
++
++fail:
++ // This callback owns buf: every other exit either releases it or hands it
++ // back to the port, so release it here too rather than leaking the header
++ mmal_buffer_header_release(buf);
++ sys->err_stream = MMAL_EIO;
++ vlc_sem_post(&sys->sem); // If we were waiting then break us out - the flush should fix sem values
++}
++
++
++// Flush the converter and return it to its just-opened idle state:
++// flushes subpictures (HVS only), disables the input and output ports,
++// releases every stashed subpicture buffer and every queued picture,
++// re-creates the semaphore at 0 and clears pic_n and err_stream.
++// The ports are re-enabled lazily by the next conv_filter() call.
++static void conv_flush(filter_t * p_filter)
++{
++ filter_sys_t * const sys = p_filter->p_sys;
++ unsigned int i;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s", __func__);
++#endif
++
++ if (sys->resizer_type == FILTER_RESIZER_HVS)
++ {
++ for (i = 0; i != SUBS_MAX; ++i) {
++ hw_mmal_subpic_flush(VLC_OBJECT(p_filter), sys->subs + i);
++ }
++ }
++
++ // Ports may be NULL if this is reached from a partially opened filter
++ if (sys->input != NULL && sys->input->is_enabled)
++ mmal_port_disable(sys->input);
++
++ if (sys->output != NULL && sys->output->is_enabled)
++ mmal_port_disable(sys->output);
++
++// cma_buf_pool_deletez(&sys->cma_out_pool);
++
++ // Free up anything we may have already lying around
++ // Don't need lock as the above disables should have prevented anything
++ // happening in the background
++
++ // 16 == number of entries in sys->stash
++ for (i = 0; i != 16; ++i) {
++ conv_frame_stash_t *const stash = sys->stash + i;
++ unsigned int sub_no;
++
++ stash->pts = MMAL_TIME_UNKNOWN;
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ if (stash->sub_bufs[sub_no] != NULL) {
++ mmal_buffer_header_release(stash->sub_bufs[sub_no]);
++ stash->sub_bufs[sub_no] = NULL;
++ }
++ }
++ }
++
++ pic_fifo_release_all(&sys->slice.pics);
++ pic_fifo_release_all(&sys->ret_pics);
++
++ // Reset sem values - easiest & most reliable way is to just kill & re-init
++ vlc_sem_destroy(&sys->sem);
++ vlc_sem_init(&sys->sem, 0);
++ sys->pic_n = 0;
++
++ // Reset error status
++ sys->err_stream = MMAL_SUCCESS;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s", __func__);
++#endif
++}
++
++// Restore the side data stashed for a converted picture.
++// On entry p_pic->date is used as the stash index (its low 4 bits);
++// conv_filter() sends the frame sequence number as the buffer pts for this
++// purpose. The real pts is written back to p_pic->date and any stashed
++// subpicture buffers are released (blending them is not implemented).
++static void conv_stash_fixup(filter_t * const p_filter, filter_sys_t * const sys, picture_t * const p_pic)
++{
++ conv_frame_stash_t * const stash = sys->stash + (p_pic->date & 0xf);
++ unsigned int sub_no;
++ VLC_UNUSED(p_filter);
++
++ p_pic->date = stash->pts;
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ if (stash->sub_bufs[sub_no] != NULL) {
++ // **** Do stashed blend
++ // **** Aaargh, bother... need to rescale subs too
++
++ mmal_buffer_header_release(stash->sub_bufs[sub_no]);
++ stash->sub_bufs[sub_no] = NULL;
++ }
++ }
++}
++
++// Pre-release hook installed on non-sliced output buffers by conv_filter().
++// An output buffer may still hold a picture reference in user_data when it is
++// released after an error or a flush; detach it and release the picture so it
++// is not leaked. A buffer whose picture was already taken by the output
++// callback has user_data == NULL and nothing is done.
++// Always returns MMAL_FALSE.
++// NOTE(review): MMAL_FALSE is understood to let the normal buffer release
++// proceed - confirm against the MMAL buffer header documentation.
++static MMAL_BOOL_T out_buffer_pre_release_cb(MMAL_BUFFER_HEADER_T *header, void *userdata)
++{
++ VLC_UNUSED(userdata);
++
++ picture_t * const pic = header->user_data;
++ header->user_data = NULL;
++
++ if (pic != NULL)
++ picture_Release(pic);
++
++ return MMAL_FALSE;
++}
++
++// Configure, commit and enable the converter output port.
++// The format is derived from p_filter->fmt_out.video; if pic is non-NULL its
++// allocated geometry overrides the width/height (see pic_to_format()). In
++// sliced mode the height is then forced to MMAL_SLICE_HEIGHT.
++// Returns MMAL_SUCCESS, or the first MMAL error from the format commit or
++// from conv_enable_out().
++static MMAL_STATUS_T conv_set_output(filter_t * const p_filter, filter_sys_t * const sys, picture_t * const pic)
++{
++ MMAL_STATUS_T status;
++
++ sys->output->userdata = (struct MMAL_PORT_USERDATA_T *)p_filter;
++ sys->output->format->type = MMAL_ES_TYPE_VIDEO;
++ sys->output->format->encoding = vlc_to_mmal_video_fourcc(&p_filter->fmt_out.video);
++ sys->output->format->encoding_variant = 0;
++ hw_mmal_vlc_fmt_to_mmal_fmt(sys->output->format, &p_filter->fmt_out.video);
++
++ if (pic != NULL)
++ {
++ // Override default format width/height if we have a pic we need to match
++ if ((status = pic_to_format(sys->output->format, pic)) != MMAL_SUCCESS)
++ {
++ char cbuf[5];
++ msg_Err(p_filter, "Bad format desc: %s, pic=%p, bits=%d", str_fourcc(cbuf, pic->format.i_chroma), pic, pic->format.i_bits_per_pixel);
++ return status;
++ }
++
++ MMAL_VIDEO_FORMAT_T *fmt = &sys->output->format->es->video;
++ msg_Dbg(p_filter, "%s: %dx%d [(0,0) %dx%d]", __func__, fmt->width, fmt->height, fmt->crop.width, fmt->crop.height);
++ }
++
++ if (sys->is_sliced) {
++ // Override height for slice
++ sys->output->format->es->video.height = MMAL_SLICE_HEIGHT;
++ }
++
++ mmal_log_dump_format(sys->output->format);
++
++ status = mmal_port_format_commit(sys->output);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(p_filter, "Failed to commit format for output port %s (status=%"PRIx32" %s)",
++ sys->output->name, status, mmal_status_to_string(status));
++ return status;
++ }
++
++ // Buffer requirements are read after the commit; use at least 16 buffers
++ // when sliced, at least 2 otherwise
++ sys->output->buffer_num = __MAX(sys->is_sliced ? 16 : 2, sys->output->buffer_num_recommended);
++ sys->output->buffer_size = sys->output->buffer_size_recommended;
++
++ if ((status = conv_enable_out(p_filter, sys)) != MMAL_SUCCESS)
++ return status;
++
++ return MMAL_SUCCESS;
++}
++
++
++// Block on sys->sem, then pop one picture from sys->ret_pics.
++// Can return NULL: slice_output_port_cb() also posts the semaphore on error
++// without queuing a picture, so callers must check sys->err_stream.
++static picture_t *conv_get_out_pics(filter_sys_t * const sys)
++{
++ picture_t * ret_pics;
++
++ vlc_sem_wait(&sys->sem);
++
++ // Return a single pending buffer
++ vlc_mutex_lock(&sys->lock);
++ ret_pics = pic_fifo_get(&sys->ret_pics);
++ vlc_mutex_unlock(&sys->lock);
++
++ return ret_pics;
++}
++
++// Filter entry point: submit p_pic to the MMAL converter and return a
++// converted picture if one is due.
++//
++// Takes ownership of p_pic (it is released on every path).
++// Returns a converted picture, or NULL when either
++// - nothing is due yet: the second picture after a flush returns without
++// waiting, which sets up the 1 picture latency, or
++// - something failed, in which case the converter is flushed via
++// conv_flush() and will be re-enabled by the next call.
++//
++// The input buffer pts is replaced by a frame sequence number; the real pts
++// and any subpicture buffers are parked in sys->stash[frame_seq & 0xf] and
++// restored by conv_stash_fixup() on the way out.
++static picture_t *conv_filter(filter_t *p_filter, picture_t *p_pic)
++{
++ filter_sys_t * const sys = p_filter->p_sys;
++ picture_t * ret_pics = NULL;
++ MMAL_STATUS_T err;
++ const uint64_t frame_seq = ++sys->frame_seq;
++ conv_frame_stash_t * const stash = sys->stash + (frame_seq & 0xf);
++ MMAL_BUFFER_HEADER_T * out_buf = NULL;
++
++#if TRACE_ALL
++ {
++ char dbuf0[5], dbuf1[5];
++ msg_Dbg(p_filter, "<<< %s: %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d->%s,%dx%d [(%d,%d) %dx%d] sar:%d/%d", __func__,
++ str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma), p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++ p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++ p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++ p_filter->fmt_in.video.i_sar_num, p_filter->fmt_in.video.i_sar_den,
++ str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma), p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++ p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++ p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height,
++ p_filter->fmt_out.video.i_sar_num, p_filter->fmt_out.video.i_sar_den);
++ }
++#endif
++
++ // An error latched by a callback since the last call
++ if (sys->err_stream != MMAL_SUCCESS) {
++ goto stream_fail;
++ }
++
++ // Check pic fmt corresponds to what we have set up
++ if (hw_mmal_vlc_pic_to_mmal_fmt_update(sys->input->format, p_pic))
++ {
++ msg_Dbg(p_filter, "Reset input port format");
++
++ // HVS can take new formats without disable, others need it
++ if (sys->resizer_type != FILTER_RESIZER_HVS) {
++ // Extract any pending pic
++ if (sys->pic_n >= 2) {
++ ret_pics = conv_get_out_pics(sys);
++ // If pic_n == 1 then we return without trying to get stuff
++ sys->pic_n = 1;
++ }
++ if (sys->input->is_enabled) {
++ if ((err = mmal_port_disable(sys->input)) != MMAL_SUCCESS)
++ msg_Warn(p_filter, "Format update disable failed: %s", mmal_status_to_string(err));
++ }
++ }
++
++// mmal_log_dump_port(sys->input);
++ if ((err = mmal_port_format_commit(sys->input)) != MMAL_SUCCESS)
++ msg_Warn(p_filter, "Format update commit failed: %s", mmal_status_to_string(err));
++
++ // (Re)enable if required will be done later
++ }
++
++ if (p_pic->context == NULL) {
++ // Can't have stashed subpics if not one of our pics
++ if (!sys->needs_copy_in)
++ msg_Dbg(p_filter, "%s: No context", __func__);
++ }
++ else if (sys->resizer_type == FILTER_RESIZER_HVS)
++ {
++ unsigned int sub_no = 0;
++
++ // A zero return ends the subpicture list, a negative one is an error
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ int rv;
++ if ((rv = hw_mmal_subpic_update(VLC_OBJECT(p_filter),
++ hw_mmal_pic_sub_buf_get(p_pic, sub_no),
++ sys->subs + sub_no,
++ &p_pic->format,
++ &sys->output->format->es->video.crop,
++ MMAL_DISPLAY_ROT0,
++ frame_seq)) == 0)
++ break;
++ else if (rv < 0)
++ goto fail;
++ }
++ }
++ else
++ {
++ // Park references to the subpicture buffers until the frame comes back
++ unsigned int sub_no = 0;
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ if ((stash->sub_bufs[sub_no] = hw_mmal_pic_sub_buf_get(p_pic, sub_no)) != NULL) {
++ mmal_buffer_header_acquire(stash->sub_bufs[sub_no]);
++ }
++ }
++ }
++
++ // One-time output setup (format + buffer pool)
++ if (!sys->out_fmt_set) {
++ if (sys->is_sliced) {
++ // If zc then we will do stride conversion when we copy to arm side
++ // so no need to worry about actual pic dimensions here
++ if ((err = conv_set_output(p_filter, sys, NULL)) != MMAL_SUCCESS)
++ goto fail;
++
++ sys->out_pool = mmal_port_pool_create(sys->output, sys->output->buffer_num, sys->output->buffer_size);
++ }
++ else {
++ picture_t *pic = filter_NewPicture(p_filter);
++
++ // filter_NewPicture can fail; do not hand NULL to picture_Release
++ if (pic == NULL) {
++ msg_Err(p_filter, "Failed to alloc required filter output pic");
++ goto fail;
++ }
++
++ err = conv_set_output(p_filter, sys, pic);
++ picture_Release(pic);
++ if (err != MMAL_SUCCESS)
++ goto fail;
++
++ sys->out_pool = mmal_pool_create(sys->output->buffer_num, 0);
++ }
++
++ if (sys->out_pool == NULL) {
++ msg_Err(p_filter, "Failed to create output pool");
++ goto fail;
++ }
++
++ // Only mark the output as set up once it really is: if any step above
++ // failed the next call must retry rather than skip straight to using
++ // a NULL sys->out_pool
++ sys->out_fmt_set = true;
++ }
++
++ // Reenable stuff if the last thing we did was flush
++ if ((err = conv_enable_out(p_filter, sys)) != MMAL_SUCCESS ||
++ (err = conv_enable_in(p_filter, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ // We attach pic to buf before stuffing the output port
++ // We could attach the pic on output for cma, but it is a lot easier to keep
++ // the code common.
++ {
++ picture_t * const out_pic = filter_NewPicture(p_filter);
++
++ if (out_pic == NULL)
++ {
++ msg_Err(p_filter, "Failed to alloc required filter output pic");
++ goto fail;
++ }
++
++ out_pic->format.i_sar_den = p_filter->fmt_out.video.i_sar_den;
++ out_pic->format.i_sar_num = p_filter->fmt_out.video.i_sar_num;
++
++ if (sys->is_sliced) {
++ // out_pic is now owned by slice.pics (released by conv_flush on error)
++ vlc_mutex_lock(&sys->lock);
++ pic_fifo_put(&sys->slice.pics, out_pic);
++ vlc_mutex_unlock(&sys->lock);
++
++ // Poke any returned pic buffers into output
++ // In general this should only happen immediately after enable
++ while ((out_buf = mmal_queue_get(sys->out_pool->queue)) != NULL)
++ mmal_port_send_buffer(sys->output, out_buf);
++ }
++ else
++ {
++ // 1 in - 1 out
++ if ((out_buf = mmal_queue_wait(sys->out_pool->queue)) == NULL)
++ {
++ msg_Err(p_filter, "Failed to get output buffer");
++ picture_Release(out_pic);
++ goto fail;
++ }
++ mmal_buffer_header_reset(out_buf);
++
++ // Attach out_pic to the buffer & ensure it is freed when the buffer is released
++ // On a good send callback the pic will be extracted to avoid this
++ out_buf->user_data = out_pic;
++ mmal_buffer_header_pre_release_cb_set(out_buf, out_buffer_pre_release_cb, NULL);
++
++#if 0
++ {
++ char dbuf0[5];
++ msg_Dbg(p_filter, "out_pic %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d",
++ str_fourcc(dbuf0, out_pic->format.i_chroma),
++ out_pic->format.i_width, out_pic->format.i_height,
++ out_pic->format.i_x_offset, out_pic->format.i_y_offset,
++ out_pic->format.i_visible_width, out_pic->format.i_visible_height,
++ out_pic->format.i_sar_num, out_pic->format.i_sar_den);
++ }
++#endif
++
++ // From here on out_pic is released through out_buf (pre-release cb),
++ // so the failure paths below only need to release out_buf
++ if (sys->is_cma) {
++ int rv;
++
++ cma_buf_t * const cb = cma_buf_pool_alloc_buf(sys->cma_out_pool, sys->output->buffer_size);
++ if (cb == NULL) {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to alloc CMA buf: fmt=%s, size=%d",
++ str_fourcc(dbuf0, out_pic->format.i_chroma),
++ sys->output->buffer_size);
++ goto fail;
++ }
++ const unsigned int vc_h = cma_buf_vc_handle(cb); // Cannot coerce without going via variable
++ out_buf->data = (uint8_t *)vc_h;
++ out_buf->alloc_size = sys->output->buffer_size;
++
++ if ((rv = cma_buf_pic_attach(cb, out_pic)) != VLC_SUCCESS)
++ {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to attach CMA to pic: fmt=%s err=%d",
++ str_fourcc(dbuf0, out_pic->format.i_chroma),
++ rv);
++ cma_buf_unref(cb);
++ goto fail;
++ }
++ }
++ else {
++ out_buf->data = out_pic->p[0].p_pixels;
++ out_buf->alloc_size = out_pic->p[0].i_pitch * out_pic->p[0].i_lines;
++ //**** stride ????
++ }
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "Out buf send: pic=%p, data=%p, user=%p, flags=%#x, len=%d/%d, pts=%lld",
++ p_pic, out_buf->data, out_buf->user_data, out_buf->flags,
++ out_buf->length, out_buf->alloc_size, (long long)out_buf->pts);
++#endif
++
++ if ((err = mmal_port_send_buffer(sys->output, out_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to output failed");
++ goto fail;
++ }
++ // The port owns the buffer now; stop the fail path releasing it
++ out_buf = NULL;
++ }
++ }
++
++
++ // Stuff into input
++ // We assume the BH is already set up with values reflecting pic date etc.
++ stash->pts = p_pic->date;
++ {
++ MMAL_BUFFER_HEADER_T *const pic_buf = sys->needs_copy_in ?
++ hw_mmal_pic_buf_copied(p_pic, sys->in_pool, sys->input, sys->cma_in_pool) :
++ hw_mmal_pic_buf_replicated(p_pic, sys->in_pool);
++
++ // Whether or not we extracted the pic_buf we are done with the picture
++ picture_Release(p_pic);
++ p_pic = NULL;
++
++ if (pic_buf == NULL) {
++ msg_Err(p_filter, "Pic has no attached buffer");
++ goto fail;
++ }
++
++ // The sequence number travels through the converter in place of the
++ // pts so the matching stash entry can be found on output
++ pic_buf->pts = frame_seq;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "In buf send: pic=%p, data=%p, user=%p, flags=%#x, len=%d/%d/%d, pts=%lld",
++ p_pic, pic_buf->data, pic_buf->user_data, pic_buf->flags,
++ pic_buf->length, pic_buf->alloc_size, sys->input->buffer_size, (long long)pic_buf->pts);
++#endif
++
++ if ((err = mmal_port_send_buffer(sys->input, pic_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to input failed");
++ mmal_buffer_header_release(pic_buf);
++ goto fail;
++ }
++ }
++
++ // We have a 1 pic latency for everything except the 1st pic which we
++ // wait for.
++ // This means we get a single static pic out
++ if (sys->pic_n++ == 1) {
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: Pic1=%p", __func__, ret_pics);
++#endif
++ return ret_pics;
++ }
++
++ ret_pics = conv_get_out_pics(sys);
++
++ // The semaphore is also posted on error, possibly with no picture queued
++ if (sys->err_stream != MMAL_SUCCESS)
++ goto stream_fail;
++
++ conv_stash_fixup(p_filter, sys, ret_pics);
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: pic=%p", __func__, ret_pics);
++#endif
++
++ return ret_pics;
++
++stream_fail:
++ msg_Err(p_filter, "MMAL error reported by callback");
++fail:
++#if TRACE_ALL
++ msg_Err(p_filter, ">>> %s: FAIL", __func__);
++#endif
++ if (ret_pics != NULL)
++ picture_Release(ret_pics);
++ if (out_buf != NULL)
++ mmal_buffer_header_release(out_buf);
++ if (p_pic != NULL)
++ picture_Release(p_pic);
++ conv_flush(p_filter);
++ return NULL;
++}
++
++static void CloseConverter(vlc_object_t * obj)
++{
++ filter_t * const p_filter = (filter_t *)obj;
++ filter_sys_t * const sys = p_filter->p_sys;
++ unsigned int i;
++ // Releases everything OpenConverter may have set up; OpenConverter also calls this on its failure and retry paths
++#if TRACE_ALL
++ msg_Dbg(obj, "<<< %s", __func__);
++#endif
++ // Nothing to undo if no state is attached (p_sys is cleared at the end of this function)
++ if (sys == NULL)
++ return;
++
++ // Disables input & output ports
++ conv_flush(p_filter);
++ // Release both CMA buffer pools (NOTE(review): "deletez" presumably also NULLs the pointer - confirm)
++ cma_buf_pool_deletez(&sys->cma_in_pool);
++ cma_buf_pool_deletez(&sys->cma_out_pool);
++
++ if (sys->component && sys->component->control->is_enabled)
++ mmal_port_disable(sys->component->control);
++
++ if (sys->component && sys->component->is_enabled)
++ mmal_component_disable(sys->component);
++ // Subpicture ports are only opened by OpenConverter when the HVS resizer is in use
++ if (sys->resizer_type == FILTER_RESIZER_HVS)
++ {
++ for (i = 0; i != SUBS_MAX; ++i) {
++ hw_mmal_subpic_close(VLC_OBJECT(p_filter), sys->subs + i);
++ }
++ }
++
++ if (sys->out_pool)
++ {
++ if (sys->is_sliced) // Sliced output pool is tied to the output port, so it is destroyed via the port
++ mmal_port_pool_destroy(sys->output, sys->out_pool);
++ else
++ mmal_pool_destroy(sys->out_pool);
++ }
++
++ if (sys->in_pool != NULL)
++ mmal_pool_destroy(sys->in_pool);
++
++ if (sys->component)
++ mmal_component_release(sys->component);
++
++ cma_vcsm_exit(sys->vcsm_init_type); // Undo the cma_vcsm_init() done in OpenConverter
++
++ vlc_sem_destroy(&sys->sem); // sem and lock are initialised right after allocation in OpenConverter
++ vlc_mutex_destroy(&sys->lock);
++
++ p_filter->p_sys = NULL; // Makes a second call to this function a no-op
++ free(sys);
++}
++
++
++static inline MMAL_FOURCC_T filter_enc_in(const video_format_t * const fmt)
++{
++    // Input encoding for the converter: MMAL chromas map directly, plain I420
++    // (including VLC_CODEC_I420_10L) is taken as MMAL_ENCODING_I420, anything else gives 0
++    const vlc_fourcc_t chroma = fmt->i_chroma;
++
++    if (hw_mmal_chroma_is_mmal(chroma))
++        return vlc_to_mmal_video_fourcc(fmt);
++
++    return (chroma == VLC_CODEC_I420 || chroma == VLC_CODEC_I420_10L) ? MMAL_ENCODING_I420 : 0;
++}
++
++static inline MMAL_FOURCC_T filter_enc_out(const video_format_t * const fmt)
++{
++    // Can only copy out single plane stuff currently, so non-MMAL I420 output is refused (0)
++    const MMAL_FOURCC_T enc = vlc_to_mmal_video_fourcc(fmt);
++    return (!hw_mmal_chroma_is_mmal(fmt->i_chroma) && enc == MMAL_ENCODING_I420) ? 0 : enc;
++}
++
++// OpenConverter: set up the MMAL converter filter on the resizer, ISP or HVS component, retrying with a fallback when one fails
++static int OpenConverter(vlc_object_t * obj)
++{
++ filter_t * const p_filter = (filter_t *)obj;
++ int ret = VLC_EGENERIC;
++ filter_sys_t *sys;
++ MMAL_STATUS_T status = MMAL_SUCCESS; // Must be initialised: the fail path tests it even when no MMAL call has run yet
++ MMAL_FOURCC_T enc_out = filter_enc_out(&p_filter->fmt_out.video);
++ const MMAL_FOURCC_T enc_in = filter_enc_in(&p_filter->fmt_in.video);
++ bool use_resizer;
++ bool use_isp;
++ int gpu_mem;
++
++ // At least in principle we should deal with any mmal format as input
++ if (enc_in == 0 || enc_out == 0)
++ return VLC_EGENERIC;
++
++ // Can't transform
++ if (p_filter->fmt_in.video.orientation != p_filter->fmt_out.video.orientation)
++ return VLC_EGENERIC;
++
++ use_resizer = var_InheritBool(p_filter, MMAL_RESIZE_NAME);
++ use_isp = var_InheritBool(p_filter, MMAL_ISP_NAME);
++
++retry: // Re-entered with use_isp or use_resizer forced on after a failed attempt (all state already torn down)
++ // ** Make more generic by checking supported encs
++ //
++ // Must use ISP - HVS can't do this, nor can resizer
++ if (enc_in == MMAL_ENCODING_YUVUV64_10) {
++ // If resizer selected then just give up
++ if (use_resizer)
++ return VLC_EGENERIC;
++ // otherwise downgrade HVS to ISP
++ use_isp = true;
++ }
++ // HVS can't do I420
++ if (enc_out == MMAL_ENCODING_I420) {
++ use_isp = true;
++ }
++ // Only HVS can deal with SAND30
++ if (enc_in == MMAL_ENCODING_YUV10_COL) {
++ if (use_isp || use_resizer)
++ return VLC_EGENERIC;
++ }
+
+- sys->output_format = format;
+
+- mmal_buffer_header_release(buffer);
++ if (use_resizer) {
++ // use resizer overrides use_isp
++ use_isp = false;
++ }
++
++ // Check we have a sliced version of the fourcc if we want the resizer
++ if (use_resizer &&
++ (enc_out = pic_to_slice_mmal_fourcc(enc_out)) == 0) {
++ return VLC_EGENERIC;
++ }
++
++ gpu_mem = hw_mmal_get_gpu_mem(); // Only used for the debug line below
++
++ {
++ char dbuf0[5], dbuf1[5], dbuf2[5], dbuf3[5];
++ msg_Dbg(p_filter, "%s: (%s) %s/%s,%dx%d [(%d,%d) %d/%d] sar:%d/%d->%s/%s,%dx%d [(%d,%d) %dx%d] rgb:%#x:%#x:%#x sar:%d/%d (gpu=%d)", __func__,
++ use_resizer ? "resize" : use_isp ? "isp" : "hvs",
++ str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma), str_fourcc(dbuf2, enc_in),
++ p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++ p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++ p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++ p_filter->fmt_in.video.i_sar_num, p_filter->fmt_in.video.i_sar_den,
++ str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma), str_fourcc(dbuf3, enc_out),
++ p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++ p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++ p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height,
++ p_filter->fmt_out.video.i_rmask, p_filter->fmt_out.video.i_gmask, p_filter->fmt_out.video.i_bmask,
++ p_filter->fmt_out.video.i_sar_num, p_filter->fmt_out.video.i_sar_den,
++ gpu_mem);
++ }
++
++ sys = calloc(1, sizeof(filter_sys_t));
++ if (!sys) {
++ ret = VLC_ENOMEM;
++ goto fail;
++ }
++ p_filter->p_sys = sys; // From here on CloseConverter can find and free the state
++
++ // Init stuff that we destroy unconditionally in Close first
++ vlc_mutex_init(&sys->lock);
++ vlc_sem_init(&sys->sem, 0);
++ sys->err_stream = MMAL_SUCCESS;
++ pic_fifo_init(&sys->ret_pics);
++ pic_fifo_init(&sys->slice.pics);
++
++ sys->needs_copy_in = !hw_mmal_chroma_is_mmal(p_filter->fmt_in.video.i_chroma); // Non-MMAL input must be copied into a CMA buffer
++ sys->in_port_cb_fn = conv_input_port_cb;
++
++ if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++ msg_Err(p_filter, "VCSM init failed");
++ goto fail;
++ }
++
++ if (use_resizer) {
++ sys->resizer_type = FILTER_RESIZER_RESIZER;
++ sys->is_sliced = true;
++ sys->component_name = MMAL_COMPONENT_DEFAULT_RESIZER;
++ sys->out_port_cb_fn = slice_output_port_cb;
++ }
++ else if (use_isp) {
++ sys->resizer_type = FILTER_RESIZER_ISP;
++ sys->is_sliced = false; // Copy directly into filter picture
++ sys->component_name = MMAL_COMPONENT_ISP_RESIZER;
++ sys->out_port_cb_fn = conv_output_port_cb;
+ } else {
+- mmal_buffer_header_release(buffer);
++ sys->resizer_type = FILTER_RESIZER_HVS;
++ sys->is_sliced = false; // Copy directly into filter picture
++ sys->component_name = MMAL_COMPONENT_HVS;
++ sys->out_port_cb_fn = conv_output_port_cb;
++ }
++ sys->is_cma = is_cma_buf_pic_chroma(p_filter->fmt_out.video.i_chroma);
++
++ status = mmal_component_create(sys->component_name, &sys->component);
++ if (status != MMAL_SUCCESS) {
++ if (!use_isp && !use_resizer) {
++ msg_Warn(p_filter, "Failed to create HVS resizer - retrying with ISP");
++ CloseConverter(obj);
++ use_isp = true;
++ goto retry;
++ }
++ msg_Err(p_filter, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
++ sys->component_name, status, mmal_status_to_string(status));
++ goto fail;
+ }
++ sys->output = sys->component->output[0];
++ sys->input = sys->component->input[0];
++
++ sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)p_filter;
++ status = mmal_port_enable(sys->component->control, conv_control_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(p_filter, "Failed to enable control port %s (status=%"PRIx32" %s)",
++ sys->component->control->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ if (sys->needs_copy_in &&
++ (sys->cma_in_pool = cma_buf_pool_new(2, 2, true, "conv-copy-in")) == NULL)
++ {
++ msg_Err(p_filter, "Failed to allocate input CMA pool");
++ goto fail;
++ }
++
++ sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)p_filter;
++ sys->input->format->type = MMAL_ES_TYPE_VIDEO;
++ sys->input->format->encoding = enc_in;
++ sys->input->format->encoding_variant = MMAL_ENCODING_I420;
++ hw_mmal_vlc_fmt_to_mmal_fmt(sys->input->format, &p_filter->fmt_in.video);
++ port_parameter_set_bool(sys->input, MMAL_PARAMETER_ZERO_COPY, 1);
++
++ mmal_log_dump_format(sys->input->format);
++
++ status = mmal_port_format_commit(sys->input);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(p_filter, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++ sys->input->buffer_size = sys->input->buffer_size_recommended;
++ sys->input->buffer_num = NUM_DECODER_BUFFER_HEADERS;
++
++ if ((status = conv_enable_in(p_filter, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ port_parameter_set_bool(sys->output, MMAL_PARAMETER_ZERO_COPY, sys->is_sliced || sys->is_cma);
++
++ status = mmal_component_enable(sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(p_filter, "Failed to enable component %s (status=%"PRIx32" %s)",
++ sys->component->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ if ((sys->in_pool = mmal_pool_create(sys->input->buffer_num, 0)) == NULL) // Headers only: payload size is 0
++ {
++ msg_Err(p_filter, "Failed to create input pool");
++ goto fail;
++ }
++
++ if (sys->resizer_type == FILTER_RESIZER_HVS)
++ {
++ unsigned int i;
++ for (i = 0; i != SUBS_MAX; ++i) { // Subpicture i uses component input port i + 1; port 0 is the main picture
++ if (hw_mmal_subpic_open(VLC_OBJECT(p_filter), sys->subs + i, sys->component->input[i + 1], -1, i + 1) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Failed to open subpic %d", i);
++ goto fail;
++ }
++ }
++ }
++
++ p_filter->pf_video_filter = conv_filter;
++ p_filter->pf_flush = conv_flush;
++ // video_drain NIF in filter structure
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: ok", __func__);
++#endif
++
++ return VLC_SUCCESS;
++
++fail:
++ CloseConverter(obj);
++ // State is fully torn down here; if MMAL reported out-of-memory, start over with the resizer
++ if (!use_resizer && status == MMAL_ENOMEM) {
++ use_resizer = true;
++ msg_Warn(p_filter, "Lack of memory to use HVS/ISP: trying resizer");
++ goto retry;
++ }
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: FAIL: %d", __func__, ret);
++#endif
++ return ret;
++}
++
++#if OPT_TO_FROM_ZC
++//----------------------------------------------------------------------------
++//
++// Simple copy in to ZC
++
++typedef struct to_zc_sys_s {
++ vcsm_init_type_t vcsm_init_type; // Result of cma_vcsm_init(); handed back to cma_vcsm_exit() on close
++ cma_buf_pool_t * cma_out_pool; // Pool that to_zc_filter allocates its output buffers from
++} to_zc_sys_t;
++
++
++static size_t buf_alloc_size(const vlc_fourcc_t i_chroma, const unsigned int width, const unsigned int height)
++{
++    // Bytes needed for one width x height frame of the given zero-copy chroma; 0 if unsupported
++    const unsigned int n_pixels = width * height;
++
++    // RGB32: 4 bytes per pixel
++    if (i_chroma == VLC_CODEC_MMAL_ZC_RGB32)
++        return n_pixels * 4;
++
++    // I420: 3 bytes for every 2 pixels
++    if (i_chroma == VLC_CODEC_MMAL_ZC_I420)
++        return n_pixels * 3 / 2;
++
++    return 0;
++}
++
++
++static picture_t *
++to_zc_filter(filter_t *p_filter, picture_t *in_pic)
++{
++ to_zc_sys_t * const sys = (to_zc_sys_t *)p_filter->p_sys;
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s", __func__);
++#endif
++ // Copies in_pic into a freshly allocated CMA-backed picture; in_pic is released on every path
++ assert(p_filter->fmt_out.video.i_chroma == VLC_CODEC_MMAL_ZC_I420);
++
++ picture_t * const out_pic = filter_NewPicture(p_filter);
++ if (out_pic == NULL)
++ goto fail0;
++ // Build an MMAL ES format describing the output; it is used both to tag the picture and to drive the copy
++ MMAL_ES_SPECIFIC_FORMAT_T mm_vfmt = {.video={0}};
++ MMAL_ES_FORMAT_T mm_esfmt = {
++ .encoding = vlc_to_mmal_video_fourcc(&p_filter->fmt_out.video),
++ .es = &mm_vfmt};
++
++ hw_mmal_vlc_fmt_to_mmal_fmt(&mm_esfmt, &p_filter->fmt_out.video);
++
++ const size_t buf_alloc = buf_alloc_size(p_filter->fmt_out.video.i_chroma,
++ mm_vfmt.video.width, mm_vfmt.video.height); // Sized from the MMAL dimensions filled in just above
++ if (buf_alloc == 0) // Chroma not handled by buf_alloc_size
++ goto fail1;
++ cma_buf_t *const cb = cma_buf_pool_alloc_buf(sys->cma_out_pool, buf_alloc);
++ if (cb == NULL)
++ goto fail1;
++
++ if (cma_buf_pic_attach(cb, out_pic) != VLC_SUCCESS)
++ goto fail2; // Attach failed, so the buffer is still ours to unref
++ cma_pic_set_data(out_pic, &mm_esfmt, NULL);
++
++ hw_mmal_copy_pic_to_buf(cma_buf_addr(cb), NULL, &mm_esfmt, in_pic); // NOTE(review): any return value is ignored - confirm the copy cannot fail
++
++ // Copy pic properties
++ out_pic->date = in_pic->date;
++ out_pic->b_force = in_pic->b_force;
++ out_pic->b_progressive = in_pic->b_progressive;
++ out_pic->b_top_field_first = in_pic->b_top_field_first;
++ out_pic->i_nb_fields = in_pic->i_nb_fields;
++
++ picture_Release(in_pic);
++
++ return out_pic;
++ // Error unwinding: each label releases what was acquired before the failing step
++fail2:
++ cma_buf_unref(cb);
++fail1:
++ picture_Release(out_pic);
++fail0:
++ picture_Release(in_pic);
++ return NULL;
++}
++
++static void to_zc_flush(filter_t * p_filter)
++{
++ VLC_UNUSED(p_filter); // Nothing to flush: to_zc_filter returns its result directly and queues no pictures
+ }
++
++static void CloseConverterToZc(vlc_object_t * obj)
++{
++    filter_t * const filt = (filter_t *)obj;
++    to_zc_sys_t * const zc = (to_zc_sys_t *)filt->p_sys;
++
++    // Tear down the to-ZC converter state; a NULL p_sys means there is nothing to release
++    if (zc != NULL)
++    {
++        filt->p_sys = NULL;
++
++        cma_buf_pool_deletez(&zc->cma_out_pool);
++        cma_vcsm_exit(zc->vcsm_init_type);
++        free(zc);
++    }
++}
++
++static bool to_zc_validate_fmt(const video_format_t * const f_in, const video_format_t * const f_out)
++{
++    // Accepts only I420 (or I420_10L) in -> MMAL_ZC_I420 out with identical width and height
++    const bool chroma_in_ok =
++        f_in->i_chroma == VLC_CODEC_I420 || f_in->i_chroma == VLC_CODEC_I420_10L;
++    const bool chroma_out_ok = f_out->i_chroma == VLC_CODEC_MMAL_ZC_I420;
++
++    if (!chroma_in_ok || !chroma_out_ok)
++        return false;
++
++    // Dimensions must match exactly
++    const bool same_size = f_in->i_height == f_out->i_height &&
++                           f_in->i_width == f_out->i_width;
++    return same_size;
++}
++
++static int OpenConverterToZc(vlc_object_t * obj)
++{
++ int ret = VLC_EGENERIC;
++ filter_t * const p_filter = (filter_t *)obj;
++ // Opens the copy-into-zero-copy converter; only format pairs accepted by to_zc_validate_fmt are handled
++ if (!to_zc_validate_fmt(&p_filter->fmt_in.video, &p_filter->fmt_out.video))
++ goto fail;
++
++ {
++ char dbuf0[5], dbuf1[5];
++ msg_Dbg(p_filter, "%s: %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d->%s,%dx%d [(%d,%d) %dx%d] rgb:%#x:%#x:%#x sar:%d/%d", __func__,
++ str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma),
++ p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++ p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++ p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++ p_filter->fmt_in.video.i_sar_num, p_filter->fmt_in.video.i_sar_den,
++ str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma),
++ p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++ p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++ p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height,
++ p_filter->fmt_out.video.i_rmask, p_filter->fmt_out.video.i_gmask, p_filter->fmt_out.video.i_bmask,
++ p_filter->fmt_out.video.i_sar_num, p_filter->fmt_out.video.i_sar_den);
++ }
++
++ to_zc_sys_t * const sys = calloc(1, sizeof(*sys));
++ if (!sys) {
++ ret = VLC_ENOMEM;
++ goto fail;
++ }
++ p_filter->p_sys = (filter_sys_t *)sys; // Attach before further setup so the fail path can free it
++
++ if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++ msg_Err(p_filter, "VCSM init failed");
++ goto fail;
++ }
++
++ if ((sys->cma_out_pool = cma_buf_pool_new(5, 5, true, "conv-to-zc")) == NULL)
++ {
++ msg_Err(p_filter, "Failed to allocate output CMA pool");
++ goto fail;
++ }
++
++ p_filter->pf_video_filter = to_zc_filter;
++ p_filter->pf_flush = to_zc_flush;
++ return VLC_SUCCESS;
++
++fail:
++ CloseConverterToZc(obj); // Releases whatever was set up; does nothing if p_sys is NULL
++ return ret;
++}
++
++//----------------------------------------------------------------------------
++//
++// Simple "copy" from ZC
++
++static void CloseConverterFromZc(vlc_object_t * obj)
++{
++ VLC_UNUSED(obj); // Nothing to release: OpenConverterFromZc never succeeds, so no state is ever created
++}
++
++static int OpenConverterFromZc(vlc_object_t * obj)
++{
++ return VLC_EGENERIC; // Placeholder: always reports failure and never touches obj
++}
++#endif
++//----------------------------------------------------------------------------
++
++typedef struct blend_sys_s {
++ vzc_pool_ctl_t * vzc; // Subpicture buffer pool: created by hw_mmal_vzc_pool_new(), released in CloseBlendMmal
++ const picture_t * last_dst; // Not a ref, just a hint that we have a new pic
++ vcsm_init_type_t vcsm_init_type; // Result of cma_vcsm_init(); handed back to cma_vcsm_exit() on close
++} blend_sys_t;
++
++static void FilterBlendMmal(filter_t *p_filter,
++                            picture_t *dst, const picture_t * src,
++                            int x_offset, int y_offset, int alpha)
++{
++    blend_sys_t * const bsys = (blend_sys_t *)p_filter->p_sys;
++#if TRACE_ALL
++    msg_Dbg(p_filter, "%s (%d,%d:%d) pic=%p, pts=%lld, force=%d", __func__, x_offset, y_offset, alpha, src, src->date, src->b_force);
++#endif
++    // A fully transparent or zero-sized subpicture contributes nothing
++    if (alpha == 0)
++        return;
++    if (src->format.i_visible_height == 0 || src->format.i_visible_width == 0)
++        return;
++
++    // Without a picture context there is nowhere to attach the sub-buffer
++    if (dst->context == NULL)
++    {
++        msg_Err(p_filter, "MMAL pic missing context");
++        return;
++    }
++
++    // True when dst differs from the last blended picture or carries no sub-buffers yet
++    const bool fresh_dst = dst != bsys->last_dst || !hw_mmal_pic_has_sub_bufs(dst);
++
++    // cast away src const so we can ref it
++    MMAL_BUFFER_HEADER_T * const sub_buf =
++        hw_mmal_vzc_buf_from_pic(bsys->vzc, (picture_t *)src, vis_mmal_rect(&dst->format),
++                                 x_offset, y_offset, alpha, fresh_dst);
++    if (sub_buf == NULL) {
++        msg_Err(p_filter, "Failed to allocate vzc buffer for subpic");
++        return;
++    }
++
++    hw_mmal_pic_sub_buf_add(dst, sub_buf);
++    bsys->last_dst = dst;
++}
++
++static void FlushBlendMmal(filter_t * p_filter)
++{
++    // Drop the last-destination hint, then flush the subpicture buffer pool
++    ((blend_sys_t *)p_filter->p_sys)->last_dst = NULL;
++    hw_mmal_vzc_pool_flush(((blend_sys_t *)p_filter->p_sys)->vzc);
++}
++
++static void CloseBlendMmal(vlc_object_t *object)
++{
++    filter_t * const filt = (filter_t *)object;
++    blend_sys_t * const bsys = (blend_sys_t *)filt->p_sys;
++
++    // Release the blend state; a NULL p_sys means nothing was set up
++    if (bsys == NULL)
++        return;
++    filt->p_sys = NULL;
++    hw_mmal_vzc_pool_release(bsys->vzc);
++    cma_vcsm_exit(bsys->vcsm_init_type);
++    free(bsys);
++}
++
++static int OpenBlendMmal(vlc_object_t *object)
++{
++ filter_t * const p_filter = (filter_t *)object;
++ const vlc_fourcc_t vfcc_dst = p_filter->fmt_out.video.i_chroma;
++ // Opens the MMAL blender: destination must be an MMAL chroma and the source a subpicture format the vzc pool accepts
++ if (!hw_mmal_chroma_is_mmal(vfcc_dst) ||
++ !hw_mmal_vzc_subpic_fmt_valid(&p_filter->fmt_in.video))
++ {
++ return VLC_EGENERIC;
++ }
++
++ {
++ char dbuf0[5], dbuf1[5];
++ msg_Dbg(p_filter, "%s: (%s) %s,%dx%d [(%d,%d) %dx%d]->%s,%dx%d [(%d,%d) %dx%d]", __func__,
++ "blend",
++ str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma), p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++ p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++ p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++ str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma), p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++ p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++ p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height);
++ }
++
++ {
++ blend_sys_t * const sys = calloc(1, sizeof (*sys));
++ if (sys == NULL)
++ return VLC_ENOMEM;
++
++ p_filter->p_sys = (filter_sys_t *)sys; // Attach first so CloseBlendMmal can free it on the fail path
++
++ if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++ msg_Err(p_filter, "VCSM init failed");
++ goto fail;
++ }
++
++ if ((sys->vzc = hw_mmal_vzc_pool_new()) == NULL)
++ goto fail;
++ }
++
++ p_filter->pf_video_blend = FilterBlendMmal;
++ p_filter->pf_flush = FlushBlendMmal;
++
++ return VLC_SUCCESS;
++
++fail:
++ CloseBlendMmal(VLC_OBJECT(p_filter));
++ return VLC_ENOMEM; // NOTE(review): also returned when VCSM init fails, where VLC_EGENERIC may describe it better
++}
++
++// ---------------------------------------------------------------------------
++
++static void FilterBlendNeon(filter_t *p_filter,
++ picture_t *dst_pic, const picture_t * src_pic,
++ int x_offset, int y_offset, int alpha)
++{
++ const uint8_t * s_data;
++ uint8_t * d_data;
++ int width = src_pic->format.i_visible_width;
++ int height = src_pic->format.i_visible_height;
++ blend_neon_fn *const blend_fn = (blend_neon_fn * )p_filter->p_sys; // p_sys holds the row blend function chosen by OpenBlendNeon
++ // Blends the visible area of src_pic onto dst_pic at (x_offset, y_offset), one row at a time, using plane 0 only
++#if TRACE_ALL
++ msg_Dbg(p_filter, "%s (%d,%d:%d) pic=%p, pts=%lld, force=%d", __func__, x_offset, y_offset, alpha, src_pic, src_pic->date, src_pic->b_force);
++#endif
++ // Nothing to do for a fully transparent or zero-sized source
++ if (alpha == 0 ||
++ src_pic->format.i_visible_height == 0 ||
++ src_pic->format.i_visible_width == 0)
++ {
++ return;
++ }
++ // Offsets arrive relative to the destination's visible area; make them relative to the plane origin
++ x_offset += dst_pic->format.i_x_offset;
++ y_offset += dst_pic->format.i_y_offset;
++
++ // Deal with R/B overrun
++ if (x_offset + width >= (int)(dst_pic->format.i_x_offset + dst_pic->format.i_visible_width))
++ width = dst_pic->format.i_x_offset + dst_pic->format.i_visible_width - x_offset;
++ if (y_offset + height >= (int)(dst_pic->format.i_y_offset + dst_pic->format.i_visible_height))
++ height = dst_pic->format.i_y_offset + dst_pic->format.i_visible_height - y_offset;
++
++ if (width <= 0 || height <= 0) { // Source lies entirely right of / below the destination's visible area
++ return;
++ }
++
++ // *** L/U overrun: negative offsets (source starting left of / above the destination) are not clipped here
++
++ s_data = src_pic->p[0].p_pixels +
++ src_pic->p[0].i_pixel_pitch * src_pic->format.i_x_offset +
++ src_pic->p[0].i_pitch * src_pic->format.i_y_offset;
++ d_data = dst_pic->p[0].p_pixels +
++ dst_pic->p[0].i_pixel_pitch * x_offset +
++ dst_pic->p[0].i_pitch * y_offset;
++
++ // height > 0 is guaranteed by the check above, so the do/while never runs on an empty range
++ do {
++ blend_fn(d_data, s_data, alpha, width);
++ s_data += src_pic->p[0].i_pitch;
++ d_data += dst_pic->p[0].i_pitch;
++ } while (--height > 0);
++}
++
++static void CloseBlendNeon(vlc_object_t *object)
++{
++ VLC_UNUSED(object); // Nothing to free: OpenBlendNeon stores only a function pointer in p_sys
++}
++
++static int OpenBlendNeon(vlc_object_t *object)
++{
++ filter_t * const p_filter = (filter_t *)object;
++ const vlc_fourcc_t vfcc_dst = p_filter->fmt_out.video.i_chroma;
++ MMAL_FOURCC_T mfcc_src = vlc_to_mmal_video_fourcc(&p_filter->fmt_in.video);
++ MMAL_FOURCC_T mfcc_dst = vlc_to_mmal_video_fourcc(&p_filter->fmt_out.video);
++ blend_neon_fn * blend_fn = (blend_neon_fn *)0; // Stays null unless a supported src/dst pairing is found below
++
++ // Non-alpha RGB only for dest
++ if (vfcc_dst != VLC_CODEC_RGB32)
++ return VLC_EGENERIC;
++
++ // Check we have appropriate blend fn (mmal doesn't have a non-alpha RGB32)
++ switch (mfcc_src) {
++ case MMAL_ENCODING_RGBA:
++ if (mfcc_dst == MMAL_ENCODING_RGBA)
++ blend_fn = blend_rgbx_rgba_neon; // Same channel order on both sides
++ else if (mfcc_dst == MMAL_ENCODING_BGRA)
++ blend_fn = blend_bgrx_rgba_neon; // Red/blue swapped between source and destination
++ break;
++
++ case MMAL_ENCODING_BGRA:
++ if (mfcc_dst == MMAL_ENCODING_BGRA)
++ blend_fn = blend_rgbx_rgba_neon; // Same channel order again, so the non-swapping routine is reused
++ else if (mfcc_dst == MMAL_ENCODING_RGBA)
++ blend_fn = blend_bgrx_rgba_neon;
++ break;
++
++ default:
++ break;
++ }
++
++ if (blend_fn == (blend_neon_fn *)0)
++ {
++ return VLC_EGENERIC;
++ }
++ // The chosen function pointer is the whole filter state; FilterBlendNeon reads it back from p_sys
++ p_filter->p_sys = (void *)blend_fn;
++ p_filter->pf_video_blend = FilterBlendNeon;
++
++ {
++ char dbuf0[5], dbuf1[5];
++ char dbuf0a[5], dbuf1a[5];
++ msg_Dbg(p_filter, "%s: (%s) %s/%s,%dx%d [(%d,%d) %dx%d]->%s/%s,%dx%d [(%d,%d) %dx%d]", __func__,
++ "blend",
++ str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma),
++ str_fourcc(dbuf0a, mfcc_src),
++ p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++ p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++ p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++ str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma),
++ str_fourcc(dbuf1a, mfcc_dst),
++ p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++ p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++ p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height);
++ }
++
++ return VLC_SUCCESS;
++}
++
++vlc_module_begin() // Module descriptor: the main entry is the MMAL decoder, followed by converter and blend submodules
++ set_category( CAT_INPUT )
++ set_subcategory( SUBCAT_INPUT_VCODEC )
++ set_shortname(N_("MMAL decoder"))
++ set_description(N_("MMAL-based decoder plugin for Raspberry Pi"))
++ set_capability("video decoder", 90)
++ add_shortcut("mmal_decoder")
++ add_bool(MMAL_OPAQUE_NAME, true, MMAL_OPAQUE_TEXT, MMAL_OPAQUE_LONGTEXT, false)
++ set_callbacks(OpenDecoder, CloseDecoder)
++
++ add_submodule() // Resizer / ISP / HVS converter (OpenConverter)
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_shortname(N_("MMAL resizer"))
++ set_description(N_("MMAL resizing conversion filter"))
++ add_shortcut("mmal_converter")
++ set_capability( "video converter", 900 )
++ add_bool(MMAL_RESIZE_NAME, /* default */ false, MMAL_RESIZE_TEXT, MMAL_RESIZE_LONGTEXT, /* advanced option */ false)
++ add_bool(MMAL_ISP_NAME, /* default */ false, MMAL_ISP_TEXT, MMAL_ISP_LONGTEXT, /* advanced option */ false)
++ set_callbacks(OpenConverter, CloseConverter)
++
++#if OPT_TO_FROM_ZC
++ add_submodule() // Copy into zero-copy buffers (OpenConverterToZc); only built when OPT_TO_FROM_ZC is set
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_shortname(N_("MMAL to ZC"))
++ set_description(N_("MMAL conversion to ZC filter"))
++ add_shortcut("mmal_to_zc")
++ set_capability( "video converter", 901 )
++ set_callbacks(OpenConverterToZc, CloseConverterToZc)
++
++ add_submodule() // Copy out of zero-copy buffers; OpenConverterFromZc currently always fails
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_shortname(N_("MMAL from ZC"))
++ set_description(N_("MMAL conversion from ZC filter"))
++ add_shortcut("mmal_from_zc")
++ set_capability( "video converter", 902 )
++ set_callbacks(OpenConverterFromZc, CloseConverterFromZc)
++#endif
++
++ add_submodule() // Subpicture blending onto MMAL pictures (OpenBlendMmal)
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_description(N_("Video pictures blending for MMAL"))
++ add_shortcut("mmal_blend")
++ set_capability("video blending", 120)
++ set_callbacks(OpenBlendMmal, CloseBlendMmal)
++
++ add_submodule() // Software RGBA/BGRA blending onto RGB32 pictures using NEON routines (OpenBlendNeon)
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_description(N_("Video pictures blending for neon"))
++ add_shortcut("neon_blend")
++ set_capability("video blending", 110)
++ set_callbacks(OpenBlendNeon, CloseBlendNeon)
++
++vlc_module_end()
++
++
+--- /dev/null
++++ b/modules/hw/mmal/converter_mmal.c
+@@ -0,0 +1,479 @@
++#ifdef HAVE_CONFIG_H
++# include "config.h"
++#endif
++
++#include <unistd.h>
++#include <fcntl.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++
++#include <interface/vcsm/user-vcsm.h>
++
++#include <vlc_common.h>
++#include <vlc_picture.h>
++
++#include <libdrm/drm_fourcc.h>
++#include <EGL/egl.h>
++#include <EGL/eglext.h>
++#include <GLES2/gl2.h>
++#include <GLES2/gl2ext.h>
++
++#include "mmal_cma.h"
++
++#include "../../video_output/opengl/converter.h"
++
++#include "mmal_picture.h"
++
++#include <assert.h>
++
++#define TRACE_ALL 0
++
++typedef struct mmal_gl_converter_s
++{
++ EGLint drm_fourcc; // Passed as EGL_LINUX_DRM_FOURCC_EXT when a buffer is imported in get_tex_context
++ vcsm_init_type_t vcsm_init_type; // Handed to cma_vcsm_exit() in CloseGLConverter
++ cma_buf_t * last_cb; // Reference held on the buffer most recently passed through tc_mmal_update
++ // Extension entry point used to bind an EGLImage to the external texture
++ PFNGLEGLIMAGETARGETTEXTURE2DOESPROC glEGLImageTargetTexture2DOES;
++} mmal_gl_converter_t;
++
++
++static EGLint vlc_to_gl_fourcc(const video_format_t * const fmt)
++{
++ // Converting to mmal selects the right RGB32 variant
++ switch(vlc_to_mmal_video_fourcc(fmt)) // Returns a four-character code per MMAL encoding, or 0 if there is no mapping
++ {
++ case MMAL_ENCODING_I420:
++ return MMAL_FOURCC('Y','U','1','2');
++ case MMAL_ENCODING_YV12:
++ return MMAL_FOURCC('Y','V','1','2');
++ case MMAL_ENCODING_I422:
++ return MMAL_FOURCC('Y','U','1','6');
++// case MMAL_ENCODING_YUVUV128: // Doesn't actually work yet
++ case MMAL_ENCODING_NV12:
++ return MMAL_FOURCC('N','V','1','2');
++ case MMAL_ENCODING_NV21:
++ return MMAL_FOURCC('N','V','2','1');
++ case MMAL_ENCODING_RGB16:
++ return MMAL_FOURCC('R','G','1','6');
++ case MMAL_ENCODING_RGB24: // NOTE(review): RGB24 <-> 'BG24' looks deliberately swapped (presumably DRM names channels in the opposite order) - confirm
++ return MMAL_FOURCC('B','G','2','4');
++ case MMAL_ENCODING_BGR24:
++ return MMAL_FOURCC('R','G','2','4');
++ case MMAL_ENCODING_BGR32: // Alpha and non-alpha variants share one 'X' (no-alpha) code
++ case MMAL_ENCODING_BGRA:
++ return MMAL_FOURCC('X','R','2','4');
++ case MMAL_ENCODING_RGB32:
++ case MMAL_ENCODING_RGBA:
++ return MMAL_FOURCC('X','B','2','4');
++ default:
++ break;
++ }
++ return 0;
++}
++
++typedef struct tex_context_s {
++ picture_context_t cmn; // Must stay first: tex_context_destroy casts a picture_context_t * back to tex_context_t *
++ GLuint texture; // GL texture name; 0 until GenTextures has run in get_tex_context
++ // Function pointer copied from the converter's GL vtable at creation time
++ PFNGLDELETETEXTURESPROC DeleteTextures; // Copy fn pointer so we don't need tc on delete
++} tex_context_t;
++
++static void tex_context_delete(tex_context_t * const tex)
++{
++ tex->DeleteTextures(1, &tex->texture); // Also reached with texture == 0 from get_tex_context's fail path
++ free(tex); // tex was malloc'ed in get_tex_context
++}
++
++static void tex_context_destroy(picture_context_t * pic_ctx)
++{
++ tex_context_delete((tex_context_t *)pic_ctx); // Valid because cmn is the first member of tex_context_t
++}
++
++static picture_context_t * tex_context_copy(picture_context_t * pic_ctx)
++{
++ return pic_ctx; // NOTE(review): returns the same object with no ref count, so destroying both "copies" would free it twice - confirm copy is never used that way
++}
++
++static tex_context_t * get_tex_context(const opengl_tex_converter_t * const tc, picture_t * const pic, cma_buf_t * const cb)
++{
++ mmal_gl_converter_t * const sys = tc->priv;
++ tex_context_t * tex = (tex_context_t *)cma_buf_context2(cb); // Reuse the texture already attached to this buffer, if any
++ if (tex != NULL)
++ return tex;
++ // No cached texture: import the buffer's dma-buf fd as an EGLImage and bind it to a new external texture
++ if ((tex = malloc(sizeof(*tex))) == NULL)
++ return NULL;
++
++ *tex = (tex_context_t){
++ .cmn = {
++ .destroy = tex_context_destroy,
++ .copy = tex_context_copy
++ },
++ .texture = 0,
++ .DeleteTextures = tc->vt->DeleteTextures
++ };
++
++ {
++ EGLint attribs[30]; // Worst case written below: 6 header values + 2 planes * 10 (SAND8) or 3 planes * 6, plus EGL_NONE
++ EGLint * a = attribs;
++ const int fd = cma_buf_fd(cb);
++ uint8_t * base_addr = cma_buf_addr(cb);
++
++ if (pic->i_planes >= 4 || pic->i_planes <= 0)
++ {
++ msg_Err(tc, "%s: Bad planes: %d", __func__, pic->i_planes);
++ goto fail;
++ }
++
++ *a++ = EGL_WIDTH;
++ *a++ = pic->format.i_visible_width;
++ *a++ = EGL_HEIGHT;
++ *a++ = pic->format.i_visible_height;
++ *a++ = EGL_LINUX_DRM_FOURCC_EXT;
++ *a++ = sys->drm_fourcc;
++
++ if (pic->format.i_chroma == VLC_CODEC_MMAL_ZC_SAND8)
++ {
++ // Sand is its own very special bunny :-(
++ static const EGLint attnames[] = { // NOTE(review): names only planes 0-1; assumes SAND8 pictures never have 3 planes - confirm
++ EGL_DMA_BUF_PLANE0_FD_EXT,
++ EGL_DMA_BUF_PLANE0_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE0_PITCH_EXT,
++ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
++ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
++ EGL_DMA_BUF_PLANE1_FD_EXT,
++ EGL_DMA_BUF_PLANE1_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE1_PITCH_EXT,
++ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
++ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT
++ };
++
++ const EGLint * n = attnames;
++
++ for (int i = 0; i < pic->i_planes; ++i)
++ {
++ const uint64_t mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(pic->p[i].i_pitch >> 7); // Modifier is built from i_pitch / 128
++
++ *a++ = *n++;
++ *a++ = fd; // Every plane lives in the same dma-buf
++ *a++ = *n++;
++ *a++ = pic->p[i].p_pixels - base_addr; // Plane offset from the start of the buffer
++ *a++ = *n++;
++ *a++ = pic->format.i_width; // For sand the PITCH attribute carries the picture width, not i_pitch
++ *a++ = *n++;
++ *a++ = (EGLint)(mod >> 32);
++ *a++ = *n++;
++ *a++ = (EGLint)(mod & 0xffffffff);
++ }
++ }
++ else
++ {
++ static const EGLint attnames[] = { // fd / offset / pitch attribute names for up to four planes
++ EGL_DMA_BUF_PLANE0_FD_EXT,
++ EGL_DMA_BUF_PLANE0_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE0_PITCH_EXT,
++ EGL_DMA_BUF_PLANE1_FD_EXT,
++ EGL_DMA_BUF_PLANE1_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE1_PITCH_EXT,
++ EGL_DMA_BUF_PLANE2_FD_EXT,
++ EGL_DMA_BUF_PLANE2_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE2_PITCH_EXT,
++ EGL_DMA_BUF_PLANE3_FD_EXT,
++ EGL_DMA_BUF_PLANE3_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE3_PITCH_EXT
++ };
++
++ const EGLint * n = attnames;
++
++ for (int i = 0; i < pic->i_planes; ++i)
++ {
++ *a++ = *n++;
++ *a++ = fd;
++ *a++ = *n++;
++ *a++ = pic->p[i].p_pixels - base_addr;
++ *a++ = *n++;
++ *a++ = pic->p[i].i_pitch;
++ }
++ }
++
++ *a = EGL_NONE; // Attribute list terminator
++
++ const EGLImage image = tc->gl->egl.createImageKHR(tc->gl, EGL_LINUX_DMA_BUF_EXT, NULL, attribs);
++ if (!image) {
++ msg_Err(tc, "Failed to import fd %d: Err=%#x", fd, tc->vt->GetError());
++ goto fail;
++ }
++
++ // ** ?? tc->tex_target
++ tc->vt->GenTextures(1, &tex->texture);
++ tc->vt->BindTexture(GL_TEXTURE_EXTERNAL_OES, tex->texture);
++ tc->vt->TexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
++ tc->vt->TexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
++ sys->glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
++ // The image handle is dropped as soon as it has been bound to the texture
++ tc->gl->egl.destroyImageKHR(tc->gl, image);
++ }
++ // Cache the texture on the buffer so later calls for the same buffer return early above
++ if (cma_buf_add_context2(cb, &tex->cmn) != VLC_SUCCESS)
++ {
++ msg_Err(tc, "%s: add_context2 failed", __func__);
++ goto fail;
++ }
++ return tex;
++
++fail:
++ tex_context_delete(tex); // Deletes the texture name (possibly still 0) and frees tex
++ return NULL;
++}
++
++
++static int
++tc_mmal_update(const opengl_tex_converter_t *tc, GLuint *textures,
++ const GLsizei *tex_width, const GLsizei *tex_height,
++ picture_t *pic, const size_t *plane_offset)
++{
++ mmal_gl_converter_t * const sys = tc->priv;
++#if TRACE_ALL
++ {
++ char cbuf[5];
++ msg_Dbg(tc, "%s: %s %d*%dx%d : %d*%dx%d", __func__,
++ str_fourcc(cbuf, pic->format.i_chroma),
++ tc->tex_count, tex_width[0], tex_height[0], pic->i_planes, pic->p[0].i_pitch, pic->p[0].i_lines);
++ }
++#endif
++ VLC_UNUSED(tex_width);
++ VLC_UNUSED(tex_height);
++ VLC_UNUSED(plane_offset);
++ // Supplies the texture for pic: no pixel upload, textures[0] is set to the texture bound to the picture's CMA buffer
++ if (!is_cma_buf_pic_chroma(pic->format.i_chroma))
++ {
++ char cbuf[5];
++ msg_Err(tc, "Pic with unexpected chroma: %s", str_fourcc(cbuf, pic->format.i_chroma));
++ return VLC_EGENERIC;
++ }
++
++ cma_buf_t * const cb = cma_buf_pic_get(pic);
++ if (cb == NULL)
++ {
++ msg_Err(tc, "Pic missing cma buf");
++ return VLC_EGENERIC;
++ }
++
++ tex_context_t * const tex = get_tex_context(tc, pic, cb); // Creates the texture on first use of this buffer, cached afterwards
++ if (tex == NULL)
++ return VLC_EGENERIC;
++
++// tc->vt->BindTexture(GL_TEXTURE_EXTERNAL_OES, tex->texture);
++ // Keep a reference on the buffer now in use and drop the previous one
++ cma_buf_unref(sys->last_cb); // NOTE(review): last_cb may be NULL on the first call - presumably cma_buf_unref tolerates that; confirm
++ sys->last_cb = cma_buf_ref(cb);
++
++ textures[0] = tex->texture;
++ return VLC_SUCCESS;
++}
++
++static int
++tc_mmal_fetch_locations(opengl_tex_converter_t *tc, GLuint program)
++{
++    // Record where the "Texture0" sampler uniform lives in program.
++    // A location of -1 is reported to the caller as VLC_EGENERIC.
++    tc->uloc.Texture[0] = tc->vt->GetUniformLocation(program, "Texture0");
++
++    if (tc->uloc.Texture[0] == -1)
++        return VLC_EGENERIC;
++    return VLC_SUCCESS;
++}
++
++static void
++tc_mmal_prepare_shader(const opengl_tex_converter_t *tc,
++                       const GLsizei *tex_width, const GLsizei *tex_height,
++                       float alpha)
++{
++    // Intentionally a no-op: nothing is uploaded to the shader here, so
++    // every argument is unused.
++    (void) tc;
++    (void) tex_width;
++    (void) tex_height;
++    (void) alpha;
++}
++
++// Configure tc for a single external texture and build the fragment shader
++// that samples it.
++//   tc          converter to set up (tex_count, tex_target, texs[0] and the
++//               fetch_locations / prepare_shader hooks are written)
++//   tex_target  GL texture target stored in tc->tex_target
++//   chroma      stored as the external format in tc->texs[0]
++//   yuv_space   unused
++// Returns the shader object name, or 0 if the GL could not create one.
++static GLuint
++tc_fragment_shader_init(opengl_tex_converter_t * const tc, const GLenum tex_target,
++                        const vlc_fourcc_t chroma, const video_color_space_t yuv_space)
++{
++    // Plain pass-through of the external image; constant, so no need to
++    // rebuild it on the stack for every call
++    static const char fs[] =
++        "#extension GL_OES_EGL_image_external : enable\n"
++        "precision mediump float;\n"
++        "uniform samplerExternalOES Texture0;\n"
++        "varying vec2 TexCoord0;\n"
++        "void main() {\n"
++        " gl_FragColor = texture2D(Texture0, TexCoord0);\n"
++        "}\n";
++    const char *code = fs;
++    GLuint fragment_shader;
++
++    VLC_UNUSED(yuv_space);
++
++    tc->tex_count = 1;
++    tc->tex_target = tex_target;
++    tc->texs[0] = (struct opengl_tex_cfg) {
++        { 1, 1 }, { 1, 1 }, GL_RGB, chroma, GL_UNSIGNED_SHORT //** ??
++    };
++
++    tc->pf_fetch_locations = tc_mmal_fetch_locations;
++    tc->pf_prepare_shader = tc_mmal_prepare_shader;
++
++    fragment_shader = tc->vt->CreateShader(GL_FRAGMENT_SHADER);
++    if (fragment_shader == 0)
++        return 0;   // Nothing to feed source to - let the caller see the failure
++
++    tc->vt->ShaderSource(fragment_shader, 1, &code, NULL);
++    tc->vt->CompileShader(fragment_shader);
++    return fragment_shader;
++}
++
++
++// Release everything held in the converter's private state.
++// Safe to call when no private state was ever allocated (tc->priv == NULL),
++// and - because tc->priv is cleared - safe to call more than once.
++// Also used by OpenGLConverter as its failure cleanup.
++static void
++CloseGLConverter(vlc_object_t *obj)
++{
++    opengl_tex_converter_t * const tc = (opengl_tex_converter_t *)obj;
++    mmal_gl_converter_t * const sys = tc->priv;
++
++    if (sys == NULL)
++        return;
++
++    // Detach before freeing so nothing can reach the freed block via tc
++    tc->priv = NULL;
++
++    cma_buf_unref(sys->last_cb);
++    cma_vcsm_exit(sys->vcsm_init_type);
++    free(sys);
++}
++
++
++// Map an input chroma to the chroma we will convert it to.
++// I420 is preferred (smallest) wherever it can be produced.
++// Returns 0 for any input chroma that is not handled.
++static vlc_fourcc_t chroma_in_out(const vlc_fourcc_t chroma_in)
++{
++    if (chroma_in == VLC_CODEC_MMAL_OPAQUE ||
++        chroma_in == VLC_CODEC_MMAL_ZC_I420 ||
++        chroma_in == VLC_CODEC_MMAL_ZC_SAND8 ||
++        chroma_in == VLC_CODEC_MMAL_ZC_SAND10) // ISP only
++    {
++        return VLC_CODEC_MMAL_ZC_I420;
++    }
++
++    if (chroma_in == VLC_CODEC_MMAL_ZC_SAND30 || // HVS only
++        chroma_in == VLC_CODEC_MMAL_ZC_RGB32)
++    {
++        // HVS can't generate YUV of any sort
++        return VLC_CODEC_MMAL_ZC_RGB32;
++    }
++
++    return 0;
++}
++
++
++// Module open callback for the MMAL GL converter.
++// Checks that the input chroma is one chroma_in_out() knows, that the GL
++// provider is EGL with create/destroyImageKHR available, then allocates the
++// private state, initialises VCSM, binds glEGLImageTargetTexture2DOES and
++// builds the fragment shader.
++// Returns VLC_SUCCESS, VLC_ENOMEM if the private state cannot be allocated,
++// or VLC_EGENERIC for every other failure.
++static int
++OpenGLConverter(vlc_object_t *obj)
++{
++ opengl_tex_converter_t * const tc = (opengl_tex_converter_t *)obj;
++ int rv = VLC_EGENERIC;
++ const EGLint eglfmt = vlc_to_gl_fourcc(&tc->fmt);
++ const vlc_fourcc_t chroma_out = chroma_in_out(tc->fmt.i_chroma);
++
++ // Do we know what to do with this?
++ if (chroma_out == 0)
++ return rv;
++
++ {
++ char dbuf0[5], dbuf1[5], dbuf2[5];
++ msg_Dbg(tc, "<<< %s: V:%s/E:%s,%dx%d [(%d,%d) %d/%d] sar:%d/%d -> %s", __func__,
++ str_fourcc(dbuf0, tc->fmt.i_chroma),
++ str_fourcc(dbuf1, eglfmt),
++ tc->fmt.i_width, tc->fmt.i_height,
++ tc->fmt.i_x_offset, tc->fmt.i_y_offset,
++ tc->fmt.i_visible_width, tc->fmt.i_visible_height,
++ tc->fmt.i_sar_num, tc->fmt.i_sar_den,
++ str_fourcc(dbuf2, chroma_out));
++ }
++
++ // Nothing has been allocated yet so these early exits can return directly
++ if (tc->gl->ext != VLC_GL_EXT_EGL ||
++ !tc->gl->egl.createImageKHR || !tc->gl->egl.destroyImageKHR)
++ {
++ // Missing an important callback
++ msg_Dbg(tc, "Missing EGL xxxImageKHR calls");
++ return rv;
++ }
++
++ // From here on failures go via fail: so that CloseGLConverter tidies up
++ if ((tc->priv = calloc(1, sizeof(mmal_gl_converter_t))) == NULL)
++ {
++ msg_Err(tc, "priv alloc failure");
++ rv = VLC_ENOMEM;
++ goto fail;
++ }
++ mmal_gl_converter_t * const sys = tc->priv;
++
++ sys->drm_fourcc = eglfmt;
++
++ // Only the CMA flavour of VCSM is acceptable here; the type actually
++ // returned is kept in sys so the fail path can exit it
++ if ((sys->vcsm_init_type = cma_vcsm_init()) != VCSM_INIT_CMA) {
++ msg_Dbg(tc, "VCSM init failed");
++ goto fail;
++ }
++
++ if ((sys->glEGLImageTargetTexture2DOES = vlc_gl_GetProcAddress(tc->gl, "glEGLImageTargetTexture2DOES")) == NULL)
++ {
++ msg_Err(tc, "Failed to bind GL fns");
++ goto fail;
++ }
++
++ // eglfmt == 0: vlc_to_gl_fourcc gave no value for the input format, so
++ // the shader is set up for RGB32/sRGB instead of the input chroma/space
++ if ((tc->fshader = tc_fragment_shader_init(tc, GL_TEXTURE_EXTERNAL_OES,
++ eglfmt == 0 ? VLC_CODEC_RGB32 : tc->fmt.i_chroma,
++ eglfmt == 0 ? COLOR_SPACE_SRGB : tc->fmt.space)) == 0)
++ {
++ msg_Err(tc, "Failed to make shader");
++ goto fail;
++ }
++
++ // In the same case rewrite tc->fmt to the chroma picked by chroma_in_out()
++ // and recompute the DRM fourcc from the rewritten format
++ if (eglfmt == 0)
++ {
++ tc->fmt.i_chroma = chroma_out;
++ tc->fmt.i_bits_per_pixel = 8;
++ if (tc->fmt.i_chroma == VLC_CODEC_MMAL_ZC_RGB32)
++ {
++ tc->fmt.i_rmask = 0xff0000;
++ tc->fmt.i_gmask = 0xff00;
++ tc->fmt.i_bmask = 0xff;
++ tc->fmt.space = COLOR_SPACE_SRGB;
++ }
++ else
++ {
++ tc->fmt.i_rmask = 0;
++ tc->fmt.i_gmask = 0;
++ tc->fmt.i_bmask = 0;
++ tc->fmt.space = COLOR_SPACE_UNDEF;
++ }
++ sys->drm_fourcc = vlc_to_gl_fourcc(&tc->fmt);
++ }
++
++ tc->handle_texs_gen = true; // We manage the texs
++ tc->pf_update = tc_mmal_update;
++
++#if TRACE_ALL
++ {
++ char dbuf0[5], dbuf1[5], dbuf2[5];
++ msg_Dbg(tc, ">>> %s: V:%s/E:%s,%dx%d [(%d,%d) %d/%d] sar:%d/%d -> %s", __func__,
++ str_fourcc(dbuf0, tc->fmt.i_chroma),
++ str_fourcc(dbuf1, sys->drm_fourcc),
++ tc->fmt.i_width, tc->fmt.i_height,
++ tc->fmt.i_x_offset, tc->fmt.i_y_offset,
++ tc->fmt.i_visible_width, tc->fmt.i_visible_height,
++ tc->fmt.i_sar_num, tc->fmt.i_sar_den,
++ str_fourcc(dbuf2, chroma_out));
++ }
++#endif
++
++ return VLC_SUCCESS;
++
++fail:
++ // CloseGLConverter copes with tc->priv == NULL (calloc failure case)
++ CloseGLConverter(obj);
++ return rv;
++}
++
++// Module descriptor: registers OpenGLConverter/CloseGLConverter under the
++// "glconv" capability with score 900; selectable by name via the
++// "mmal_gl_converter" shortcut.
++vlc_module_begin ()
++ set_description("MMAL OpenGL surface converter")
++ set_shortname (N_("MMALGLConverter"))
++ set_capability("glconv", 900)
++ set_callbacks(OpenGLConverter, CloseGLConverter)
++ set_category(CAT_VIDEO)
++ set_subcategory(SUBCAT_VIDEO_VOUT)
++ add_shortcut("mmal_gl_converter")
++vlc_module_end ()
++
+--- a/modules/hw/mmal/deinterlace.c
++++ b/modules/hw/mmal/deinterlace.c
+@@ -26,11 +26,12 @@
+ #include "config.h"
+ #endif
+
+-#include <vlc_picture_pool.h>
++#include <stdatomic.h>
++
+ #include <vlc_common.h>
++#include <vlc_picture_pool.h>
+ #include <vlc_plugin.h>
+ #include <vlc_filter.h>
+-#include <vlc_atomic.h>
+
+ #include "mmal_picture.h"
+
+@@ -39,468 +40,814 @@
+ #include <interface/mmal/util/mmal_util.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+
+-#define MIN_NUM_BUFFERS_IN_TRANSIT 2
++#define MMAL_DEINTERLACE_NO_QPU "mmal-deinterlace-no-qpu"
++#define MMAL_DEINTERLACE_NO_QPU_TEXT N_("Do not use QPUs for advanced HD deinterlacing.")
++#define MMAL_DEINTERLACE_NO_QPU_LONGTEXT N_("Do not make use of the QPUs to allow higher quality deinterlacing of HD content.")
+
+-#define MMAL_DEINTERLACE_QPU "mmal-deinterlace-adv-qpu"
+-#define MMAL_DEINTERLACE_QPU_TEXT N_("Use QPUs for advanced HD deinterlacing.")
+-#define MMAL_DEINTERLACE_QPU_LONGTEXT N_("Make use of the QPUs to allow higher quality deinterlacing of HD content.")
++#define MMAL_DEINTERLACE_ADV "mmal-deinterlace-adv"
++#define MMAL_DEINTERLACE_ADV_TEXT N_("Force advanced deinterlace")
++#define MMAL_DEINTERLACE_ADV_LONGTEXT N_("Force advanced deinterlace")
+
+-static int Open(filter_t *filter);
+-static void Close(filter_t *filter);
++#define MMAL_DEINTERLACE_FAST "mmal-deinterlace-fast"
++#define MMAL_DEINTERLACE_FAST_TEXT N_("Force fast deinterlace")
++#define MMAL_DEINTERLACE_FAST_LONGTEXT N_("Force fast deinterlace")
+
+-vlc_module_begin()
+- set_shortname(N_("MMAL deinterlace"))
+- set_description(N_("MMAL-based deinterlace filter plugin"))
+- set_capability("video filter", 0)
+- set_category(CAT_VIDEO)
+- set_subcategory(SUBCAT_VIDEO_VFILTER)
+- set_callbacks(Open, Close)
+- add_shortcut("deinterlace")
+- add_bool(MMAL_DEINTERLACE_QPU, false, MMAL_DEINTERLACE_QPU_TEXT,
+- MMAL_DEINTERLACE_QPU_LONGTEXT, true);
+-vlc_module_end()
++#define MMAL_DEINTERLACE_NONE "mmal-deinterlace-none"
++#define MMAL_DEINTERLACE_NONE_TEXT N_("Force no deinterlace")
++#define MMAL_DEINTERLACE_NONE_LONGTEXT N_("Force no interlace. Simply strips off the interlace markers and passes the frame straight through. "\
++ "This is the default for > SD if < 96M gpu-mem")
++
++#define MMAL_DEINTERLACE_HALF_RATE "mmal-deinterlace-half-rate"
++#define MMAL_DEINTERLACE_HALF_RATE_TEXT N_("Halve output framerate")
++#define MMAL_DEINTERLACE_HALF_RATE_LONGTEXT N_("Halve output framerate. 1 output frame for each pair of interlaced fields input")
++
++#define MMAL_DEINTERLACE_FULL_RATE "mmal-deinterlace-full-rate"
++#define MMAL_DEINTERLACE_FULL_RATE_TEXT N_("Full output framerate")
++#define MMAL_DEINTERLACE_FULL_RATE_LONGTEXT N_("Full output framerate. 1 output frame for each interlaced field input")
+
+-struct filter_sys_t {
++
++// Private state of the MMAL deinterlace filter (filter->p_sys)
++typedef struct filter_sys_t
++{
+ MMAL_COMPONENT_T *component;
+ MMAL_PORT_T *input;
+ MMAL_PORT_T *output;
++ // Pool that input buffer headers are taken from when a pic is submitted
++ MMAL_POOL_T *in_pool;
++
++ // Filled by the output port callback, drained by deinterlace()
++ MMAL_QUEUE_T * out_q;
++
++ // Bind this lot somehow into ppr????
++ // is_cma is set at open from the output chroma; cma_out_pool & out_pool
++ // are only used when it is true
++ bool is_cma;
++ cma_buf_pool_t * cma_out_pool;
++ MMAL_POOL_T * out_pool;
++
++ // Output port/pool reference used on the non-CMA path
++ hw_mmal_port_pool_ref_t *out_ppr;
++
++ // Mode selection - defaults chosen at open, overridable by module options
++ bool half_rate;
++ bool use_qpu;
++ bool use_fast;
++ bool use_passthrough;
++ unsigned int seq_in; // Seq of next frame to submit (1-15) [Init=1]
++ unsigned int seq_out; // Seq of last frame received (1-15) [Init=15]
+
+- MMAL_QUEUE_T *filtered_pictures;
+- vlc_sem_t sem;
++ // Value returned by cma_vcsm_init(); passed back to cma_vcsm_exit()
++ vcsm_init_type_t vcsm_init_type;
+
+- atomic_bool started;
++} filter_sys_t;
+
+- /* statistics */
+- int output_in_transit;
+- int input_in_transit;
+-};
+-
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static picture_t *deinterlace(filter_t *filter, picture_t *picture);
+-static void flush(filter_t *filter);
+
+ #define MMAL_COMPONENT_DEFAULT_DEINTERLACE "vc.ril.image_fx"
+
+-static int Open(filter_t *filter)
++#define TRACE_ALL 0
++
++
++
++// Allocate a new output picture and attach the MMAL buffer buf to it.
++// Buffer attached to pic on success, is still valid on failure
++// Returns the new picture, or NULL if no picture could be allocated, buf is
++// empty or no picture context could be generated for it.
++static picture_t * di_alloc_opaque(filter_t * const p_filter, MMAL_BUFFER_HEADER_T * const buf)
+ {
+- int32_t frame_duration = filter->fmt_in.video.i_frame_rate != 0 ?
+- (int64_t)1000000 * filter->fmt_in.video.i_frame_rate_base /
+- filter->fmt_in.video.i_frame_rate : 0;
+- bool use_qpu = var_InheritBool(filter, MMAL_DEINTERLACE_QPU);
++ filter_sys_t *const filter_sys = p_filter->p_sys;
++ picture_t * const pic = filter_NewPicture(p_filter);
+
+- MMAL_PARAMETER_IMAGEFX_PARAMETERS_T imfx_param = {
+- { MMAL_PARAMETER_IMAGE_EFFECT_PARAMETERS, sizeof(imfx_param) },
+- MMAL_PARAM_IMAGEFX_DEINTERLACE_ADV,
+- 4,
+- { 3, frame_duration, 0, use_qpu }
+- };
++ if (pic == NULL)
++ goto fail1;
+
+- int ret = VLC_SUCCESS;
+- MMAL_STATUS_T status;
+- filter_sys_t *sys;
++ if (buf->length == 0) {
++ msg_Err(p_filter, "%s: Empty buffer", __func__);
++ goto fail2;
++ }
+
+- msg_Dbg(filter, "Try to open mmal_deinterlace filter. frame_duration: %d, QPU %s!",
+- frame_duration, use_qpu ? "used" : "unused");
++ // Wrap the buffer in a picture context bound to the output port pool ref
++ if ((pic->context = hw_mmal_gen_context(buf, filter_sys->out_ppr)) == NULL)
++ goto fail2;
+
+- if (filter->fmt_in.video.i_chroma != VLC_CODEC_MMAL_OPAQUE)
+- return VLC_EGENERIC;
++ buf_to_pic_copy_props(pic, buf);
+
+- if (filter->fmt_out.video.i_chroma != VLC_CODEC_MMAL_OPAQUE)
+- return VLC_EGENERIC;
++#if TRACE_ALL
++ msg_Dbg(p_filter, "pic: prog=%d, tff=%d, date=%lld", pic->b_progressive, pic->b_top_field_first, (long long)pic->date);
++#endif
+
+- sys = calloc(1, sizeof(filter_sys_t));
+- if (!sys)
+- return VLC_ENOMEM;
+- filter->p_sys = sys;
++ return pic;
+
+- bcm_host_init();
++fail2:
++ picture_Release(pic);
++fail1:
++ // buf is deliberately not released here - it remains the caller's
++// mmal_buffer_header_release(buf);
++ return NULL;
++}
+
+- status = mmal_component_create(MMAL_COMPONENT_DEFAULT_DEINTERLACE, &sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
+- MMAL_COMPONENT_DEFAULT_DEINTERLACE, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++// Input port callback: the only action is to release the buffer header it
++// is handed; everything else is optional tracing.
++static void di_input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{
++#if TRACE_ALL
++ pic_ctx_mmal_t * ctx = buffer->user_data;
++// filter_sys_t *const sys = ((filter_t *)port->userdata)->p_sys;
++
++ msg_Dbg((filter_t *)port->userdata, "<<< %s: cmd=%d, ctx=%p, buf=%p, flags=%#x, pts=%lld", __func__, buffer->cmd, ctx, buffer,
++ buffer->flags, (long long)buffer->pts);
++#else
++ // port is only needed for the trace messages
++ VLC_UNUSED(port);
++#endif
+
+- status = mmal_port_parameter_set(sys->component->output[0], &imfx_param.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to configure MMAL component %s (status=%"PRIx32" %s)",
+- MMAL_COMPONENT_DEFAULT_DEINTERLACE, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++ mmal_buffer_header_release(buffer);
+
+- sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
+- status = mmal_port_enable(sys->component->control, control_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to enable control port %s (status=%"PRIx32" %s)",
+- sys->component->control->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++#if TRACE_ALL
++ msg_Dbg((filter_t *)port->userdata, ">>> %s", __func__);
++#endif
++}
++
++// Output port callback.
++// Data buffers (cmd == 0 and non-zero length) are queued on sys->out_q for
++// deinterlace() to collect; anything else is reset and released here.
++static void di_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
++{
++ if (buf->cmd == 0 && buf->length != 0)
++ {
++ // The filter structure etc. should always exist if we have contents
++ // but might not on later flushes as we shut down
++ filter_t * const p_filter = (filter_t *)port->userdata;
++ filter_sys_t * const sys = p_filter->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s: cmd=%d; flags=%#x, pts=%lld", __func__, buf->cmd, buf->flags, (long long) buf->pts);
++#endif
++ mmal_queue_put(sys->out_q, buf);
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: out Q len=%d", __func__, mmal_queue_length(sys->out_q));
++#endif
++ return;
+ }
+
+- sys->input = sys->component->input[0];
+- sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
+- if (filter->fmt_in.i_codec == VLC_CODEC_MMAL_OPAQUE)
+- sys->input->format->encoding = MMAL_ENCODING_OPAQUE;
+- sys->input->format->es->video.width = filter->fmt_in.video.i_width;
+- sys->input->format->es->video.height = filter->fmt_in.video.i_height;
+- sys->input->format->es->video.crop.x = 0;
+- sys->input->format->es->video.crop.y = 0;
+- sys->input->format->es->video.crop.width = filter->fmt_in.video.i_width;
+- sys->input->format->es->video.crop.height = filter->fmt_in.video.i_height;
+- sys->input->format->es->video.par.num = filter->fmt_in.video.i_sar_num;
+- sys->input->format->es->video.par.den = filter->fmt_in.video.i_sar_den;
++ // Empty or non-data buffer: port->userdata is not touched on this path
++ mmal_buffer_header_reset(buf); // User data stays intact so release will kill pic
++ mmal_buffer_header_release(buf);
++}
+
+- es_format_Copy(&filter->fmt_out, &filter->fmt_in);
+- filter->fmt_out.video.i_frame_rate *= 2;
+
+- status = mmal_port_format_commit(sys->input);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
+- sys->input->buffer_size = sys->input->buffer_size_recommended;
+- sys->input->buffer_num = sys->input->buffer_num_recommended;
+
+- if (filter->fmt_in.i_codec == VLC_CODEC_MMAL_OPAQUE) {
+- MMAL_PARAMETER_BOOLEAN_T zero_copy = {
+- { MMAL_PARAMETER_ZERO_COPY, sizeof(MMAL_PARAMETER_BOOLEAN_T) },
+- 1
+- };
++// Drain q, sending every buffer header found in it to the output port.
++// Stops at the first send failure: that buffer is put back on q and the
++// error is returned. Returns MMAL_SUCCESS once q is empty.
++static MMAL_STATUS_T fill_output_from_q(filter_t * const p_filter, filter_sys_t * const sys, MMAL_QUEUE_T * const q)
++{
++ MMAL_BUFFER_HEADER_T * out_buf;
+
+- status = mmal_port_parameter_set(sys->input, &zero_copy.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- goto out;
++ while ((out_buf = mmal_queue_get(q)) != NULL)
++ {
++ MMAL_STATUS_T err;
++ if ((err = mmal_port_send_buffer(sys->output, out_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to output failed");
++ // Keep the buffer on the queue rather than losing it
++ mmal_queue_put_back(q, out_buf);
++ return err;
+ }
+ }
++ return MMAL_SUCCESS;
++}
+
+- status = mmal_port_enable(sys->input, input_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to enable input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++// Output buffers may contain a pic ref on error or flush
++// Free it
++// Installed on CMA output buffer headers by deinterlace(): detaches whatever
++// cma_buf is still stored in header->user_data and drops a reference on it.
++// Always returns MMAL_FALSE.
++static MMAL_BOOL_T out_buffer_pre_release_cb(MMAL_BUFFER_HEADER_T *header, void *userdata)
++{
++ VLC_UNUSED(userdata);
+
+- sys->output = sys->component->output[0];
+- sys->output->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
+- mmal_format_full_copy(sys->output->format, sys->input->format);
++ cma_buf_t * const cb = header->user_data;
++ // Clear the link first so the header no longer refers to the buffer
++ header->user_data = NULL;
++ cma_buf_unref(cb); // Copes fine with NULL
+
+- status = mmal_port_format_commit(sys->output);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to commit format for output port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ return MMAL_FALSE;
++}
++
++// Advance a frame sequence number; values run 1..15 and wrap back to 1
++// (0 is never produced for inputs in that range).
++static inline unsigned int seq_inc(unsigned int x)
++{
++    const unsigned int next = x + 1;
++
++    if (next >= 16)
++        return 1;
++    return next;
++}
++
++// Distance from fseq forward to sseq on the 1..15 sequence ring.
++// An fseq of 0 is treated as "no sequence" and gives a delta of 0.
++static inline unsigned int seq_delta(unsigned int sseq, unsigned int fseq)
++{
++    if (fseq == 0)
++        return 0;
++
++    if (fseq <= sseq)
++        return sseq - fseq;
++
++    // fseq is ahead of sseq numerically, so sseq has wrapped
++    return 15 - (fseq - sseq);
++}
++
++// Filter callback: submit p_pic to the MMAL deinterlace component and return
++// whatever output pictures are ready.
++//   p_filter  this filter
++//   p_pic     input picture; the caller's reference is always consumed
++// Returns a chain (linked via p_next) of output pictures, which may be NULL
++// if nothing is ready yet, or NULL on failure.
++// On failure the input ref is released exactly once and any output pictures
++// already extracted in this call are released too.
++static picture_t *deinterlace(filter_t * p_filter, picture_t * p_pic)
++{
++ filter_sys_t * const sys = p_filter->p_sys;
++ picture_t *ret_pics = NULL;
++ MMAL_STATUS_T err;
++ MMAL_BUFFER_HEADER_T * out_buf = NULL;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s", __func__);
++#endif
++
++ if (hw_mmal_vlc_pic_to_mmal_fmt_update(sys->input->format, p_pic))
++ {
++ // ****** Breaks on opaque (at least)
++
++ if (sys->input->is_enabled)
++ mmal_port_disable(sys->input);
++#if 0
++ if (sys->output->is_enabled)
++ mmal_port_disable(sys->output);
++
++ mmal_format_full_copy(sys->output->format, sys->input->format);
++ mmal_port_format_commit(sys->output);
++ sys->output->buffer_num = 30;
++ sys->output->buffer_size = sys->input->buffer_size_recommended;
++ mmal_port_enable(sys->output, di_output_port_cb);
++#endif
++ if (mmal_port_format_commit(sys->input) != MMAL_SUCCESS)
++ msg_Err(p_filter, "Failed to update pic format");
++ sys->input->buffer_num = 30;
++ sys->input->buffer_size = sys->input->buffer_size_recommended;
++ mmal_log_dump_format(sys->input->format);
++ }
++
++ // Reenable stuff if the last thing we did was flush
++ // Output should always be enabled
++ if (!sys->input->is_enabled &&
++ (err = mmal_port_enable(sys->input, di_input_port_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Input port reenable failed");
++ goto fail;
++ }
++
++ if (!sys->is_cma)
++ {
++ // Fill output from anything that has turned up in pool Q
++ if (hw_mmal_port_pool_ref_fill(sys->out_ppr) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Out port fill fail");
++ goto fail;
++ }
+ }
++ else
++ {
++ // We are expecting one in - one out so simply wedge a new buffer
++ // into the output port. Flow control will happen on cma alloc.
++
++ if ((out_buf = mmal_queue_get(sys->out_pool->queue)) == NULL)
++ {
++ // Should never happen
++ msg_Err(p_filter, "Failed to get output buffer");
++ goto fail;
++ }
++ mmal_buffer_header_reset(out_buf);
+
+- sys->output->buffer_num = 3;
++ // Attach cma_buf to the buffer & ensure it is freed when the buffer is released
++ // On a good send callback the pic will be extracted to avoid this
++ mmal_buffer_header_pre_release_cb_set(out_buf, out_buffer_pre_release_cb, p_filter);
++
++ cma_buf_t * const cb = cma_buf_pool_alloc_buf(sys->cma_out_pool, sys->output->buffer_size);
++ if ((out_buf->user_data = cb) == NULL) // Check & attach cb to buf
++ {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to alloc CMA buf: fmt=%s, size=%d",
++ str_fourcc(dbuf0, p_pic->format.i_chroma),
++ sys->output->buffer_size);
++ goto fail;
++ }
++ const unsigned int vc_h = cma_buf_vc_handle(cb); // Cannot coerce without going via variable
++ out_buf->data = (uint8_t *)vc_h;
++ out_buf->alloc_size = sys->output->buffer_size;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "Out buf send: pic=%p, data=%p, user=%p, flags=%#x, len=%d/%d, pts=%lld",
++ p_pic, out_buf->data, out_buf->user_data, out_buf->flags,
++ out_buf->length, out_buf->alloc_size, (long long)out_buf->pts);
++#endif
+
+- if (filter->fmt_in.i_codec == VLC_CODEC_MMAL_OPAQUE) {
+- MMAL_PARAMETER_UINT32_T extra_buffers = {
+- { MMAL_PARAMETER_EXTRA_BUFFERS, sizeof(MMAL_PARAMETER_UINT32_T) },
+- 5
+- };
+- status = mmal_port_parameter_set(sys->output, &extra_buffers.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to set MMAL_PARAMETER_EXTRA_BUFFERS on output port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- goto out;
++ if ((err = mmal_port_send_buffer(sys->output, out_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to output failed");
++ goto fail;
+ }
++ out_buf = NULL;
++ }
+
+- MMAL_PARAMETER_BOOLEAN_T zero_copy = {
+- { MMAL_PARAMETER_ZERO_COPY, sizeof(MMAL_PARAMETER_BOOLEAN_T) },
+- 1
+- };
++ // Stuff into input
++ // We assume the BH is already set up with values reflecting pic date etc.
++ {
++ MMAL_BUFFER_HEADER_T * const pic_buf = hw_mmal_pic_buf_replicated(p_pic, sys->in_pool);
++
++ if (pic_buf == NULL)
++ {
++ msg_Err(p_filter, "Pic has not attached buffer");
++ goto fail;
++ }
+
+- status = mmal_port_parameter_set(sys->output, &zero_copy.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
+- sys->output->name, status, mmal_status_to_string(status));
+- goto out;
++ picture_Release(p_pic);
++ // Our ref is gone - make sure the fail path cannot release it a second time
++ p_pic = NULL;
++
++ // Add a sequence to the flags so we can track what we have actually
++ // deinterlaced
++ pic_buf->flags = (pic_buf->flags & ~(0xfU * MMAL_BUFFER_HEADER_FLAG_USER0)) | (sys->seq_in * (MMAL_BUFFER_HEADER_FLAG_USER0));
++ sys->seq_in = seq_inc(sys->seq_in);
++
++ if ((err = mmal_port_send_buffer(sys->input, pic_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to input failed");
++ mmal_buffer_header_release(pic_buf);
++ goto fail;
+ }
+ }
+
+- status = mmal_port_enable(sys->output, output_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to enable output port %s (status=%"PRIx32" %s)",
+- sys->output->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ // Return anything that is in the out Q
++ {
++ picture_t ** pp_pic = &ret_pics;
++
++ // Advanced di has a 3 frame latency, so if the seq delta is greater
++ // than that then we are expecting at least two frames of output. Wait
++ // for one of those.
++ // seq_in is seq of the next frame we are going to submit (1-15, no 0)
++ // seq_out is last frame we removed from Q
++ // So after 4 frames sent (1st time we want to wait), 0 rx seq_in=5, seq_out=15, delta=5
++
++ while ((out_buf = (seq_delta(sys->seq_in, sys->seq_out) >= 5 ? mmal_queue_timedwait(sys->out_q, 1000) : mmal_queue_get(sys->out_q))) != NULL)
++ {
++ const unsigned int seq_out = (out_buf->flags / MMAL_BUFFER_HEADER_FLAG_USER0) & 0xf;
++ int rv;
++
++ picture_t * out_pic;
++
++ if (sys->is_cma)
++ {
++ // Alloc pic
++ if ((out_pic = filter_NewPicture(p_filter)) == NULL)
++ {
++ // Can't alloc pic - just stop extraction
++ mmal_queue_put_back(sys->out_q, out_buf);
++ out_buf = NULL;
++ msg_Warn(p_filter, "Failed to alloc new filter output pic");
++ break;
++ }
++
++ // Extract cma_buf from buf & attach to pic
++ cma_buf_t * const cb = (cma_buf_t *)out_buf->user_data;
++ if ((rv = cma_buf_pic_attach(cb, out_pic)) != VLC_SUCCESS)
++ {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to attach CMA to pic: fmt=%s err=%d",
++ str_fourcc(dbuf0, out_pic->format.i_chroma),
++ rv);
++ // cb still attached to buffer and will be freed with it
++ // The new pic is not in the return chain yet so free it here
++ picture_Release(out_pic);
++ goto fail;
++ }
++ out_buf->user_data = NULL;
++
++ buf_to_pic_copy_props(out_pic, out_buf);
++
++ // Set pic data pointers from buf aux info now it has it
++ if ((rv = cma_pic_set_data(out_pic, sys->output->format, out_buf)) != VLC_SUCCESS)
++ {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to set data: fmt=%s, rv=%d",
++ str_fourcc(dbuf0, sys->output->format->encoding),
++ rv);
++ }
++
++ out_buf->user_data = NULL; // Responsibility for this pic no longer with buffer
++ mmal_buffer_header_release(out_buf);
++ }
++ else
++ {
++ out_pic = di_alloc_opaque(p_filter, out_buf);
++
++ if (out_pic == NULL) {
++ msg_Warn(p_filter, "Failed to alloc new filter output pic");
++ mmal_queue_put_back(sys->out_q, out_buf); // Wedge buf back into Q in the hope we can alloc a pic later
++ out_buf = NULL;
++ break;
++ }
++ }
++ out_buf = NULL; // Now attached to pic or recycled
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "-- %s: Q pic=%p: seq_in=%d, seq_out=%d, delta=%d", __func__, out_pic, sys->seq_in, seq_out, seq_delta(sys->seq_in, seq_out));
++#endif
++
++ *pp_pic = out_pic;
++ pp_pic = &out_pic->p_next;
++
++ // Ignore 0 seqs
++ // Don't think these should actually happen
++ if (seq_out != 0)
++ sys->seq_out = seq_out;
++ }
++
++ // Crash on lockup
++ assert(ret_pics != NULL || seq_delta(sys->seq_in, sys->seq_out) < 5);
+ }
+
+- status = mmal_component_enable(sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to enable component %s (status=%"PRIx32" %s)",
+- sys->component->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: pic=%p", __func__, ret_pics);
++#endif
++
++ return ret_pics;
++
++fail:
++ if (out_buf != NULL)
++ mmal_buffer_header_release(out_buf);
++ // p_pic is NULL if it has already been released after submission
++ if (p_pic != NULL)
++ picture_Release(p_pic);
++ // We are returning NULL so free any output pics already chained up
++ while (ret_pics != NULL)
++ {
++ picture_t * const next_pic = ret_pics->p_next;
++ picture_Release(ret_pics);
++ ret_pics = next_pic;
++ }
++ return NULL;
++}
++
++// Flush callback: disable the input port, drain/recycle everything pending
++// on the output side, then leave the output port enabled again and reset
++// the sequence counters to their initial values.
++static void di_flush(filter_t *p_filter)
++{
++ filter_sys_t * const sys = p_filter->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s", __func__);
++#endif
++
++ if (sys->input != NULL && sys->input->is_enabled)
++ mmal_port_disable(sys->input);
++
++ if (sys->output != NULL && sys->output->is_enabled)
++ {
++ if (sys->is_cma)
++ {
++ // CMA: stop the port then simply release whatever reached out_q
++ MMAL_BUFFER_HEADER_T * buf;
++ mmal_port_disable(sys->output);
++ while ((buf = mmal_queue_get(sys->out_q)) != NULL)
++ mmal_buffer_header_release(buf);
++ }
++ else
++ {
++ // Wedge anything we've got into the output port as that will free the underlying buffers
++ fill_output_from_q(p_filter, sys, sys->out_q);
++
++ mmal_port_disable(sys->output);
++
++ // If that dumped anything real into the out_q then have another go
++ if (mmal_queue_length(sys->out_q) != 0)
++ {
++ mmal_port_enable(sys->output, di_output_port_cb);
++ fill_output_from_q(p_filter, sys, sys->out_q);
++ mmal_port_disable(sys->output);
++ // Out q should now be empty & should remain so until the input is reenabled
++ }
++ }
++ mmal_port_enable(sys->output, di_output_port_cb);
++
++ // Leaving the input disabled is fine - but we want to leave the output enabled
++ // so we can retrieve buffers that are still bound to pictures
+ }
+
+- sys->filtered_pictures = mmal_queue_create();
++ // Same values as set when the filter is opened
++ sys->seq_in = 1;
++ sys->seq_out = 15;
+
+- filter->pf_video_filter = deinterlace;
+- filter->pf_flush = flush;
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s", __func__);
++#endif
++}
+
+- vlc_sem_init(&sys->sem, 0);
+
+-out:
+- if (ret != VLC_SUCCESS)
+- Close(filter);
++// Flush callback used in passthrough mode: no state is held, so this
++// deliberately does nothing.
++static void pass_flush(filter_t *p_filter)
++{
++    VLC_UNUSED(p_filter);
++}
+
+- return ret;
++// Filter callback used in passthrough mode: hands the same picture straight
++// back after flagging it as progressive.
++static picture_t * pass_deinterlace(filter_t * p_filter, picture_t * p_pic)
++{
++    picture_t * const out = p_pic;
++
++    VLC_UNUSED(p_filter);
++
++    out->b_progressive = true;
++    return out;
+ }
+
+-static void Close(filter_t *filter)
++
++// Control port callback: logs MMAL error events and then resets and
++// releases the event buffer. Events other than MMAL_EVENT_ERROR are
++// released without being logged.
++static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
+ {
+- filter_sys_t *sys = filter->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
++ filter_t *filter = (filter_t *)port->userdata;
++ MMAL_STATUS_T status;
+
+- if (!sys)
++ if (buffer->cmd == MMAL_EVENT_ERROR) {
++ // The status code is read from the start of the event data
++ status = *(uint32_t *)buffer->data;
++ msg_Err(filter, "MMAL error %"PRIx32" \"%s\"", status,
++ mmal_status_to_string(status));
++ }
++
++ mmal_buffer_header_reset(buffer);
++ mmal_buffer_header_release(buffer);
++}
++
++// Module close callback (also usable as open-failure cleanup since every
++// member is checked before use): flushes, disables the control port and
++// component, destroys pools and queue, releases the component, exits VCSM
++// and frees the private state. In passthrough mode only the state is freed.
++static void CloseMmalDeinterlace(filter_t *filter)
++{
++ filter_sys_t * const sys = filter->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(filter, "<<< %s", __func__);
++#endif
++
++ if (sys == NULL)
+ return;
+
+- if (sys->component && sys->component->control->is_enabled)
+- mmal_port_disable(sys->component->control);
++ // Passthrough never created any MMAL objects (and VCSM was already
++ // exited at open) so there is nothing else to tear down
++ if (sys->use_passthrough)
++ {
++ free(sys);
++ return;
++ }
+
+- if (sys->input && sys->input->is_enabled)
+- mmal_port_disable(sys->input);
++ // Disables the input and recycles pending output buffers
++ di_flush(filter);
+
+- if (sys->output && sys->output->is_enabled)
+- mmal_port_disable(sys->output);
++ if (sys->component && sys->component->control->is_enabled)
++ mmal_port_disable(sys->component->control);
+
+ if (sys->component && sys->component->is_enabled)
+ mmal_component_disable(sys->component);
+
+- while ((buffer = mmal_queue_get(sys->filtered_pictures))) {
+- picture_t *pic = (picture_t *)buffer->user_data;
+- picture_Release(pic);
++ if (sys->in_pool != NULL)
++ mmal_pool_destroy(sys->in_pool);
++
++ // NOTE(review): called unconditionally - presumably copes with a NULL
++ // out_ppr (CMA mode / early open failure); confirm against its definition
++ hw_mmal_port_pool_ref_release(sys->out_ppr, false);
++ // Once we exit filter & sys are invalid so mark as such
++ if (sys->output != NULL)
++ sys->output->userdata = NULL;
++
++ if (sys->is_cma)
++ {
++ // di_flush leaves the output port enabled, so stop it before the
++ // CMA pool and its buffer headers go away
++ if (sys->output && sys->output->is_enabled)
++ mmal_port_disable(sys->output);
++
++ cma_buf_pool_deletez(&sys->cma_out_pool);
++
++ if (sys->out_pool != NULL)
++ mmal_pool_destroy(sys->out_pool);
+ }
+
+- if (sys->filtered_pictures)
+- mmal_queue_destroy(sys->filtered_pictures);
++ if (sys->out_q != NULL)
++ mmal_queue_destroy(sys->out_q);
+
+ if (sys->component)
+ mmal_component_release(sys->component);
+
+- vlc_sem_destroy(&sys->sem);
++ cma_vcsm_exit(sys->vcsm_init_type);
++
+ free(sys);
++}
++
+
+- bcm_host_deinit();
++// True if fmt is one of the input chromas this filter accepts:
++// MMAL opaque, zero-copy I420 or zero-copy SAND8.
++static bool is_fmt_valid_in(const vlc_fourcc_t fmt)
++{
++    switch (fmt)
++    {
++        case VLC_CODEC_MMAL_OPAQUE:
++        case VLC_CODEC_MMAL_ZC_I420:
++        case VLC_CODEC_MMAL_ZC_SAND8:
++            return true;
++        default:
++            return false;
++    }
+ }
+
+-static int send_output_buffer(filter_t *filter)
++static int OpenMmalDeinterlace(filter_t *filter)
+ {
+- filter_sys_t *sys = filter->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
++ int32_t frame_duration = filter->fmt_in.video.i_frame_rate != 0 ?
++ CLOCK_FREQ * filter->fmt_in.video.i_frame_rate_base /
++ filter->fmt_in.video.i_frame_rate : 0;
++
++ int ret = VLC_EGENERIC;
+ MMAL_STATUS_T status;
+- picture_t *picture;
+- int ret = 0;
++ filter_sys_t *sys;
++
++ msg_Dbg(filter, "<<< %s", __func__);
++
++ if (!is_fmt_valid_in(filter->fmt_in.video.i_chroma) ||
++ filter->fmt_out.video.i_chroma != filter->fmt_in.video.i_chroma)
++ return VLC_EGENERIC;
+
+- if (!sys->output->is_enabled) {
+- ret = VLC_EGENERIC;
+- goto out;
++ sys = calloc(1, sizeof(filter_sys_t));
++ if (!sys)
++ return VLC_ENOMEM;
++ filter->p_sys = sys;
++
++ sys->seq_in = 1;
++ sys->seq_out = 15;
++ sys->is_cma = is_cma_buf_pic_chroma(filter->fmt_out.video.i_chroma);
++
++ if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++ msg_Err(filter, "VCSM init failed");
++ goto fail;
++ }
++
++ if (rpi_is_model_pi4())
++ {
++ sys->half_rate = true;
++ sys->use_qpu = false;
++ sys->use_fast = true;
++ }
++ else
++ {
++ sys->half_rate = false;
++ sys->use_qpu = true;
++ sys->use_fast = false;
++ }
++ sys->use_passthrough = false;
++
++ if (filter->fmt_in.video.i_width * filter->fmt_in.video.i_height > 768 * 576)
++ {
++ // We get stressed if we have to try too hard - so make life easier
++ sys->half_rate = true;
++ // Also check we actually have enough memory to do this
++ // Memory always comes from GPU if Opaque
++ // Assume we have plenty of memory if it comes from CMA
++ if ((!sys->is_cma || sys->vcsm_init_type == VCSM_INIT_LEGACY) &&
++ hw_mmal_get_gpu_mem() < (96 << 20))
++ {
++ sys->use_passthrough = true;
++ msg_Warn(filter, "Deinterlace bypassed due to lack of GPU memory");
++ }
+ }
+
+- picture = filter_NewPicture(filter);
+- if (!picture) {
+- msg_Warn(filter, "Failed to get new picture");
+- ret = -1;
+- goto out;
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_NO_QPU))
++ sys->use_qpu = false;
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_ADV))
++ {
++ sys->use_fast = false;
++ sys->use_passthrough = false;
++ }
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_FAST))
++ {
++ sys->use_fast = true;
++ sys->use_passthrough = false;
++ }
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_NONE))
++ sys->use_passthrough = true;
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_FULL_RATE))
++ sys->half_rate = false;
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_HALF_RATE))
++ sys->half_rate = true;
++
++ if (sys->use_passthrough)
++ {
++ filter->pf_video_filter = pass_deinterlace;
++ filter->pf_flush = pass_flush;
++ // Don't need VCSM - get rid of it now
++ cma_vcsm_exit(sys->vcsm_init_type);
++ sys->vcsm_init_type = VCSM_INIT_NONE;
++ return 0;
++ }
++
++ {
++ char dbuf0[5], dbuf1[5];
++ msg_Dbg(filter, "%s: %s,%dx%d [(%d,%d) %d/%d] -> %s,%dx%d [(%d,%d) %dx%d]: %s %s %s", __func__,
++ str_fourcc(dbuf0, filter->fmt_in.video.i_chroma),
++ filter->fmt_in.video.i_width, filter->fmt_in.video.i_height,
++ filter->fmt_in.video.i_x_offset, filter->fmt_in.video.i_y_offset,
++ filter->fmt_in.video.i_visible_width, filter->fmt_in.video.i_visible_height,
++ str_fourcc(dbuf1, filter->fmt_out.video.i_chroma),
++ filter->fmt_out.video.i_width, filter->fmt_out.video.i_height,
++ filter->fmt_out.video.i_x_offset, filter->fmt_out.video.i_y_offset,
++ filter->fmt_out.video.i_visible_width, filter->fmt_out.video.i_visible_height,
++ sys->use_qpu ? "QPU" : "VPU",
++ sys->use_fast ? "FAST" : "ADV",
++ sys->use_passthrough ? "PASS" : sys->half_rate ? "HALF" : "FULL");
+ }
+- picture->format.i_frame_rate = filter->fmt_out.video.i_frame_rate;
+- picture->format.i_frame_rate_base = filter->fmt_out.video.i_frame_rate_base;
+
+- buffer = picture->p_sys->buffer;
+- buffer->user_data = picture;
+- buffer->cmd = 0;
++ status = mmal_component_create(MMAL_COMPONENT_DEFAULT_DEINTERLACE, &sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
++ MMAL_COMPONENT_DEFAULT_DEINTERLACE, status, mmal_status_to_string(status));
++ goto fail;
++ }
+
+- mmal_picture_lock(picture);
++ {
++ const MMAL_PARAMETER_IMAGEFX_PARAMETERS_T imfx_param = {
++ { MMAL_PARAMETER_IMAGE_EFFECT_PARAMETERS, sizeof(imfx_param) },
++ sys->use_fast ?
++ MMAL_PARAM_IMAGEFX_DEINTERLACE_FAST :
++ MMAL_PARAM_IMAGEFX_DEINTERLACE_ADV,
++ 4,
++ { 5 /* Frame type: mixed */, frame_duration, sys->half_rate, sys->use_qpu }
++ };
+
+- status = mmal_port_send_buffer(sys->output, buffer);
++ status = mmal_port_parameter_set(sys->component->output[0], &imfx_param.hdr);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to configure MMAL component %s (status=%"PRIx32" %s)",
++ MMAL_COMPONENT_DEFAULT_DEINTERLACE, status, mmal_status_to_string(status));
++ goto fail;
++ }
++ }
++
++ sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
++ status = mmal_port_enable(sys->component->control, control_port_cb);
+ if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to send buffer to output port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- mmal_buffer_header_release(buffer);
+- picture_Release(picture);
+- ret = -1;
+- } else {
+- atomic_fetch_add(&sys->output_in_transit, 1);
+- vlc_sem_post(&sys->sem);
++ msg_Err(filter, "Failed to enable control port %s (status=%"PRIx32" %s)",
++ sys->component->control->name, status, mmal_status_to_string(status));
++ goto fail;
+ }
+
+-out:
+- return ret;
+-}
++ sys->input = sys->component->input[0];
++ sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
++ sys->input->format->encoding = vlc_to_mmal_video_fourcc(&filter->fmt_in.video);
++ hw_mmal_vlc_fmt_to_mmal_fmt(sys->input->format, &filter->fmt_in.video);
+
+-static void fill_output_port(filter_t *filter)
+-{
+- filter_sys_t *sys = filter->p_sys;
+- /* allow at least 2 buffers in transit */
+- unsigned max_buffers_in_transit = __MAX(2, MIN_NUM_BUFFERS_IN_TRANSIT);
+- int buffers_available = sys->output->buffer_num -
+- atomic_load(&sys->output_in_transit) -
+- mmal_queue_length(sys->filtered_pictures);
+- int buffers_to_send = max_buffers_in_transit - sys->output_in_transit;
+- int i;
++ es_format_Copy(&filter->fmt_out, &filter->fmt_in);
++ if (!sys->half_rate)
++ filter->fmt_out.video.i_frame_rate *= 2;
+
+- if (buffers_to_send > buffers_available)
+- buffers_to_send = buffers_available;
++ status = mmal_port_format_commit(sys->input);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++ sys->input->buffer_size = sys->input->buffer_size_recommended;
++ sys->input->buffer_num = 30;
++// sys->input->buffer_num = sys->input->buffer_num_recommended;
+
+-#ifndef NDEBUG
+- msg_Dbg(filter, "Send %d buffers to output port (available: %d, in_transit: %d, buffer_num: %d)",
+- buffers_to_send, buffers_available, sys->output_in_transit,
+- sys->output->buffer_num);
+-#endif
+- for (i = 0; i < buffers_to_send; ++i) {
+- if (send_output_buffer(filter) < 0)
+- break;
++ if ((sys->in_pool = mmal_pool_create(sys->input->buffer_num, 0)) == NULL)
++ {
++ msg_Err(filter, "Failed to create input pool");
++ goto fail;
+ }
+-}
+
+-static picture_t *deinterlace(filter_t *filter, picture_t *picture)
+-{
+- filter_sys_t *sys = filter->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
+- picture_t *out_picture = NULL;
+- picture_t *ret = NULL;
+- MMAL_STATUS_T status;
+- unsigned i = 0;
++ status = port_parameter_set_bool(sys->input, MMAL_PARAMETER_ZERO_COPY, true);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
+
+- fill_output_port(filter);
++ status = mmal_port_enable(sys->input, di_input_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to enable input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
+
+- buffer = picture->p_sys->buffer;
+- buffer->user_data = picture;
+- buffer->pts = picture->date;
+- buffer->cmd = 0;
+
+- if (!picture->p_sys->displayed) {
+- status = mmal_port_send_buffer(sys->input, buffer);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to send buffer to input port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- picture_Release(picture);
+- } else {
+- picture->p_sys->displayed = true;
+- atomic_fetch_add(&sys->input_in_transit, 1);
+- vlc_sem_post(&sys->sem);
+- }
+- } else {
+- picture_Release(picture);
+- }
+-
+- /*
+- * Send output buffers
+- */
+- while(atomic_load(&sys->started) && i < 2) {
+- if (buffer = mmal_queue_timedwait(sys->filtered_pictures, 2000)) {
+- i++;
+- if (!out_picture) {
+- out_picture = (picture_t *)buffer->user_data;
+- ret = out_picture;
+- } else {
+- out_picture->p_next = (picture_t *)buffer->user_data;
+- out_picture = out_picture->p_next;
+- }
+- out_picture->date = buffer->pts;
+- } else {
+- msg_Dbg(filter, "Failed waiting for filtered picture");
+- break;
+- }
++ if ((sys->out_q = mmal_queue_create()) == NULL)
++ {
++ msg_Err(filter, "Failed to create out Q");
++ goto fail;
+ }
+- if (out_picture)
+- out_picture->p_next = NULL;
+
+- return ret;
+-}
+-
+-static void flush(filter_t *filter)
+-{
+- filter_sys_t *sys = filter->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
++ sys->output = sys->component->output[0];
++ mmal_format_full_copy(sys->output->format, sys->input->format);
+
+- msg_Dbg(filter, "flush deinterlace filter");
++ if (!sys->is_cma)
++ {
++ if ((status = hw_mmal_opaque_output(VLC_OBJECT(filter), &sys->out_ppr, sys->output, 5, di_output_port_cb)) != MMAL_SUCCESS)
++ goto fail;
++ }
++ else
++ {
++ // CMA stuff
++ sys->output->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
++
++ if ((sys->cma_out_pool = cma_buf_pool_new(8, 8, true, "deinterlace")) == NULL)
++ {
++ msg_Err(filter, "Failed to alloc cma buf pool");
++ goto fail;
++ }
+
+- msg_Dbg(filter, "flush: flush ports (input: %d, output: %d in transit)",
+- sys->input_in_transit, sys->output_in_transit);
+- mmal_port_flush(sys->output);
+- mmal_port_flush(sys->input);
+-
+- msg_Dbg(filter, "flush: wait for all buffers to be returned");
+- while (atomic_load(&sys->input_in_transit) ||
+- atomic_load(&sys->output_in_transit))
+- vlc_sem_wait(&sys->sem);
+-
+- while ((buffer = mmal_queue_get(sys->filtered_pictures))) {
+- picture_t *pic = (picture_t *)buffer->user_data;
+- msg_Dbg(filter, "flush: release already filtered pic %p",
+- (void *)pic);
+- picture_Release(pic);
+- }
+- atomic_store(&sys->started, false);
+- msg_Dbg(filter, "flush: done");
+-}
++ // Rate control done by CMA in flight logic, so have "inexhaustible" pool here
++ if ((sys->out_pool = mmal_pool_create(30, 0)) == NULL)
++ {
++ msg_Err(filter, "Failed to alloc out pool");
++ goto fail;
++ }
+
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
+-{
+- filter_t *filter = (filter_t *)port->userdata;
+- MMAL_STATUS_T status;
++ port_parameter_set_bool(sys->output, MMAL_PARAMETER_ZERO_COPY, true);
+
+- if (buffer->cmd == MMAL_EVENT_ERROR) {
+- status = *(uint32_t *)buffer->data;
+- msg_Err(filter, "MMAL error %"PRIx32" \"%s\"", status,
+- mmal_status_to_string(status));
+- }
++ if ((status = mmal_port_format_commit(sys->output)) != MMAL_SUCCESS)
++ {
++ msg_Err(filter, "Output port format commit failed");
++ goto fail;
++ }
+
+- mmal_buffer_header_release(buffer);
+-}
++ sys->output->buffer_num = 30;
++ sys->output->buffer_size = sys->output->buffer_size_recommended;
+
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
+-{
+- picture_t *picture = (picture_t *)buffer->user_data;
+- filter_t *filter = (filter_t *)port->userdata;
+- filter_sys_t *sys = filter->p_sys;
++ // CB just drops all bufs into out_q
++ if ((status = mmal_port_enable(sys->output, di_output_port_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(filter, "Failed to enable output port %s (status=%"PRIx32" %s)",
++ sys->output->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++ }
+
+- if (picture) {
+- picture_Release(picture);
+- } else {
+- msg_Warn(filter, "Got buffer without picture on input port - OOOPS");
+- mmal_buffer_header_release(buffer);
++ status = mmal_component_enable(sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to enable component %s (status=%"PRIx32" %s)",
++ sys->component->name, status, mmal_status_to_string(status));
++ goto fail;
+ }
+
+- atomic_fetch_sub(&sys->input_in_transit, 1);
+- vlc_sem_post(&sys->sem);
++ filter->pf_video_filter = deinterlace;
++ filter->pf_flush = di_flush;
++ return 0;
++
++fail:
++ CloseMmalDeinterlace(filter);
++ return ret;
+ }
+
+-static void output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
+-{
+- filter_t *filter = (filter_t *)port->userdata;
+- filter_sys_t *sys = filter->p_sys;
+- picture_t *picture;
++vlc_module_begin() /* Module descriptor for the MMAL deinterlace video filter */
++ set_shortname(N_("MMAL deinterlace"))
++ set_description(N_("MMAL-based deinterlace filter plugin"))
++ set_capability("video filter", 900) /* capability name and this module's score for it */
++ set_category(CAT_VIDEO)
++ set_subcategory(SUBCAT_VIDEO_VFILTER)
++ set_callbacks(OpenMmalDeinterlace, CloseMmalDeinterlace) /* open / close entry points */
++ add_shortcut("deinterlace")
++ add_bool(MMAL_DEINTERLACE_NO_QPU, false, MMAL_DEINTERLACE_NO_QPU_TEXT, /* boolean options, all declared false; read with var_InheritBool() in OpenMmalDeinterlace */
++ MMAL_DEINTERLACE_NO_QPU_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_ADV, false, MMAL_DEINTERLACE_ADV_TEXT,
++ MMAL_DEINTERLACE_ADV_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_FAST, false, MMAL_DEINTERLACE_FAST_TEXT,
++ MMAL_DEINTERLACE_FAST_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_NONE, false, MMAL_DEINTERLACE_NONE_TEXT,
++ MMAL_DEINTERLACE_NONE_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_HALF_RATE, false, MMAL_DEINTERLACE_HALF_RATE_TEXT,
++ MMAL_DEINTERLACE_HALF_RATE_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_FULL_RATE, false, MMAL_DEINTERLACE_FULL_RATE_TEXT,
++ MMAL_DEINTERLACE_FULL_RATE_LONGTEXT, true);
++
++vlc_module_end()
++
+
+- if (buffer->cmd == 0) {
+- if (buffer->length > 0) {
+- atomic_store(&sys->started, true);
+- mmal_queue_put(sys->filtered_pictures, buffer);
+- picture = (picture_t *)buffer->user_data;
+- } else {
+- picture = (picture_t *)buffer->user_data;
+- picture_Release(picture);
+- }
+-
+- atomic_fetch_sub(&sys->output_in_transit, 1);
+- vlc_sem_post(&sys->sem);
+- } else if (buffer->cmd == MMAL_EVENT_FORMAT_CHANGED) {
+- msg_Warn(filter, "MMAL_EVENT_FORMAT_CHANGED seen but not handled");
+- mmal_buffer_header_release(buffer);
+- } else {
+- mmal_buffer_header_release(buffer);
+- }
+-}
+--- /dev/null
++++ b/modules/hw/mmal/mmal_avcodec.c
+@@ -0,0 +1,2175 @@
++/*****************************************************************************
++ * mmal_avcodec.c: video decoder using the libavcodec library
++ *****************************************************************************
++ * Copyright (C) 1999-2001 VLC authors and VideoLAN
++ * $Id$
++ *
++ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
++ * Gildas Bazin <gbazin@videolan.org>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU Lesser General Public License as published by
++ * the Free Software Foundation; either version 2.1 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public License
++ * along with this program; if not, write to the Free Software Foundation,
++ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
++ *****************************************************************************/
++
++/*****************************************************************************
++ * Preamble
++ *****************************************************************************/
++#include "config.h"
++
++#include <vlc_common.h>
++#include <vlc_codec.h>
++#include <vlc_avcodec.h>
++#include <vlc_cpu.h>
++#include <vlc_atomic.h>
++#include <assert.h>
++
++#include <libavcodec/avcodec.h>
++#include <libavutil/mem.h>
++#include <libavutil/pixdesc.h>
++#if (LIBAVUTIL_VERSION_MICRO >= 100 && LIBAVUTIL_VERSION_INT >= AV_VERSION_INT( 55, 16, 101 ) )
++#include <libavutil/mastering_display_metadata.h>
++#endif
++
++//#include "avcodec.h"
++//#include "va.h"
++
++#include <vlc_plugin.h>
++#include <libavutil/rpi_sand_fns.h>
++#include <libavcodec/rpi_zc.h>
++#include "../../codec/cc.h"
++#include "../../codec/avcodec/avcommon.h" // ??? Beware over inclusion
++#include "mmal_cma.h"
++#include "mmal_picture.h"
++
++#define TRACE_ALL 0
++
++#define BUFFERS_IN_FLIGHT 5 // Default max value for in flight buffers
++#define BUFFERS_IN_FLIGHT_UHD 3 // Fewer if very big
++
++#define MMAL_AVCODEC_BUFFERS "mmal-avcodec-buffers"
++#define MMAL_AVCODEC_BUFFERS_TEXT N_("In flight buffer count before blocking.")
++#define MMAL_AVCODEC_BUFFERS_LONGTEXT N_("In flight buffer count before blocking. " \
++"Beware that incautious changing of this can lead to lockup. " \
++"Zero will disable the module.")
++
++
++// Fwd declarations required due to wanting to avoid reworking the original
++// code too much
++static void MmalAvcodecCloseDecoder( vlc_object_t *obj );
++
++
++/*****************************************************************************
++ * decoder_sys_t : decoder descriptor
++ *****************************************************************************/
++struct decoder_sys_t
++{
++ AVCodecContext *p_context; /* libavcodec context; zc_alloc_buf() queries its ->opaque for the decoder pool size */
++ const AVCodec *p_codec;
++
++ /* Video decoder specific part */
++ date_t pts;
++
++ /* Closed captions for decoders */
++ cc_data_t cc;
++
++ /* for frame skipping algo */
++ bool b_hurry_up;
++ bool b_show_corrupted;
++ bool b_from_preroll;
++ enum AVDiscard i_skip_frame;
++
++ /* how many decoded frames are late */
++ int i_late_frames;
++ mtime_t i_late_frames_start;
++ mtime_t i_last_late_delay;
++
++ /* for direct rendering */
++ bool b_direct_rendering;
++ atomic_bool b_dr_failure;
++
++ /* Hack to force display of still pictures */
++ bool b_first_frame;
++
++
++ /* */
++ bool palette_sent;
++
++ /* VA API */
++// vlc_va_t *p_va;
++ enum PixelFormat pix_fmt;
++ int profile;
++ int level;
++
++ vlc_sem_t sem_mt;
++
++ // Rpi vars
++ cma_buf_pool_t * cma_pool; // pool that zc_alloc_buf() takes decoder buffers from
++ bool pool_alloc_1; // set once zc_alloc_buf() has attempted the one-off pool pre-fill
++ vcsm_init_type_t vcsm_init_type; // NOTE(review): presumably the result of cma_vcsm_init() - confirm
++ int cma_in_flight_max; // buffers added on top of the decoder's own requirement when the pool is resized
++ // Debug
++ decoder_t * p_dec;
++};
++
++
++static vlc_fourcc_t
++ZcFindVlcChroma(const int i_ffmpeg_chroma)
++{
++    // Translate an ffmpeg pixel format into the matching MMAL zero-copy VLC chroma.
++    // Only the formats tested below are claimed; anything else gives 0.
++    // In theory RGB should be doable within our current framework.
++    if (i_ffmpeg_chroma == AV_PIX_FMT_YUV420P)
++        return VLC_CODEC_MMAL_ZC_I420;
++
++    if (i_ffmpeg_chroma == AV_PIX_FMT_SAND128 ||
++        i_ffmpeg_chroma == AV_PIX_FMT_RPI4_8)
++        return VLC_CODEC_MMAL_ZC_SAND8;
++
++    if (i_ffmpeg_chroma == AV_PIX_FMT_SAND64_10)
++        return VLC_CODEC_MMAL_ZC_SAND10;
++
++    if (i_ffmpeg_chroma == AV_PIX_FMT_RPI4_10)
++        return VLC_CODEC_MMAL_ZC_SAND30;
++
++    return 0;
++}
++
++// Pixel format conversion for MMAL:
++// fills in a video_format_t from an ffmpeg pix_fmt (0 on success, -1 if unsupported)
++static int
++ZcGetVlcChroma( video_format_t *fmt, int i_ffmpeg_chroma )
++{
++    const vlc_fourcc_t chroma = ZcFindVlcChroma(i_ffmpeg_chroma);
++
++    fmt->i_rmask = fmt->i_gmask = fmt->i_bmask = 0;
++    fmt->i_chroma = chroma;
++
++    return chroma != 0 ? 0 : -1;
++}
++
++
++// Format chooser - way simpler than the one in vlc: first offered format with a zero-copy chroma wins
++static enum PixelFormat
++ZcGetFormat(AVCodecContext *p_context, const enum PixelFormat *pi_fmt)
++{
++    // Default choice, returned when nothing offered maps to a chroma we handle
++    const enum PixelFormat fallback = avcodec_default_get_format(p_context, pi_fmt);
++
++    for (const enum PixelFormat *p = pi_fmt; *p != AV_PIX_FMT_NONE; ++p)
++        if (ZcFindVlcChroma(*p) != 0)
++            return *p;
++    return fallback;
++}
++
++
++static void cma_avbuf_pool_free(void * v)
++{
++ cma_buf_unref(v); // .free hook: drop the reference on the cma buf passed as v
++}
++
++static unsigned int zc_buf_vcsm_handle(void * v)
++{
++ return cma_buf_vcsm_handle(v); // .vcsm_handle hook: forwards to the cma buf helper
++}
++
++static unsigned int zc_buf_vc_handle(void * v)
++{
++ return cma_buf_vc_handle(v); // .vc_handle hook: forwards to the cma buf helper
++}
++
++static void * zc_buf_map_arm(void * v)
++{
++ return cma_buf_addr(v); // .map_arm hook: address as returned by cma_buf_addr()
++}
++
++static unsigned int zc_buf_map_vc(void * v)
++{
++ return cma_buf_vc_addr(v); // .map_vc hook: address as returned by cma_buf_vc_addr()
++}
++
++// Hook table passed to av_rpi_zc_buf() by zc_alloc_buf() together with the cma buf;
++// NOTE(review): presumably that same cma buf is what arrives as v in the hooks above - confirm
++static const av_rpi_zc_buf_fn_tab_t zc_buf_fn_tab = {
++ .free = cma_avbuf_pool_free,
++
++ .vcsm_handle = zc_buf_vcsm_handle,
++ .vc_handle = zc_buf_vc_handle,
++ .map_arm = zc_buf_map_arm,
++ .map_vc = zc_buf_map_vc
++};
++
++// Allocate one zero-copy frame buffer for the decoder v: an AVBufferRef wrapping a buffer from its CMA pool, NULL on failure
++static AVBufferRef *
++zc_alloc_buf(void * v, size_t size, const AVRpiZcFrameGeometry * geo)
++{
++    decoder_t * const dec = v;
++    decoder_sys_t * const sys = dec->p_sys;
++
++    VLC_UNUSED(geo);
++
++    assert(sys != NULL);
++    // Once the decoder reports its own pool requirement, size the pool to that plus our in-flight allowance
++    const unsigned int dec_pool_req = av_rpi_zc_get_decoder_pool_size(sys->p_context->opaque);
++    if (dec_pool_req != 0)
++    {
++        cma_buf_pool_resize(sys->cma_pool, dec_pool_req + sys->cma_in_flight_max, sys->cma_in_flight_max);
++        // Pre-fill is attempted once only; failure is just a warning (dec_pool_req is unsigned -> %u, size is size_t -> %zu)
++        if (!sys->pool_alloc_1)
++        {
++            sys->pool_alloc_1 = true;
++            msg_Dbg(dec, "Pool size: (%u+%d) * %zu", dec_pool_req, sys->cma_in_flight_max, size);
++            if (cma_buf_pool_fill(sys->cma_pool, size) != 0)
++                msg_Warn(dec, "Failed to preallocate decoder pool (%u+%d) * %zu", dec_pool_req, sys->cma_in_flight_max, size);
++        }
++    }
++
++    void * const cmabuf = cma_buf_pool_alloc_buf(sys->cma_pool, size);
++
++    if (cmabuf == NULL)
++    {
++        msg_Err(dec, "CMA buf pool alloc buf failed");
++        return NULL;
++    }
++    // Wrap the CMA buffer for ffmpeg; zc_buf_fn_tab provides its free / handle / map hooks
++    AVBufferRef *const avbuf = av_rpi_zc_buf(cma_buf_size(cmabuf), 0, cmabuf, &zc_buf_fn_tab);
++
++    if (avbuf == NULL)
++    {
++        msg_Err(dec, "av_rpi_zc_buf failed");
++        cma_buf_unref(cmabuf);  // wrapping failed, so release the buffer ourselves
++        return NULL;
++    }
++
++    return avbuf;
++}
++
++static void
++zc_free_pool(void * v)
++{
++    // v is the decoder; delete the CMA pool held in its private state
++    cma_buf_pool_delete(((decoder_t *)v)->p_sys->cma_pool);
++}
++
++
++static const uint8_t shift_01[] = {0,1,1,1};  // per-plane right shift applied to the cropped frame size
++static const uint8_t pb_1[] = {1,1,1,1};      // pb_*: per-plane i_pixel_pitch values
++static const uint8_t pb_12[] = {1,2,2,2};
++static const uint8_t pb_24[] = {2,4,4,4};
++static const uint8_t pb_4[] = {4,4,4,4};
++
++static int set_pic_from_frame(picture_t * const pic, const AVFrame * const frame)
++{
++    // Describe the planes of pic from frame's layout, with pixels addressed via the cma buf attached to pic.
++    // Returns 0, VLC_EGENERIC for an unhandled chroma, or VLC_ENOMEM if cma_buf_addr() gives NULL.
++    const uint8_t * const v_shift = shift_01;
++    const uint8_t * const h_shift = shift_01;
++    const uint8_t * pixel_pitch = pb_1;
++    const vlc_fourcc_t chroma = pic->format.i_chroma;
++
++    if (chroma == VLC_CODEC_MMAL_ZC_RGB32) {
++        pic->i_planes = 1;
++        pixel_pitch = pb_4;
++    }
++    else if (chroma == VLC_CODEC_MMAL_ZC_I420) {
++        pic->i_planes = 3;
++    }
++    else if (chroma == VLC_CODEC_MMAL_ZC_SAND8) {
++        pic->i_planes = 2;
++        pixel_pitch = pb_12;
++    }
++    else if (chroma == VLC_CODEC_MMAL_ZC_SAND10 ||
++             chroma == VLC_CODEC_MMAL_ZC_SAND30) { // Lies: SAND30 is "special"
++        pic->i_planes = 2;
++        pixel_pitch = pb_24;
++    }
++    else
++        return VLC_EGENERIC;
++
++    const cma_buf_t * const cb = cma_buf_pic_get(pic);
++    uint8_t * const data = cma_buf_addr(cb);
++    if (data == NULL)
++        return VLC_ENOMEM;
++
++    const uint8_t * const frame_start = frame->data[0];
++    const uint8_t * const frame_end = frame_start + cma_buf_size(cb);
++    const int last = pic->i_planes - 1;
++    for (int i = 0; i <= last; ++i) {
++        // Line count comes from the gap to the next plane (or to the end of
++        // the buffer): this gives an accurate "height" for later use by MMAL
++        const uint8_t * const next = (i == last) ? frame_end : frame->data[i + 1];
++        pic->p[i] = (plane_t){
++            .p_pixels = data + (frame->data[i] - frame_start),
++            .i_lines = (next - frame->data[i]) / frame->linesize[i],
++            .i_pitch = frame->linesize[i],
++            .i_pixel_pitch = pixel_pitch[i],
++            .i_visible_lines = av_frame_cropped_height(frame) >> v_shift[i],
++            .i_visible_pitch = av_frame_cropped_width(frame) >> h_shift[i]
++        };
++    }
++    return 0;
++}
++
++
++//============================================================================
++//
++// Nicked from avcodec/fourcc.c
++//
++// * Really we should probably use that directly
++
++/*
++ * Video Codecs
++ */
++
++struct vlc_avcodec_fourcc /* one row of the fourcc -> codec id table */
++{
++ vlc_fourcc_t i_fourcc; /* VLC codec fourcc (lookup key in ZcGetFfmpegCodec) */
++ unsigned i_codec; /* libavcodec AV_CODEC_ID_* value returned for that fourcc */
++};
++
++// Forward lookup table for ZcGetFfmpegCodec(): VLC fourcc -> libavcodec codec id (first matching fourcc wins)
++static const struct vlc_avcodec_fourcc video_codecs[] =
++{
++ { VLC_CODEC_MP1V, AV_CODEC_ID_MPEG1VIDEO },
++ { VLC_CODEC_MP2V, AV_CODEC_ID_MPEG2VIDEO }, /* prefer MPEG2 over MPEG1 */
++ { VLC_CODEC_MPGV, AV_CODEC_ID_MPEG2VIDEO }, /* prefer MPEG2 over MPEG1 */
++ /* AV_CODEC_ID_MPEG2VIDEO_XVMC */
++ { VLC_CODEC_H261, AV_CODEC_ID_H261 },
++ { VLC_CODEC_H263, AV_CODEC_ID_H263 },
++ { VLC_CODEC_RV10, AV_CODEC_ID_RV10 },
++ { VLC_CODEC_RV13, AV_CODEC_ID_RV10 },
++ { VLC_CODEC_RV20, AV_CODEC_ID_RV20 },
++ { VLC_CODEC_MJPG, AV_CODEC_ID_MJPEG },
++ { VLC_CODEC_MJPGB, AV_CODEC_ID_MJPEGB },
++ { VLC_CODEC_LJPG, AV_CODEC_ID_LJPEG },
++ { VLC_CODEC_SP5X, AV_CODEC_ID_SP5X },
++ { VLC_CODEC_JPEGLS, AV_CODEC_ID_JPEGLS },
++ { VLC_CODEC_MP4V, AV_CODEC_ID_MPEG4 },
++ /* AV_CODEC_ID_RAWVIDEO */
++ { VLC_CODEC_DIV1, AV_CODEC_ID_MSMPEG4V1 },
++ { VLC_CODEC_DIV2, AV_CODEC_ID_MSMPEG4V2 },
++ { VLC_CODEC_DIV3, AV_CODEC_ID_MSMPEG4V3 },
++ { VLC_CODEC_WMV1, AV_CODEC_ID_WMV1 },
++ { VLC_CODEC_WMV2, AV_CODEC_ID_WMV2 },
++ { VLC_CODEC_H263P, AV_CODEC_ID_H263P },
++ { VLC_CODEC_H263I, AV_CODEC_ID_H263I },
++ { VLC_CODEC_FLV1, AV_CODEC_ID_FLV1 },
++ { VLC_CODEC_SVQ1, AV_CODEC_ID_SVQ1 },
++ { VLC_CODEC_SVQ3, AV_CODEC_ID_SVQ3 },
++ { VLC_CODEC_DV, AV_CODEC_ID_DVVIDEO },
++ { VLC_CODEC_HUFFYUV, AV_CODEC_ID_HUFFYUV },
++ { VLC_CODEC_CYUV, AV_CODEC_ID_CYUV },
++ { VLC_CODEC_H264, AV_CODEC_ID_H264 },
++ { VLC_CODEC_INDEO3, AV_CODEC_ID_INDEO3 },
++ { VLC_CODEC_VP3, AV_CODEC_ID_VP3 },
++ { VLC_CODEC_THEORA, AV_CODEC_ID_THEORA },
++#if ( !defined( WORDS_BIGENDIAN ) )
++ /* Asus Video (Another thing that doesn't work on PPC) */
++ { VLC_CODEC_ASV1, AV_CODEC_ID_ASV1 },
++ { VLC_CODEC_ASV2, AV_CODEC_ID_ASV2 },
++#endif
++ { VLC_CODEC_FFV1, AV_CODEC_ID_FFV1 },
++ { VLC_CODEC_4XM, AV_CODEC_ID_4XM },
++ { VLC_CODEC_VCR1, AV_CODEC_ID_VCR1 },
++ { VLC_CODEC_CLJR, AV_CODEC_ID_CLJR },
++ { VLC_CODEC_MDEC, AV_CODEC_ID_MDEC },
++ { VLC_CODEC_ROQ, AV_CODEC_ID_ROQ },
++ { VLC_CODEC_INTERPLAY, AV_CODEC_ID_INTERPLAY_VIDEO },
++ { VLC_CODEC_XAN_WC3, AV_CODEC_ID_XAN_WC3 },
++ { VLC_CODEC_XAN_WC4, AV_CODEC_ID_XAN_WC4 },
++ { VLC_CODEC_RPZA, AV_CODEC_ID_RPZA },
++ { VLC_CODEC_CINEPAK, AV_CODEC_ID_CINEPAK },
++ { VLC_CODEC_WS_VQA, AV_CODEC_ID_WS_VQA },
++ { VLC_CODEC_MSRLE, AV_CODEC_ID_MSRLE },
++ { VLC_CODEC_MSVIDEO1, AV_CODEC_ID_MSVIDEO1 },
++ { VLC_CODEC_IDCIN, AV_CODEC_ID_IDCIN },
++ { VLC_CODEC_8BPS, AV_CODEC_ID_8BPS },
++ { VLC_CODEC_SMC, AV_CODEC_ID_SMC },
++ { VLC_CODEC_FLIC, AV_CODEC_ID_FLIC },
++ { VLC_CODEC_TRUEMOTION1, AV_CODEC_ID_TRUEMOTION1 },
++ { VLC_CODEC_VMDVIDEO, AV_CODEC_ID_VMDVIDEO },
++ { VLC_CODEC_LCL_MSZH, AV_CODEC_ID_MSZH },
++ { VLC_CODEC_LCL_ZLIB, AV_CODEC_ID_ZLIB },
++ { VLC_CODEC_QTRLE, AV_CODEC_ID_QTRLE },
++ { VLC_CODEC_TSCC, AV_CODEC_ID_TSCC },
++ { VLC_CODEC_ULTI, AV_CODEC_ID_ULTI },
++ { VLC_CODEC_QDRAW, AV_CODEC_ID_QDRAW },
++ { VLC_CODEC_VIXL, AV_CODEC_ID_VIXL },
++ { VLC_CODEC_QPEG, AV_CODEC_ID_QPEG },
++ { VLC_CODEC_PNG, AV_CODEC_ID_PNG },
++ { VLC_CODEC_PPM, AV_CODEC_ID_PPM },
++ /* AV_CODEC_ID_PBM */
++ { VLC_CODEC_PGM, AV_CODEC_ID_PGM },
++ { VLC_CODEC_PGMYUV, AV_CODEC_ID_PGMYUV },
++ { VLC_CODEC_PAM, AV_CODEC_ID_PAM },
++ { VLC_CODEC_FFVHUFF, AV_CODEC_ID_FFVHUFF },
++ { VLC_CODEC_RV30, AV_CODEC_ID_RV30 },
++ { VLC_CODEC_RV40, AV_CODEC_ID_RV40 },
++ { VLC_CODEC_VC1, AV_CODEC_ID_VC1 },
++ { VLC_CODEC_WMVA, AV_CODEC_ID_VC1 },
++ { VLC_CODEC_WMV3, AV_CODEC_ID_WMV3 },
++ { VLC_CODEC_WMVP, AV_CODEC_ID_WMV3 },
++ { VLC_CODEC_LOCO, AV_CODEC_ID_LOCO },
++ { VLC_CODEC_WNV1, AV_CODEC_ID_WNV1 },
++ { VLC_CODEC_AASC, AV_CODEC_ID_AASC },
++ { VLC_CODEC_INDEO2, AV_CODEC_ID_INDEO2 },
++ { VLC_CODEC_FRAPS, AV_CODEC_ID_FRAPS },
++ { VLC_CODEC_TRUEMOTION2, AV_CODEC_ID_TRUEMOTION2 },
++ { VLC_CODEC_BMP, AV_CODEC_ID_BMP },
++ { VLC_CODEC_CSCD, AV_CODEC_ID_CSCD },
++ { VLC_CODEC_MMVIDEO, AV_CODEC_ID_MMVIDEO },
++ { VLC_CODEC_ZMBV, AV_CODEC_ID_ZMBV },
++ { VLC_CODEC_AVS, AV_CODEC_ID_AVS },
++ { VLC_CODEC_SMACKVIDEO, AV_CODEC_ID_SMACKVIDEO },
++ { VLC_CODEC_NUV, AV_CODEC_ID_NUV },
++ { VLC_CODEC_KMVC, AV_CODEC_ID_KMVC },
++ { VLC_CODEC_FLASHSV, AV_CODEC_ID_FLASHSV },
++ { VLC_CODEC_CAVS, AV_CODEC_ID_CAVS },
++ { VLC_CODEC_JPEG2000, AV_CODEC_ID_JPEG2000 },
++ { VLC_CODEC_VMNC, AV_CODEC_ID_VMNC },
++ { VLC_CODEC_VP5, AV_CODEC_ID_VP5 },
++ { VLC_CODEC_VP6, AV_CODEC_ID_VP6 },
++ { VLC_CODEC_VP6F, AV_CODEC_ID_VP6F },
++ { VLC_CODEC_TARGA, AV_CODEC_ID_TARGA },
++ { VLC_CODEC_DSICINVIDEO, AV_CODEC_ID_DSICINVIDEO },
++ { VLC_CODEC_TIERTEXSEQVIDEO, AV_CODEC_ID_TIERTEXSEQVIDEO },
++ { VLC_CODEC_TIFF, AV_CODEC_ID_TIFF },
++ { VLC_CODEC_GIF, AV_CODEC_ID_GIF },
++ { VLC_CODEC_DXA, AV_CODEC_ID_DXA },
++ { VLC_CODEC_DNXHD, AV_CODEC_ID_DNXHD },
++ { VLC_CODEC_THP, AV_CODEC_ID_THP },
++ { VLC_CODEC_SGI, AV_CODEC_ID_SGI },
++ { VLC_CODEC_C93, AV_CODEC_ID_C93 },
++ { VLC_CODEC_BETHSOFTVID, AV_CODEC_ID_BETHSOFTVID },
++ /* AV_CODEC_ID_PTX */
++ { VLC_CODEC_TXD, AV_CODEC_ID_TXD },
++ { VLC_CODEC_VP6A, AV_CODEC_ID_VP6A },
++ { VLC_CODEC_AMV, AV_CODEC_ID_AMV },
++ { VLC_CODEC_VB, AV_CODEC_ID_VB },
++ { VLC_CODEC_PCX, AV_CODEC_ID_PCX },
++ /* AV_CODEC_ID_SUNRAST */
++ { VLC_CODEC_INDEO4, AV_CODEC_ID_INDEO4 },
++ { VLC_CODEC_INDEO5, AV_CODEC_ID_INDEO5 },
++ { VLC_CODEC_MIMIC, AV_CODEC_ID_MIMIC },
++ { VLC_CODEC_RL2, AV_CODEC_ID_RL2 },
++ { VLC_CODEC_ESCAPE124, AV_CODEC_ID_ESCAPE124 },
++ { VLC_CODEC_DIRAC, AV_CODEC_ID_DIRAC },
++ { VLC_CODEC_BFI, AV_CODEC_ID_BFI },
++ { VLC_CODEC_CMV, AV_CODEC_ID_CMV },
++ { VLC_CODEC_MOTIONPIXELS, AV_CODEC_ID_MOTIONPIXELS },
++ { VLC_CODEC_TGV, AV_CODEC_ID_TGV },
++ { VLC_CODEC_TGQ, AV_CODEC_ID_TGQ },
++ { VLC_CODEC_TQI, AV_CODEC_ID_TQI },
++ { VLC_CODEC_AURA, AV_CODEC_ID_AURA },
++ /* AV_CODEC_ID_AURA2 */
++ /* AV_CODEC_ID_V210X */
++ { VLC_CODEC_TMV, AV_CODEC_ID_TMV },
++ { VLC_CODEC_V210, AV_CODEC_ID_V210 },
++#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT( 54, 50, 100 ) && LIBAVCODEC_VERSION_MICRO >= 100
++ { VLC_CODEC_VUYA, AV_CODEC_ID_AYUV },
++#endif
++ /* AV_CODEC_ID_DPX */
++ { VLC_CODEC_MAD, AV_CODEC_ID_MAD },
++ { VLC_CODEC_FRWU, AV_CODEC_ID_FRWU },
++ { VLC_CODEC_FLASHSV2, AV_CODEC_ID_FLASHSV2 },
++ /* AV_CODEC_ID_CDGRAPHICS */
++ /* AV_CODEC_ID_R210 */
++ { VLC_CODEC_ANM, AV_CODEC_ID_ANM },
++ { VLC_CODEC_BINKVIDEO, AV_CODEC_ID_BINKVIDEO },
++ /* AV_CODEC_ID_IFF_ILBM */
++ /* AV_CODEC_ID_IFF_BYTERUN1 */
++ { VLC_CODEC_KGV1, AV_CODEC_ID_KGV1 },
++ { VLC_CODEC_YOP, AV_CODEC_ID_YOP },
++ { VLC_CODEC_VP8, AV_CODEC_ID_VP8 },
++ /* AV_CODEC_ID_PICTOR */
++ /* AV_CODEC_ID_ANSI */
++ /* AV_CODEC_ID_A64_MULTI */
++ /* AV_CODEC_ID_A64_MULTI5 */
++ /* AV_CODEC_ID_R10K */
++ { VLC_CODEC_MXPEG, AV_CODEC_ID_MXPEG },
++ { VLC_CODEC_LAGARITH, AV_CODEC_ID_LAGARITH },
++ { VLC_CODEC_PRORES, AV_CODEC_ID_PRORES },
++ { VLC_CODEC_JV, AV_CODEC_ID_JV },
++ { VLC_CODEC_DFA, AV_CODEC_ID_DFA },
++ { VLC_CODEC_WMVP, AV_CODEC_ID_WMV3IMAGE }, /* NOTE(review): VLC_CODEC_WMVP is already listed above (-> AV_CODEC_ID_WMV3); ZcGetFfmpegCodec returns the first match, so this row is never selected */
++ { VLC_CODEC_WMVP2, AV_CODEC_ID_VC1IMAGE },
++ { VLC_CODEC_UTVIDEO, AV_CODEC_ID_UTVIDEO },
++ { VLC_CODEC_BMVVIDEO, AV_CODEC_ID_BMV_VIDEO },
++ { VLC_CODEC_VBLE, AV_CODEC_ID_VBLE },
++ { VLC_CODEC_DXTORY, AV_CODEC_ID_DXTORY },
++ /* AV_CODEC_ID_V410 */
++ /* AV_CODEC_ID_XWD */
++ { VLC_CODEC_CDXL, AV_CODEC_ID_CDXL },
++ /* AV_CODEC_ID_XBM */
++ /* AV_CODEC_ID_ZEROCODEC */
++ { VLC_CODEC_MSS1, AV_CODEC_ID_MSS1 },
++ { VLC_CODEC_MSA1, AV_CODEC_ID_MSA1 },
++ { VLC_CODEC_TSC2, AV_CODEC_ID_TSCC2 },
++ { VLC_CODEC_MTS2, AV_CODEC_ID_MTS2 },
++ { VLC_CODEC_CLLC, AV_CODEC_ID_CLLC },
++ { VLC_CODEC_MSS2, AV_CODEC_ID_MSS2 },
++ { VLC_CODEC_VP9, AV_CODEC_ID_VP9 },
++#if LIBAVCODEC_VERSION_CHECK( 57, 26, 0, 83, 101 )
++ { VLC_CODEC_AV1, AV_CODEC_ID_AV1 },
++#endif
++ { VLC_CODEC_ICOD, AV_CODEC_ID_AIC },
++ /* AV_CODEC_ID_ESCAPE130 */
++ { VLC_CODEC_G2M4, AV_CODEC_ID_G2M },
++ { VLC_CODEC_G2M2, AV_CODEC_ID_G2M },
++ { VLC_CODEC_G2M3, AV_CODEC_ID_G2M },
++ /* AV_CODEC_ID_WEBP */
++ { VLC_CODEC_HNM4_VIDEO, AV_CODEC_ID_HNM4_VIDEO },
++ { VLC_CODEC_HEVC, AV_CODEC_ID_HEVC },
++
++ { VLC_CODEC_FIC , AV_CODEC_ID_FIC },
++ /* AV_CODEC_ID_ALIAS_PIX */
++ /* AV_CODEC_ID_BRENDER_PIX */
++ /* AV_CODEC_ID_PAF_VIDEO */
++ /* AV_CODEC_ID_EXR */
++
++ { VLC_CODEC_VP7 , AV_CODEC_ID_VP7 },
++ /* AV_CODEC_ID_SANM */
++ /* AV_CODEC_ID_SGIRLE */
++ /* AV_CODEC_ID_MVC1 */
++ /* AV_CODEC_ID_MVC2 */
++ { VLC_CODEC_HQX, AV_CODEC_ID_HQX },
++
++ { VLC_CODEC_TDSC, AV_CODEC_ID_TDSC },
++
++ { VLC_CODEC_HQ_HQA, AV_CODEC_ID_HQ_HQA },
++
++ { VLC_CODEC_HAP, AV_CODEC_ID_HAP },
++ /* AV_CODEC_ID_DDS */
++
++ { VLC_CODEC_DXV, AV_CODEC_ID_DXV },
++
++ /* ffmpeg only: AV_CODEC_ID_BRENDER_PIX */
++ /* ffmpeg only: AV_CODEC_ID_Y41P */
++ /* ffmpeg only: AV_CODEC_ID_EXR */
++ /* ffmpeg only: AV_CODEC_ID_AVRP */
++ /* ffmpeg only: AV_CODEC_ID_012V */
++ /* ffmpeg only: AV_CODEC_ID_AVUI */
++ /* ffmpeg only: AV_CODEC_ID_TARGA_Y216 */
++ /* ffmpeg only: AV_CODEC_ID_V308 */
++ /* ffmpeg only: AV_CODEC_ID_V408 */
++ /* ffmpeg only: AV_CODEC_ID_YUV4 */
++ /* ffmpeg only: AV_CODEC_ID_SANM */
++ /* ffmpeg only: AV_CODEC_ID_PAF_VIDEO */
++ /* ffmpeg only: AV_CODEC_ID_AVRN */
++ /* ffmpeg only: AV_CODEC_ID_CPIA */
++ /* ffmpeg only: AV_CODEC_ID_XFACE */
++ /* ffmpeg only: AV_CODEC_ID_SGIRLE */
++ /* ffmpeg only: AV_CODEC_ID_MVC1 */
++ /* ffmpeg only: AV_CODEC_ID_MVC2 */
++ /* ffmpeg only: AV_CODEC_ID_SNOW */
++ /* ffmpeg only: AV_CODEC_ID_SMVJPEG */
++
++#if LIBAVCODEC_VERSION_CHECK( 57, 999, 999, 24, 102 )
++ { VLC_CODEC_CINEFORM, AV_CODEC_ID_CFHD },
++#endif
++
++#if LIBAVCODEC_VERSION_CHECK( 57, 999, 999, 70, 100 )
++ { VLC_CODEC_PIXLET, AV_CODEC_ID_PIXLET },
++#endif
++
++#if LIBAVCODEC_VERSION_CHECK( 57, 999, 999, 71, 101 )
++ { VLC_CODEC_SPEEDHQ, AV_CODEC_ID_SPEEDHQ },
++#endif
++
++#if LIBAVCODEC_VERSION_CHECK( 57, 999, 999, 79, 100 )
++ { VLC_CODEC_FMVC, AV_CODEC_ID_FMVC },
++#endif
++};
++
++// *** Really we should probably use GetFfmpegCodec with a pre-kludge for the bits we care about
++static bool
++ZcGetFfmpegCodec( enum es_format_category_e cat, vlc_fourcc_t i_fourcc,
++                  unsigned *pi_ffmpeg_codec, const char **ppsz_name )
++{
++    // Look up the fourcc (after vlc_fourcc_GetCodec) in video_codecs. On a hit
++    // the codec id and a description are stored through whichever of the two
++    // out-pointers is non-NULL and true is returned; otherwise false.
++    const struct vlc_avcodec_fourcc *const end = video_codecs + ARRAY_SIZE(video_codecs);
++    const struct vlc_avcodec_fourcc *hit = video_codecs;
++    const vlc_fourcc_t wanted = vlc_fourcc_GetCodec( cat, i_fourcc );
++
++    // Linear scan; the first matching entry wins
++    while( hit != end && hit->i_fourcc != wanted )
++        ++hit;
++    if( hit == end )
++        return false;
++
++    if( pi_ffmpeg_codec != NULL )
++        *pi_ffmpeg_codec = hit->i_codec;
++    if( ppsz_name != NULL )
++        *ppsz_name = vlc_fourcc_GetDescription( cat, wanted );
++
++    return true;
++}
++
++
++
++//============================================================================
++// Derived from codec/avcodec/avcodec.c
++
++/*
++ * Choose a libavcodec decoder for p_dec's input codec and allocate a codec
++ * context for it.
++ *
++ * Selection order:
++ *  1. the decoder named by the "avcodec-codec" variable, if it exists and
++ *     its id matches the input codec;
++ *  2. for HEVC, the decoder named "hevc" when rpi_is_model_pi4() is true,
++ *     "hevc_rpi" otherwise;
++ *  3. for everything else, avcodec_find_decoder() on the mapped codec id.
++ *
++ * On success *codecp receives the chosen codec and the new context is
++ * returned with its debug level and opaque pointer (p_dec) set. Returns NULL
++ * if the codec is not in our table, no decoder is found or the context
++ * cannot be allocated (*codecp is already written in that last case).
++ */
++static AVCodecContext *
++ZcFfmpeg_AllocContext( decoder_t *p_dec,
++                       const AVCodec **restrict codecp )
++{
++    unsigned i_codec_id;
++    const char *psz_namecodec;
++
++    /* *** determine codec type *** */
++    if( !ZcGetFfmpegCodec( p_dec->fmt_in.i_cat, p_dec->fmt_in.i_codec,
++                           &i_codec_id, &psz_namecodec ) )
++        return NULL;
++
++    msg_Dbg( p_dec, "using %s %s", AVPROVIDER(LIBAVCODEC), LIBAVCODEC_IDENT );
++
++    /* Initialization must be done before avcodec_find_decoder() */
++    vlc_init_avcodec(VLC_OBJECT(p_dec));
++
++    /* *** user-forced decoder, if any *** */
++    const AVCodec *p_found = NULL;
++    char *psz_forced = var_InheritString( p_dec, "avcodec-codec" );
++    if( psz_forced != NULL )
++    {
++        p_found = avcodec_find_decoder_by_name( psz_forced );
++        if( p_found == NULL )
++            msg_Err( p_dec, "Decoder `%s' not found", psz_forced );
++        else if( p_found->id != i_codec_id )
++        {
++            msg_Err( p_dec, "Decoder `%s' can't handle %4.4s",
++                     psz_forced, (char*)&p_dec->fmt_in.i_codec );
++            p_found = NULL;
++        }
++        free( psz_forced );
++    }
++
++    /* *** otherwise ask ffmpeg for a decoder *** */
++    if( p_found == NULL )
++    {
++        if( p_dec->fmt_in.i_codec == VLC_CODEC_HEVC )
++        {
++            /* HEVC is looked up by name rather than by codec id */
++            if( rpi_is_model_pi4() )
++                psz_namecodec = "hevc";
++            else
++                psz_namecodec = "hevc_rpi";
++            msg_Info(p_dec, "Looking for HEVC decoder '%s'", psz_namecodec);
++            p_found = avcodec_find_decoder_by_name(psz_namecodec);
++        }
++        else
++            p_found = avcodec_find_decoder(i_codec_id);
++    }
++
++    if( p_found == NULL )
++    {
++        msg_Dbg( p_dec, "codec not found (%s)", psz_namecodec );
++        return NULL;
++    }
++
++    *codecp = p_found;
++
++    /* *** get a p_context *** */
++    AVCodecContext *avctx = avcodec_alloc_context3(p_found);
++    if( unlikely(avctx == NULL) )
++        return NULL;
++
++    avctx->debug = var_InheritInteger( p_dec, "avcodec-debug" );
++    avctx->opaque = p_dec;
++    return avctx;
++}
++
++/*****************************************************************************
++ * ZcFfmpeg_OpenCodec: attach the zero-copy allocator and open the codec
++ *****************************************************************************
++ * Parses the "avcodec-options" string into an AVDictionary, installs the
++ * zero-copy buffer callbacks on ctx with av_rpi_zc_init2() and opens the
++ * codec under the avcodec lock. Options left in the dictionary after
++ * avcodec_open2() are reported as unknown.
++ *
++ * Returns VLC_SUCCESS if the codec was opened, VLC_EGENERIC otherwise.
++ *****************************************************************************/
++
++static int
++ZcFfmpeg_OpenCodec( decoder_t *p_dec, AVCodecContext *ctx,
++                    const AVCodec *codec )
++{
++    char *psz_opts = var_InheritString( p_dec, "avcodec-options" );
++    AVDictionary *options = NULL;
++    int ret;
++
++    if (psz_opts) {
++        vlc_av_get_options(psz_opts, &options);
++        free(psz_opts);
++    }
++
++    if (av_rpi_zc_init2(ctx, p_dec, zc_alloc_buf, zc_free_pool) != 0)
++    {
++        msg_Err(p_dec, "Failed to init AV ZC");
++        /* The parsed options are owned by this function: release them on
++         * this early exit too, otherwise the dictionary is leaked */
++        av_dict_free(&options);
++        return VLC_EGENERIC;
++    }
++
++    vlc_avcodec_lock();
++    ret = avcodec_open2( ctx, codec, options ? &options : NULL );
++    vlc_avcodec_unlock();
++
++    /* Whatever is still in the dictionary was not consumed by the codec */
++    AVDictionaryEntry *t = NULL;
++    while ((t = av_dict_get(options, "", t, AV_DICT_IGNORE_SUFFIX))) {
++        msg_Err( p_dec, "Unknown option \"%s\"", t->key );
++    }
++    av_dict_free(&options);
++
++    if( ret < 0 )
++    {
++        msg_Err( p_dec, "cannot start codec (%s)", codec->name );
++        return VLC_EGENERIC;
++    }
++
++    msg_Dbg( p_dec, "codec (%s) started", codec->name );
++    return VLC_SUCCESS;
++}
++
++//============================================================================
++// Derived from 3.0.7.1 codec/avcodec/video.c
++
++/*
++ * Counterpart of post_mt(): would wait on sys->sem_mt, but the semaphore
++ * path is compiled out and the call is currently a no-op.
++ */
++static inline void wait_mt(decoder_sys_t *sys)
++{
++#if 0
++    vlc_sem_wait(&sys->sem_mt);
++#else
++    // As we only ever update the output in our main thread this lock is
++    // redundant
++    VLC_UNUSED(sys);
++#endif
++}
++
++/*
++ * Counterpart of wait_mt(): would post sys->sem_mt, but the semaphore path
++ * is compiled out and the call is currently a no-op.
++ */
++static inline void post_mt(decoder_sys_t *sys)
++{
++#if 0
++    vlc_sem_post(&sys->sem_mt);
++#else
++    // As we only ever update the output in our main thread this lock is
++    // redundant
++    VLC_UNUSED(sys);
++#endif
++}
++
++/*****************************************************************************
++ * Local prototypes
++ *****************************************************************************/
++static void ffmpeg_InitCodec ( decoder_t * );
++static int DecodeVideo( decoder_t *, block_t * );
++static void Flush( decoder_t * );
++
++/*
++ * Build the 32-bit codec tag for libavcodec from the bytes of a VLC fourcc:
++ * the first byte in memory becomes the least significant byte of the tag,
++ * whatever the host endianness.
++ */
++static uint32_t ffmpeg_CodecTag( vlc_fourcc_t fcc )
++{
++    const uint8_t *p = (const uint8_t *)&fcc;
++
++    /* Widen each byte before shifting: a bare uint8_t is promoted to signed
++     * int, and shifting a value >= 0x80 left by 24 overflows it (undefined
++     * behaviour) */
++    return  (uint32_t)p[0]
++         | ((uint32_t)p[1] << 8)
++         | ((uint32_t)p[2] << 16)
++         | ((uint32_t)p[3] << 24);
++}
++
++/*****************************************************************************
++ * Local Functions
++ *****************************************************************************/
++
++/**
++ * Sets the decoder output format.
++ *
++ * Initialises *fmt and fills it from the codec context:
++ *  - chroma from ZcFindVlcChroma(pix_fmt) (sw_pix_fmt is unused in the
++ *    active build);
++ *  - i_width/i_height from the coded size, visible size from
++ *    ctx->width/ctx->height;
++ *  - sample aspect ratio and frame rate from the input format when set
++ *    there, otherwise from ctx;
++ *  - colour range, colour space, transfer function, primaries and chroma
++ *    location mapped from the matching ctx fields (values without a
++ *    mapping leave the field as video_format_Init() set it, except the
++ *    colour range which then defaults to limited).
++ *
++ * Returns 0 on success, -1 if pix_fmt has no VLC chroma or the frame size
++ * is rejected.
++ */
++static int lavc_GetVideoFormat(decoder_t *dec, video_format_t *restrict fmt,
++ AVCodecContext *ctx, enum AVPixelFormat pix_fmt,
++ enum AVPixelFormat sw_pix_fmt)
++{
++ int width = ctx->coded_width;
++ int height = ctx->coded_height;
++
++ video_format_Init(fmt, 0);
++
++#if 1
++ VLC_UNUSED(sw_pix_fmt);
++ if ((fmt->i_chroma = ZcFindVlcChroma(pix_fmt)) == 0)
++ return -1;
++#else
++ if (pix_fmt == sw_pix_fmt)
++ { /* software decoding */
++ int aligns[AV_NUM_DATA_POINTERS];
++
++ if (GetVlcChroma(fmt, pix_fmt))
++ return -1;
++
++ /* The libavcodec palette can only be fetched when the first output
++ * frame is decoded. Assume that the current chroma is RGB32 while we
++ * are waiting for a valid palette. Indeed, fmt_out.video.p_palette
++ * doesn't trigger a new vout request, but a new chroma yes. */
++ if (pix_fmt == AV_PIX_FMT_PAL8 && !dec->fmt_out.video.p_palette)
++ fmt->i_chroma = VLC_CODEC_RGB32;
++
++ avcodec_align_dimensions2(ctx, &width, &height, aligns);
++ }
++ else /* hardware decoding */
++ fmt->i_chroma = vlc_va_GetChroma(pix_fmt, sw_pix_fmt);
++#endif
++ /* Reject empty or >8192 coded sizes, and coded sizes smaller than the visible size */
++ if( width == 0 || height == 0 || width > 8192 || height > 8192 ||
++ width < ctx->width || height < ctx->height )
++ {
++ msg_Err(dec, "Invalid frame size %dx%d vsz %dx%d",
++ width, height, ctx->width, ctx->height );
++ return -1; /* invalid display size */
++ }
++
++ fmt->i_width = width;
++ fmt->i_height = height;
++ fmt->i_visible_width = ctx->width;
++ fmt->i_visible_height = ctx->height;
++
++ /* If an aspect-ratio was specified in the input format then force it */
++ if (dec->fmt_in.video.i_sar_num > 0 && dec->fmt_in.video.i_sar_den > 0)
++ {
++ fmt->i_sar_num = dec->fmt_in.video.i_sar_num;
++ fmt->i_sar_den = dec->fmt_in.video.i_sar_den;
++ }
++ else
++ {
++ fmt->i_sar_num = ctx->sample_aspect_ratio.num;
++ fmt->i_sar_den = ctx->sample_aspect_ratio.den;
++
++ if (fmt->i_sar_num == 0 || fmt->i_sar_den == 0)
++ fmt->i_sar_num = fmt->i_sar_den = 1;
++ }
++ /* Frame rate: input format first, then ctx->framerate, then ctx->time_base */
++ if (dec->fmt_in.video.i_frame_rate > 0
++ && dec->fmt_in.video.i_frame_rate_base > 0)
++ {
++ fmt->i_frame_rate = dec->fmt_in.video.i_frame_rate;
++ fmt->i_frame_rate_base = dec->fmt_in.video.i_frame_rate_base;
++ }
++ else if (ctx->framerate.num > 0 && ctx->framerate.den > 0)
++ {
++ fmt->i_frame_rate = ctx->framerate.num;
++ fmt->i_frame_rate_base = ctx->framerate.den;
++# if LIBAVCODEC_VERSION_MICRO < 100
++ // for some reason libav doesn't treat framerate as the same thing ffmpeg does
++ fmt->i_frame_rate_base *= __MAX(ctx->ticks_per_frame, 1);
++# endif
++ }
++ else if (ctx->time_base.num > 0 && ctx->time_base.den > 0)
++ {
++ fmt->i_frame_rate = ctx->time_base.den;
++ fmt->i_frame_rate_base = ctx->time_base.num
++ * __MAX(ctx->ticks_per_frame, 1);
++ }
++
++ /* FIXME we should only set the known values and let the core decide
++ * later of fallbacks, but we can't do that with a boolean */
++ switch ( ctx->color_range )
++ {
++ case AVCOL_RANGE_JPEG:
++ fmt->b_color_range_full = true;
++ break;
++ case AVCOL_RANGE_UNSPECIFIED:
++ fmt->b_color_range_full = !vlc_fourcc_IsYUV( fmt->i_chroma );
++ break;
++ case AVCOL_RANGE_MPEG:
++ default:
++ fmt->b_color_range_full = false;
++ break;
++ }
++
++ switch( ctx->colorspace )
++ {
++ case AVCOL_SPC_BT709:
++ fmt->space = COLOR_SPACE_BT709;
++ break;
++ case AVCOL_SPC_SMPTE170M:
++ case AVCOL_SPC_BT470BG:
++ fmt->space = COLOR_SPACE_BT601;
++ break;
++ case AVCOL_SPC_BT2020_NCL:
++ case AVCOL_SPC_BT2020_CL:
++ fmt->space = COLOR_SPACE_BT2020;
++ break;
++ default:
++ break;
++ }
++
++ switch( ctx->color_trc )
++ {
++ case AVCOL_TRC_LINEAR:
++ fmt->transfer = TRANSFER_FUNC_LINEAR;
++ break;
++ case AVCOL_TRC_GAMMA22:
++ fmt->transfer = TRANSFER_FUNC_SRGB;
++ break;
++ case AVCOL_TRC_BT709:
++ fmt->transfer = TRANSFER_FUNC_BT709;
++ break;
++ case AVCOL_TRC_SMPTE170M:
++ case AVCOL_TRC_BT2020_10:
++ case AVCOL_TRC_BT2020_12:
++ fmt->transfer = TRANSFER_FUNC_BT2020;
++ break;
++#if LIBAVUTIL_VERSION_CHECK( 55, 14, 0, 31, 100)
++ case AVCOL_TRC_ARIB_STD_B67:
++ fmt->transfer = TRANSFER_FUNC_ARIB_B67;
++ break;
++#endif
++#if LIBAVUTIL_VERSION_CHECK( 55, 17, 0, 37, 100)
++ case AVCOL_TRC_SMPTE2084:
++ fmt->transfer = TRANSFER_FUNC_SMPTE_ST2084;
++ break;
++ case AVCOL_TRC_SMPTE240M:
++ fmt->transfer = TRANSFER_FUNC_SMPTE_240;
++ break;
++ case AVCOL_TRC_GAMMA28:
++ fmt->transfer = TRANSFER_FUNC_BT470_BG;
++ break;
++#endif
++ default:
++ break;
++ }
++
++ switch( ctx->color_primaries )
++ {
++ case AVCOL_PRI_BT709:
++ fmt->primaries = COLOR_PRIMARIES_BT709;
++ break;
++ case AVCOL_PRI_BT470BG:
++ fmt->primaries = COLOR_PRIMARIES_BT601_625;
++ break;
++ case AVCOL_PRI_SMPTE170M:
++ case AVCOL_PRI_SMPTE240M:
++ fmt->primaries = COLOR_PRIMARIES_BT601_525;
++ break;
++ case AVCOL_PRI_BT2020:
++ fmt->primaries = COLOR_PRIMARIES_BT2020;
++ break;
++ default:
++ break;
++ }
++
++ switch( ctx->chroma_sample_location )
++ {
++ case AVCHROMA_LOC_LEFT:
++ fmt->chroma_location = CHROMA_LOCATION_LEFT;
++ break;
++ case AVCHROMA_LOC_CENTER:
++ fmt->chroma_location = CHROMA_LOCATION_CENTER;
++ break;
++ case AVCHROMA_LOC_TOPLEFT:
++ fmt->chroma_location = CHROMA_LOCATION_TOP_LEFT;
++ break;
++ default:
++ break;
++ }
++
++ return 0;
++}
++
++/*
++ * Recompute the output video format from the codec context and publish it
++ * with decoder_UpdateVideoFormat().
++ *
++ * The PTS date generator is re-based on the new frame rate (in ticks), the
++ * existing output palette is carried over, and orientation, projection,
++ * multiview, pose, mastering (only if the input has a max luminance) and
++ * lighting are copied from the input format.
++ *
++ * Returns the lavc_GetVideoFormat() error if the format cannot be derived,
++ * otherwise the result of decoder_UpdateVideoFormat().
++ */
++static int lavc_UpdateVideoFormat(decoder_t *dec, AVCodecContext *ctx,
++                                  enum AVPixelFormat fmt,
++                                  enum AVPixelFormat swfmt)
++{
++    video_format_t new_fmt;
++#if TRACE_ALL
++    msg_Dbg(dec, "<<< %s", __func__);
++#endif
++    int rv = lavc_GetVideoFormat(dec, &new_fmt, ctx, fmt, swfmt);
++    if (rv != 0)
++    {
++        msg_Dbg(dec, "Failed to get format");
++        return rv;
++    }
++
++    /* always have date in fields/ticks units */
++    const unsigned tick_rate =
++        new_fmt.i_frame_rate * __MAX(ctx->ticks_per_frame, 1);
++    if (dec->p_sys->pts.i_divider_num == 0)
++        date_Init(&dec->p_sys->pts, tick_rate, new_fmt.i_frame_rate_base);
++    else
++        date_Change(&dec->p_sys->pts, tick_rate, new_fmt.i_frame_rate_base);
++
++    /* Move the palette into the new format so that es_format_Change()
++     * does not see it on the old one */
++    new_fmt.p_palette = dec->fmt_out.video.p_palette;
++    dec->fmt_out.video.p_palette = NULL;
++
++    es_format_Change(&dec->fmt_out, VIDEO_ES, new_fmt.i_chroma);
++    dec->fmt_out.video = new_fmt;
++
++    /* Properties libavcodec knows nothing about come from the input */
++    dec->fmt_out.video.orientation     = dec->fmt_in.video.orientation;
++    dec->fmt_out.video.projection_mode = dec->fmt_in.video.projection_mode;
++    dec->fmt_out.video.multiview_mode  = dec->fmt_in.video.multiview_mode;
++    dec->fmt_out.video.pose            = dec->fmt_in.video.pose;
++    if (dec->fmt_in.video.mastering.max_luminance)
++        dec->fmt_out.video.mastering = dec->fmt_in.video.mastering;
++    dec->fmt_out.video.lighting        = dec->fmt_in.video.lighting;
++
++    rv = decoder_UpdateVideoFormat(dec);
++#if TRACE_ALL
++    msg_Dbg(dec, ">>> %s: rv=%d", __func__, rv);
++#endif
++    return rv;
++}
++
++/*
++ * Configure the codec context from the input format and open the codec.
++ *
++ * Returns 1 without opening anything when a VC1 or Theora stream has no
++ * extradata yet, 0 once the codec is open, and the (negative) error from
++ * ZcFfmpeg_OpenCodec() otherwise.
++ */
++static int OpenVideoCodec( decoder_t *p_dec )
++{
++    decoder_sys_t *p_sys = p_dec->p_sys;
++    AVCodecContext *ctx = p_sys->p_context;
++    const AVCodec *codec = p_sys->p_codec;
++
++    /* These two codecs cannot be opened before their extradata arrives */
++    const bool b_needs_extra = codec->id == AV_CODEC_ID_VC1 ||
++                               codec->id == AV_CODEC_ID_THEORA;
++    if( b_needs_extra && ctx->extradata_size <= 0 )
++    {
++        msg_Warn( p_dec, "waiting for extra data for codec %s",
++                  codec->name );
++        return 1;
++    }
++
++    ctx->coded_width  = p_dec->fmt_in.video.i_width;
++    ctx->coded_height = p_dec->fmt_in.video.i_height;
++    ctx->width  = p_dec->fmt_in.video.i_visible_width;
++    ctx->height = p_dec->fmt_in.video.i_visible_height;
++    ctx->bits_per_coded_sample = p_dec->fmt_in.video.i_bits_per_pixel;
++
++    p_sys->pix_fmt = AV_PIX_FMT_NONE;
++    p_sys->profile = -1;
++    p_sys->level = -1;
++    cc_Init( &p_sys->cc );
++
++    set_video_color_settings( &p_dec->fmt_in.video, ctx );
++
++    /* Streams declared slower than 6 frames per second get low-delay mode */
++    if( p_dec->fmt_in.video.i_frame_rate_base &&
++        p_dec->fmt_in.video.i_frame_rate &&
++        (double) p_dec->fmt_in.video.i_frame_rate /
++                 p_dec->fmt_in.video.i_frame_rate_base < 6 )
++    {
++        ctx->flags |= AV_CODEC_FLAG_LOW_DELAY;
++    }
++
++    post_mt( p_sys );
++    const int ret = ZcFfmpeg_OpenCodec( p_dec, ctx, codec );
++    wait_mt( p_sys );
++    if( ret < 0 )
++        return ret;
++
++    /* Report which threading mode libavcodec ended up with */
++    if( ctx->active_thread_type == FF_THREAD_FRAME )
++        msg_Dbg( p_dec, "using frame thread mode with %d threads",
++                 ctx->thread_count );
++    else if( ctx->active_thread_type == FF_THREAD_SLICE )
++        msg_Dbg( p_dec, "using slice thread mode with %d threads",
++                 ctx->thread_count );
++    else if( ctx->active_thread_type == 0 )
++    {
++        if( ctx->thread_count > 1 )
++            msg_Warn( p_dec, "failed to enable threaded decoding" );
++    }
++    else
++        msg_Warn( p_dec, "using unknown thread mode with %d threads",
++                  ctx->thread_count );
++
++    return 0;
++}
++
++/*****************************************************************************
++ * MmalAvcodecOpenDecoder: probe and initialise the video decoder
++ *****************************************************************************
++ * the ffmpeg codec will be opened, some memory allocated. The vout is not yet
++ * opened (done after the first decoded frame).
++ *
++ * Returns VLC_SUCCESS when the module takes the stream, VLC_ENOMEM when the
++ * decoder state cannot be allocated and VLC_EGENERIC for every other refusal
++ * or failure.
++ *****************************************************************************/
++static int MmalAvcodecOpenDecoder( vlc_object_t *obj )
++{
++    decoder_t *p_dec = (decoder_t *)obj;
++    const AVCodec *p_codec;
++
++    int extra_buffers = var_InheritInteger(p_dec, MMAL_AVCODEC_BUFFERS);
++
++    /* A negative setting selects a default based on the input frame size */
++    if (extra_buffers < 0)
++    {
++        extra_buffers = p_dec->fmt_in.video.i_height * p_dec->fmt_in.video.i_width >= 1920 * 1088 ?
++            BUFFERS_IN_FLIGHT_UHD : BUFFERS_IN_FLIGHT;
++    }
++
++    if (extra_buffers <= 0)
++    {
++        msg_Dbg(p_dec, "%s: extra_buffers=%d - cannot use module", __func__, extra_buffers);
++        return VLC_EGENERIC;
++    }
++
++    /* From here on every failure path must undo cma_vcsm_init() */
++    const vcsm_init_type_t vcsm_type = cma_vcsm_init();
++    const int vcsm_size =
++        vcsm_type == VCSM_INIT_LEGACY ? hw_mmal_get_gpu_mem() : 512 << 20;
++
++#if 1
++    {
++        char buf1[5], buf2[5], buf2a[5];
++        char buf3[5], buf4[5];
++        uint32_t in_fcc = 0;
++        msg_Dbg(p_dec, "%s: <<< (%s/%s)[%s] %dx%d -> (%s/%s) %dx%d [%s/%d] xb:%d", __func__,
++                str_fourcc(buf1, p_dec->fmt_in.i_codec),
++                str_fourcc(buf2, p_dec->fmt_in.video.i_chroma),
++                str_fourcc(buf2a, in_fcc),
++                p_dec->fmt_in.video.i_width, p_dec->fmt_in.video.i_height,
++                str_fourcc(buf3, p_dec->fmt_out.i_codec),
++                str_fourcc(buf4, p_dec->fmt_out.video.i_chroma),
++                p_dec->fmt_out.video.i_width, p_dec->fmt_out.video.i_height,
++                cma_vcsm_init_str(vcsm_type), vcsm_size, extra_buffers);
++    }
++#endif
++
++    if( vcsm_type == VCSM_INIT_NONE )
++        return VLC_EGENERIC;
++#if 1
++    /* Refuse non-HEVC streams on CMA or with less than 96MB, and HEVC
++     * streams with less than 128MB */
++    if( (p_dec->fmt_in.i_codec != VLC_CODEC_HEVC &&
++         (vcsm_type == VCSM_INIT_CMA || vcsm_size < (96 << 20))) ||
++        (p_dec->fmt_in.i_codec == VLC_CODEC_HEVC &&
++         vcsm_size < (128 << 20)))
++    {
++        cma_vcsm_exit(vcsm_type);
++        return VLC_EGENERIC;
++    }
++#endif
++
++    AVCodecContext *p_context = ZcFfmpeg_AllocContext( p_dec, &p_codec );
++    if( p_context == NULL )
++    {
++        cma_vcsm_exit(vcsm_type);
++        return VLC_EGENERIC;
++    }
++
++    int i_val;
++
++    /* Allocate the memory needed to store the decoder's structure */
++    decoder_sys_t *p_sys = calloc( 1, sizeof(*p_sys) );
++    if( unlikely(p_sys == NULL) )
++    {
++        avcodec_free_context( &p_context );
++        cma_vcsm_exit(vcsm_type);
++        return VLC_ENOMEM;
++    }
++
++    p_dec->p_sys = p_sys;
++    p_sys->p_context = p_context;
++    p_sys->p_codec = p_codec;
++    p_sys->p_dec = p_dec;
++//    p_sys->p_va = NULL;
++    p_sys->cma_in_flight_max = extra_buffers;
++    p_sys->vcsm_init_type = vcsm_type;
++    vlc_sem_init( &p_sys->sem_mt, 0 );
++
++    /* ***** Fill p_context with init values ***** */
++    p_context->codec_tag = ffmpeg_CodecTag( p_dec->fmt_in.i_original_fourcc ?
++                                p_dec->fmt_in.i_original_fourcc : p_dec->fmt_in.i_codec );
++
++    /* ***** Get configuration of ffmpeg plugin ***** */
++    p_context->workaround_bugs =
++        var_InheritInteger( p_dec, "avcodec-workaround-bugs" );
++    p_context->err_recognition =
++        var_InheritInteger( p_dec, "avcodec-error-resilience" );
++
++    if( var_CreateGetBool( p_dec, "grayscale" ) )
++        p_context->flags |= AV_CODEC_FLAG_GRAY;
++
++    /* ***** Output always the frames ***** */
++    p_context->flags |= AV_CODEC_FLAG_OUTPUT_CORRUPT;
++
++    i_val = var_CreateGetInteger( p_dec, "avcodec-skiploopfilter" );
++    if( i_val >= 4 ) p_context->skip_loop_filter = AVDISCARD_ALL;
++    else if( i_val == 3 ) p_context->skip_loop_filter = AVDISCARD_NONKEY;
++    else if( i_val == 2 ) p_context->skip_loop_filter = AVDISCARD_BIDIR;
++    else if( i_val == 1 ) p_context->skip_loop_filter = AVDISCARD_NONREF;
++    else p_context->skip_loop_filter = AVDISCARD_DEFAULT;
++
++    if( var_CreateGetBool( p_dec, "avcodec-fast" ) )
++        p_context->flags2 |= AV_CODEC_FLAG2_FAST;
++
++    /* ***** libavcodec frame skipping ***** */
++    p_sys->b_hurry_up = var_CreateGetBool( p_dec, "avcodec-hurry-up" );
++    p_sys->b_show_corrupted = var_CreateGetBool( p_dec, "avcodec-corrupted" );
++
++    i_val = var_CreateGetInteger( p_dec, "avcodec-skip-frame" );
++    if( i_val >= 4 ) p_sys->i_skip_frame = AVDISCARD_ALL;
++    else if( i_val == 3 ) p_sys->i_skip_frame = AVDISCARD_NONKEY;
++    else if( i_val == 2 ) p_sys->i_skip_frame = AVDISCARD_BIDIR;
++    else if( i_val == 1 ) p_sys->i_skip_frame = AVDISCARD_NONREF;
++    else if( i_val == -1 ) p_sys->i_skip_frame = AVDISCARD_NONE;
++    else p_sys->i_skip_frame = AVDISCARD_DEFAULT;
++    p_context->skip_frame = p_sys->i_skip_frame;
++
++    i_val = var_CreateGetInteger( p_dec, "avcodec-skip-idct" );
++    if( i_val >= 4 ) p_context->skip_idct = AVDISCARD_ALL;
++    else if( i_val == 3 ) p_context->skip_idct = AVDISCARD_NONKEY;
++    else if( i_val == 2 ) p_context->skip_idct = AVDISCARD_BIDIR;
++    else if( i_val == 1 ) p_context->skip_idct = AVDISCARD_NONREF;
++    else if( i_val == -1 ) p_context->skip_idct = AVDISCARD_NONE;
++    else p_context->skip_idct = AVDISCARD_DEFAULT;
++
++    /* ***** libavcodec direct rendering ***** */
++    p_sys->b_direct_rendering = false;
++    atomic_init(&p_sys->b_dr_failure, false);
++    if( var_CreateGetBool( p_dec, "avcodec-dr" ) &&
++       (p_codec->capabilities & AV_CODEC_CAP_DR1) &&
++        /* No idea why ... but this fixes flickering on some TSCC streams */
++        p_sys->p_codec->id != AV_CODEC_ID_TSCC &&
++        p_sys->p_codec->id != AV_CODEC_ID_CSCD &&
++        p_sys->p_codec->id != AV_CODEC_ID_CINEPAK )
++    {
++        /* Some codecs set pix_fmt only after the 1st frame has been decoded,
++         * so we need to do another check in ffmpeg_GetFrameBuf() */
++        p_sys->b_direct_rendering = true;
++    }
++
++    p_context->get_format = ZcGetFormat;
++#if 0
++    p_context->get_format = ffmpeg_GetFormat;
++    /* Always use our get_buffer wrapper so we can calculate the
++     * PTS correctly */
++    p_context->get_buffer2 = lavc_GetFrame;
++    p_context->opaque = p_dec;
++#endif
++
++    int i_thread_count = var_InheritInteger( p_dec, "avcodec-threads" );
++    if( i_thread_count <= 0 )
++#if 1
++    {
++        // Pick 5 threads for everything on Pi except for HEVC where the h/w
++        // really limits the useful size to 3
++        i_thread_count = p_codec->id == AV_CODEC_ID_HEVC ? 3 : 5;
++    }
++#else
++    {
++        i_thread_count = vlc_GetCPUCount();
++        if( i_thread_count > 1 )
++            i_thread_count++;
++
++        //FIXME: take in count the decoding time
++#if VLC_WINSTORE_APP
++        i_thread_count = __MIN( i_thread_count, 6 );
++#else
++        i_thread_count = __MIN( i_thread_count, p_codec->id == AV_CODEC_ID_HEVC ? 10 : 6 );
++#endif
++    }
++    i_thread_count = __MIN( i_thread_count, p_codec->id == AV_CODEC_ID_HEVC ? 32 : 16 );
++#endif
++    msg_Dbg( p_dec, "allowing %d thread(s) for decoding", i_thread_count );
++    p_context->thread_count = i_thread_count;
++    p_context->thread_safe_callbacks = true;
++
++    switch( p_codec->id )
++    {
++        case AV_CODEC_ID_MPEG4:
++        case AV_CODEC_ID_H263:
++            p_context->thread_type = 0;
++            break;
++        case AV_CODEC_ID_MPEG1VIDEO:
++        case AV_CODEC_ID_MPEG2VIDEO:
++            p_context->thread_type &= ~FF_THREAD_SLICE;
++            /* fall through */
++# if (LIBAVCODEC_VERSION_INT < AV_VERSION_INT(55, 1, 0))
++        case AV_CODEC_ID_H264:
++        case AV_CODEC_ID_VC1:
++        case AV_CODEC_ID_WMV3:
++            p_context->thread_type &= ~FF_THREAD_FRAME;
++# endif
++        default:
++            break;
++    }
++
++    if( p_context->thread_type & FF_THREAD_FRAME )
++        p_dec->i_extra_picture_buffers = 2 * p_context->thread_count;
++
++    /* ***** misc init ***** */
++    date_Init(&p_sys->pts, 1, 30001);
++    date_Set(&p_sys->pts, VLC_TS_INVALID);
++    p_sys->b_first_frame = true;
++    p_sys->i_late_frames = 0;
++    p_sys->b_from_preroll = false;
++
++    /* Set output properties */
++    if( ZcGetVlcChroma( &p_dec->fmt_out.video, p_context->pix_fmt ) != VLC_SUCCESS )
++    {
++        /* we are doomed. but not really, because most codecs set their pix_fmt later on */
++//        p_dec->fmt_out.i_codec = VLC_CODEC_I420;
++        p_dec->fmt_out.i_codec = VLC_CODEC_MMAL_ZC_I420;
++    }
++    p_dec->fmt_out.i_codec = p_dec->fmt_out.video.i_chroma;
++
++    p_dec->fmt_out.video.orientation = p_dec->fmt_in.video.orientation;
++
++    if( p_dec->fmt_in.video.p_palette ) {
++        p_sys->palette_sent = false;
++        p_dec->fmt_out.video.p_palette = malloc( sizeof(video_palette_t) );
++        if( p_dec->fmt_out.video.p_palette )
++            *p_dec->fmt_out.video.p_palette = *p_dec->fmt_in.video.p_palette;
++    } else
++        p_sys->palette_sent = true;
++
++    if ((p_sys->cma_pool = cma_buf_pool_new(p_sys->cma_in_flight_max, p_sys->cma_in_flight_max, false, "mmal_avcodec")) == NULL)
++    {
++        msg_Err(p_dec, "CMA pool alloc failure");
++        goto fail;
++    }
++
++    /* ***** init this codec with special data ***** */
++    ffmpeg_InitCodec( p_dec );
++
++    /* ***** Open the codec ***** */
++    if( OpenVideoCodec( p_dec ) < 0 )
++    {
++        /* Use the common teardown, as the CMA pool failure above does:
++         * freeing only p_sys and the context by hand here left the CMA pool
++         * allocated and skipped cma_vcsm_exit() */
++        goto fail;
++    }
++
++    p_dec->pf_decode = DecodeVideo;
++    p_dec->pf_flush = Flush;
++
++    /* XXX: Writing input format makes little sense. */
++    if( p_context->profile != FF_PROFILE_UNKNOWN )
++        p_dec->fmt_in.i_profile = p_context->profile;
++    if( p_context->level != FF_LEVEL_UNKNOWN )
++        p_dec->fmt_in.i_level = p_context->level;
++
++#if 1
++    // Most of the time we have nothing useful by way of a format here
++    // wait till we've decoded something
++#else
++    // Update output format
++    if (lavc_UpdateVideoFormat(p_dec, p_context, p_context->pix_fmt,
++                               p_context->pix_fmt) != 0)
++    {
++        msg_Err(p_dec, "Unable to update format: pix_fmt=%d", p_context->pix_fmt);
++//        goto fail;
++    }
++#endif
++
++#if TRACE_ALL
++    msg_Dbg(p_dec, "<<< %s: OK", __func__);
++#endif
++    return VLC_SUCCESS;
++
++fail:
++    /* Single teardown path once p_sys exists; MmalAvcodecCloseDecoder() is
++     * relied upon to release everything set up above */
++    MmalAvcodecCloseDecoder(VLC_OBJECT(p_dec));
++
++#if TRACE_ALL
++    msg_Dbg(p_dec, "<<< %s: FAIL", __func__);
++#endif
++
++    return VLC_EGENERIC;
++}
++
++/*****************************************************************************
++ * Flush: reset the timestamp, late-frame and closed-caption state and flush
++ * the codec if it is open
++ *****************************************************************************/
++static void Flush( decoder_t *p_dec )
++{
++    decoder_sys_t *const sys = p_dec->p_sys;
++    AVCodecContext *const avctx = sys->p_context;
++
++#if TRACE_ALL
++    msg_Dbg(p_dec, "<<< %s", __func__);
++#endif
++
++    sys->i_late_frames = 0;
++    date_Set(&sys->pts, VLC_TS_INVALID); /* To make sure we recover properly */
++    cc_Flush( &sys->cc );
++
++    /* Cancel the CMA pool in order to unblock all avcodec workers threads
++     * waiting for a buffer. This will avoid a deadlock between
++     * avcodec_flush_buffers and workers threads.
++     *
++     * It would probably be good to use decoder_AbortPictures() as well but
++     * that often deadlocks on close, and given that we wait for pics in the
++     * main thread it should be unneeded (whereas cma is alloced in the
++     * depths of ffmpeg on its own threads) */
++    cma_buf_pool_cancel(sys->cma_pool);
++
++    post_mt( sys );
++    /* do not flush buffers if codec hasn't been opened (theora/vorbis/VC1) */
++    if( avcodec_is_open( avctx ) )
++        avcodec_flush_buffers( avctx );
++    wait_mt( sys );
++
++    /* Reset cancel state to false */
++    cma_buf_pool_uncancel(sys->cma_pool);
++
++#if TRACE_ALL
++    msg_Dbg(p_dec, ">>> %s", __func__);
++#endif
++}
++
++/*
++ * Screen an input block before decoding.
++ *
++ * A discontinuity or corrupted flag resets the PTS generator, the closed
++ * caption state and the late-frame counter. A corrupted block is also
++ * released and false is returned; in every other case (including a NULL
++ * block) the function returns true.
++ */
++static bool check_block_validity( decoder_sys_t *p_sys, block_t *block )
++{
++    if( block == NULL )
++        return true;
++
++    const bool b_corrupted = (block->i_flags & BLOCK_FLAG_CORRUPTED) != 0;
++    if( !b_corrupted && !(block->i_flags & BLOCK_FLAG_DISCONTINUITY) )
++        return true;
++
++    date_Set( &p_sys->pts, VLC_TS_INVALID ); /* To make sure we recover properly */
++    cc_Flush( &p_sys->cc );
++    p_sys->i_late_frames = 0;
++
++    if( !b_corrupted )
++        return true;
++
++    block_Release( block );
++    return false;
++}
++
++/*
++ * Decide whether a block must be dropped because decoding has been late for
++ * more than 5 seconds.
++ *
++ * A preroll block first clears the late-frame counter and arms the
++ * "coming from preroll" state. When the block is dropped it is released,
++ * the PTS generator is invalidated, the late counter is decremented and
++ * true is returned. Returns false for a NULL block or when nothing is
++ * dropped.
++ */
++static bool check_block_being_late( decoder_sys_t *p_sys, block_t *block, mtime_t current_time)
++{
++    if( block == NULL )
++        return false;
++
++    if( block->i_flags & BLOCK_FLAG_PREROLL )
++    {
++        /* Do not care about late frames when prerolling
++         * TODO avoid decoding of non reference frame
++         * (ie all B except for H264 where it depends only on nal_ref_idc) */
++        p_sys->b_from_preroll = true;
++        p_sys->i_last_late_delay = INT64_MAX;
++        p_sys->i_late_frames = 0;
++    }
++
++    if( p_sys->i_late_frames <= 0 )
++        return false;
++
++    /* Still within the 5 second grace period: keep the block */
++    if( current_time - p_sys->i_late_frames_start <= (5*CLOCK_FREQ) )
++        return false;
++
++    date_Set( &p_sys->pts, VLC_TS_INVALID ); /* To make sure we recover properly */
++    block_Release( block );
++    p_sys->i_late_frames--;
++    return true;
++}
++
++/*
++ * Hurry-up policy based on the number of consecutive late frames.
++ *
++ *  - up to 4 late frames: nothing changes, returns false;
++ *  - 5 to 11: no output picture is requested and the codec is told to skip
++ *    at least non-reference frames, returns false;
++ *  - 12 or more: no output picture is requested, the late counter is
++ *    decremented and true is returned.
++ */
++static bool check_frame_should_be_dropped( decoder_sys_t *p_sys, AVCodecContext *p_context, bool *b_need_output_picture )
++{
++    if( p_sys->i_late_frames <= 4 )
++        return false;
++
++    *b_need_output_picture = false;
++
++    if( p_sys->i_late_frames >= 12 )
++    {
++        /* picture too late, won't decode
++         * but break picture until a new I, and for mpeg4 ...*/
++        p_sys->i_late_frames--; /* needed else it will never be decrease */
++        return true;
++    }
++
++    /* Never skip less than the non-reference frames here */
++    if( p_sys->i_skip_frame <= AVDISCARD_NONREF )
++        p_context->skip_frame = AVDISCARD_NONREF;
++    else
++        p_context->skip_frame = p_sys->i_skip_frame;
++    return false;
++}
++
++/*
++ * Advance the PTS generator past the given frame and return the resulting
++ * date, i.e. the interpolated PTS of the next frame.
++ *
++ * Returns VLC_TS_INVALID, leaving the generator untouched, when the current
++ * date is invalid or the generator has a zero rate numerator.
++ */
++static mtime_t interpolate_next_pts( decoder_t *p_dec, AVFrame *frame )
++{
++    decoder_sys_t *p_sys = p_dec->p_sys;
++
++    if( date_Get( &p_sys->pts ) == VLC_TS_INVALID ||
++        p_sys->pts.i_divider_num == 0 )
++        return VLC_TS_INVALID;
++
++    /* A frame lasts at least one tick, plus any repeated fields */
++    const int i_ticks = __MAX( p_sys->p_context->ticks_per_frame, 1 );
++
++    /* interpolate the next PTS */
++    return date_Increment( &p_sys->pts, i_ticks + frame->repeat_pict );
++}
++
++/*
++ * Maintain the count of consecutive late frames.
++ *
++ * A frame is late when its display date plus a threshold (half the distance
++ * to the next PTS, or 20000 when that is unknown) is not after
++ * current_time. Preroll blocks get no display date and therefore reset the
++ * counter. Just after preroll, frames whose lateness is still shrinking are
++ * not counted.
++ */
++static void update_late_frame_count( decoder_t *p_dec, block_t *p_block,
++                                     mtime_t current_time, mtime_t i_pts,
++                                     mtime_t i_next_pts )
++{
++    decoder_sys_t *p_sys = p_dec->p_sys;
++
++    /* Update frame late count (except when doing preroll) */
++    const bool b_preroll = p_block && (p_block->i_flags & BLOCK_FLAG_PREROLL);
++    const mtime_t i_display_date =
++        b_preroll ? VLC_TS_INVALID : decoder_GetDisplayDate( p_dec, i_pts );
++
++    mtime_t i_threshold = 20000;
++    if( i_next_pts != VLC_TS_INVALID )
++        i_threshold = (i_next_pts - i_pts) / 2;
++
++    const bool b_late = i_display_date > VLC_TS_INVALID &&
++                        i_display_date + i_threshold <= current_time;
++    if( !b_late )
++    {
++        p_sys->i_late_frames = 0;
++        return;
++    }
++
++    /* Out of preroll, consider only late frames on rising delay */
++    if( p_sys->b_from_preroll )
++    {
++        const mtime_t i_delay = current_time - i_display_date;
++        if( p_sys->i_last_late_delay > i_delay )
++        {
++            p_sys->i_last_late_delay = i_delay;
++            return;
++        }
++        p_sys->b_from_preroll = false;
++    }
++
++    /* Remember when the current run of late frames started */
++    if( ++p_sys->i_late_frames == 1 )
++        p_sys->i_late_frames_start = current_time;
++}
++
++/*
++ * Export frame side data to the picture and the decoder output format.
++ *
++ * - Mastering display metadata and content light level (each only when the
++ *   libavutil version check passes) are written to p_pic->format; if they
++ *   differ from p_dec->fmt_out.video the output format is updated too.
++ * - A53 closed-caption side data is run through cc_Extract() and, when it
++ *   yields data or reordering is active, queued with decoder_QueueCc().
++ *
++ * Returns -1 if the output format changed and decoder_UpdateVideoFormat()
++ * reported a failure, 0 otherwise.
++ *
++ * Note: the ST2086_* / LAV_* macros defined below are not #undef'd and stay
++ * defined for the rest of the file.
++ */
++static int DecodeSidedata( decoder_t *p_dec, const AVFrame *frame, picture_t *p_pic )
++{
++ decoder_sys_t *p_sys = p_dec->p_sys;
++ bool format_changed = false;
++ /* Mastering display metadata -> p_pic->format.mastering */
++#if (LIBAVUTIL_VERSION_MICRO >= 100 && LIBAVUTIL_VERSION_INT >= AV_VERSION_INT( 55, 16, 101 ) )
++#define FROM_AVRAT(default_factor, avrat) \
++(uint64_t)(default_factor) * (avrat).num / (avrat).den
++ const AVFrameSideData *metadata =
++ av_frame_get_side_data( frame,
++ AV_FRAME_DATA_MASTERING_DISPLAY_METADATA );
++ if ( metadata )
++ {
++ const AVMasteringDisplayMetadata *hdr_meta =
++ (const AVMasteringDisplayMetadata *) metadata->data;
++ if ( hdr_meta->has_luminance )
++ {
++#define ST2086_LUMA_FACTOR 10000
++ p_pic->format.mastering.max_luminance =
++ FROM_AVRAT(ST2086_LUMA_FACTOR, hdr_meta->max_luminance);
++ p_pic->format.mastering.min_luminance =
++ FROM_AVRAT(ST2086_LUMA_FACTOR, hdr_meta->min_luminance);
++ }
++ if ( hdr_meta->has_primaries )
++ {
++#define ST2086_RED 2
++#define ST2086_GREEN 0
++#define ST2086_BLUE 1
++#define LAV_RED 0
++#define LAV_GREEN 1
++#define LAV_BLUE 2
++#define ST2086_PRIM_FACTOR 50000
++ p_pic->format.mastering.primaries[ST2086_RED*2 + 0] =
++ FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_RED][0]);
++ p_pic->format.mastering.primaries[ST2086_RED*2 + 1] =
++ FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_RED][1]);
++ p_pic->format.mastering.primaries[ST2086_GREEN*2 + 0] =
++ FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_GREEN][0]);
++ p_pic->format.mastering.primaries[ST2086_GREEN*2 + 1] =
++ FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_GREEN][1]);
++ p_pic->format.mastering.primaries[ST2086_BLUE*2 + 0] =
++ FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_BLUE][0]);
++ p_pic->format.mastering.primaries[ST2086_BLUE*2 + 1] =
++ FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_BLUE][1]);
++ p_pic->format.mastering.white_point[0] =
++ FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->white_point[0]);
++ p_pic->format.mastering.white_point[1] =
++ FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->white_point[1]);
++ }
++
++ if ( memcmp( &p_dec->fmt_out.video.mastering,
++ &p_pic->format.mastering,
++ sizeof(p_pic->format.mastering) ) )
++ {
++ p_dec->fmt_out.video.mastering = p_pic->format.mastering;
++ format_changed = true;
++ }
++#undef FROM_AVRAT
++ }
++#endif
++#if (LIBAVUTIL_VERSION_MICRO >= 100 && LIBAVUTIL_VERSION_INT >= AV_VERSION_INT( 55, 60, 100 ) )
++ const AVFrameSideData *metadata_lt =
++ av_frame_get_side_data( frame,
++ AV_FRAME_DATA_CONTENT_LIGHT_LEVEL );
++ if ( metadata_lt )
++ {
++ const AVContentLightMetadata *light_meta =
++ (const AVContentLightMetadata *) metadata_lt->data;
++ p_pic->format.lighting.MaxCLL = light_meta->MaxCLL;
++ p_pic->format.lighting.MaxFALL = light_meta->MaxFALL;
++ if ( memcmp( &p_dec->fmt_out.video.lighting,
++ &p_pic->format.lighting,
++ sizeof(p_pic->format.lighting) ) )
++ {
++ p_dec->fmt_out.video.lighting = p_pic->format.lighting;
++ format_changed = true;
++ }
++ }
++#endif
++ /* Publish the changed output format; a failure aborts with -1 */
++ if (format_changed && decoder_UpdateVideoFormat( p_dec ))
++ return -1;
++ /* Closed captions carried as A53 side data */
++ const AVFrameSideData *p_avcc = av_frame_get_side_data( frame, AV_FRAME_DATA_A53_CC );
++ if( p_avcc )
++ {
++ cc_Extract( &p_sys->cc, CC_PAYLOAD_RAW, true, p_avcc->data, p_avcc->size );
++ if( p_sys->cc.b_reorder || p_sys->cc.i_data )
++ {
++ block_t *p_cc = block_Alloc( p_sys->cc.i_data );
++ if( p_cc )
++ {
++ memcpy( p_cc->p_buffer, p_sys->cc.p_data, p_sys->cc.i_data );
++ if( p_sys->cc.b_reorder )
++ p_cc->i_dts = p_cc->i_pts = p_pic->date;
++ else
++ p_cc->i_pts = p_cc->i_dts;
++ decoder_cc_desc_t desc;
++ desc.i_608_channels = p_sys->cc.i_608channels;
++ desc.i_708_channels = p_sys->cc.i_708channels;
++ desc.i_reorder_depth = 4;
++ decoder_QueueCc( p_dec, p_cc, &desc );
++ }
++ cc_Flush( &p_sys->cc );
++ }
++ }
++ return 0;
++}
++
++/*****************************************************************************
++ * DecodeBlock: send one block (or a drain packet when there is no block) to
++ * libavcodec and return the first picture to display, or NULL when there is
++ * none. *error is cleared on entry and set to true on critical failures. */
++static picture_t *DecodeBlock( decoder_t *p_dec, block_t **pp_block, bool *error )
++{
++ decoder_sys_t *p_sys = p_dec->p_sys;
++ AVCodecContext *p_context = p_sys->p_context;
++ /* True when a displayable picture is wanted from this block */
++ bool b_need_output_picture = true;
++
++ /* Set when the block carries BLOCK_FLAG_END_OF_SEQUENCE */
++ bool eos_spotted = false;
++
++#if TRACE_ALL
++ msg_Dbg(p_dec, "<<< %s: (buf_size=%d)", __func__, pp_block == NULL || *pp_block == NULL ? 0 : (*pp_block)->i_buffer);
++#endif
++
++ block_t *p_block;
++ mtime_t current_time;
++ picture_t *p_pic = NULL;
++ AVFrame *frame = NULL;
++
++ // By default we are OK
++ *error = false;
++
++ if( !p_context->extradata_size && p_dec->fmt_in.i_extra ) // extradata not installed yet although the input now has some
++ {
++ ffmpeg_InitCodec( p_dec );
++ if( !avcodec_is_open( p_context ) )
++ OpenVideoCodec( p_dec );
++ }
++
++ p_block = pp_block ? *pp_block : NULL;
++ if(!p_block && !(p_sys->p_codec->capabilities & AV_CODEC_CAP_DELAY) ) // no input and the codec holds no delayed frames
++ return NULL;
++
++ if( !avcodec_is_open( p_context ) )
++ {
++ if( p_block )
++ block_Release( p_block );
++ return NULL;
++ }
++
++ if( !check_block_validity( p_sys, p_block ) ) // NOTE(review): presumably the helper releases the block when it rejects it - confirm
++ return NULL;
++
++ current_time = mdate();
++ if( p_dec->b_frame_drop_allowed && check_block_being_late( p_sys, p_block, current_time) ) // NOTE(review): block ownership on this path is decided inside the helper - confirm
++ {
++ msg_Err( p_dec, "more than 5 seconds of late video -> "
++ "dropping frame (computer too slow ?)" );
++ return NULL;
++ }
++
++
++ /* A good idea could be to decode all I pictures and see for the other */
++
++ /* By default an output picture is wanted unless the block is preroll;
++ flushing (p_block == NULL) wants output as well */
++ if( !p_block || !(p_block->i_flags & BLOCK_FLAG_PREROLL) )
++ b_need_output_picture = true;
++ else
++ b_need_output_picture = false;
++
++ /* Change skip_frame config only if hurry_up is enabled */
++ if( p_sys->b_hurry_up )
++ {
++ p_context->skip_frame = p_sys->i_skip_frame;
++
++ /* Also check whether this block should/can be dropped so that the
++ next one is reached sooner while trying to catch up */
++ if( p_dec->b_frame_drop_allowed &&
++ check_frame_should_be_dropped( p_sys, p_context, &b_need_output_picture ) )
++ {
++ if( p_block )
++ block_Release( p_block );
++ msg_Warn( p_dec, "More than 11 late frames, dropping frame" );
++ return NULL;
++ }
++ }
++ if( !b_need_output_picture )
++ {
++ p_context->skip_frame = __MAX( p_context->skip_frame,
++ AVDISCARD_NONREF );
++ }
++
++ /*
++ * Do the actual decoding now */
++
++ /* libavcodec requires a few more bytes than the real frame size:
++ * grow the block by the input padding and zero that padding */
++ if( p_block && p_block->i_buffer > 0 )
++ {
++ eos_spotted = ( p_block->i_flags & BLOCK_FLAG_END_OF_SEQUENCE ) != 0;
++
++ p_block = block_Realloc( p_block, 0,
++ p_block->i_buffer + FF_INPUT_BUFFER_PADDING_SIZE );
++ if( !p_block )
++ return NULL;
++ p_block->i_buffer -= FF_INPUT_BUFFER_PADDING_SIZE;
++ *pp_block = p_block; // the realloc may have moved the block: update the caller's pointer
++ memset( p_block->p_buffer + p_block->i_buffer, 0,
++ FF_INPUT_BUFFER_PADDING_SIZE );
++ }
++
++ while( !p_block || p_block->i_buffer > 0 || eos_spotted ) // runs until the block is consumed; a NULL block drains the codec
++ {
++ int i_used;
++ AVPacket pkt;
++
++ post_mt( p_sys ); // NOTE(review): the break paths before wait_mt below leave this post unmatched - confirm that is intended
++
++ av_init_packet( &pkt );
++ if( p_block && p_block->i_buffer > 0 )
++ {
++ pkt.data = p_block->p_buffer;
++ pkt.size = p_block->i_buffer;
++ pkt.pts = p_block->i_pts > VLC_TS_INVALID ? p_block->i_pts : AV_NOPTS_VALUE;
++ pkt.dts = p_block->i_dts > VLC_TS_INVALID ? p_block->i_dts : AV_NOPTS_VALUE;
++ }
++ else
++ {
++ /* Return delayed frames if codec has CODEC_CAP_DELAY */
++ pkt.data = NULL;
++ pkt.size = 0;
++ }
++
++ if( !p_sys->palette_sent ) // NOTE(review): dereferences fmt_in.video.p_palette unchecked; presumably palette_sent starts true when there is no palette - confirm
++ {
++ uint8_t *pal = av_packet_new_side_data(&pkt, AV_PKT_DATA_PALETTE, AVPALETTE_SIZE);
++ if (pal) {
++ memcpy(pal, p_dec->fmt_in.video.p_palette->palette, AVPALETTE_SIZE);
++ p_sys->palette_sent = true;
++ }
++ }
++
++ /* Make sure we don't reuse the same timestamps twice */
++ if( p_block )
++ {
++ p_block->i_pts =
++ p_block->i_dts = VLC_TS_INVALID;
++ }
++
++ int ret = avcodec_send_packet(p_context, &pkt);
++ if( ret != 0 && ret != AVERROR(EAGAIN) )
++ {
++ if (ret == AVERROR(ENOMEM) || ret == AVERROR(EINVAL))
++ {
++ msg_Err(p_dec, "avcodec_send_packet critical error");
++ *error = true;
++ }
++ av_packet_unref( &pkt );
++ break;
++ }
++ i_used = ret != AVERROR(EAGAIN) ? pkt.size : 0; // on EAGAIN the packet was not accepted, so nothing was consumed
++ av_packet_unref( &pkt );
++
++ frame = av_frame_alloc();
++ if (unlikely(frame == NULL))
++ {
++ *error = true;
++ break;
++ }
++
++ ret = avcodec_receive_frame(p_context, frame);
++ if( ret != 0 && ret != AVERROR(EAGAIN) )
++ {
++ msg_Dbg(p_dec, "No receive");
++ if (ret == AVERROR(ENOMEM) || ret == AVERROR(EINVAL))
++ {
++ msg_Err(p_dec, "avcodec_receive_frame critical error");
++ *error = true;
++ }
++ av_frame_free(&frame);
++ /* After draining, we need to reset decoder with a flush */
++ if( ret == AVERROR_EOF )
++ avcodec_flush_buffers( p_sys->p_context );
++ break;
++ }
++ bool not_received_frame = ret; // the only non-zero value left here is AVERROR(EAGAIN)
++
++ wait_mt( p_sys );
++
++ if( eos_spotted )
++ p_sys->b_first_frame = true;
++
++ if( p_block )
++ {
++ if( p_block->i_buffer <= 0 )
++ eos_spotted = false;
++
++ /* Consumed bytes */
++ p_block->p_buffer += i_used;
++ p_block->i_buffer -= i_used;
++ }
++
++ /* Nothing to display */
++ if( not_received_frame )
++ {
++// msg_Dbg(p_dec, "No rx: used=%d", i_used);
++ av_frame_free(&frame);
++ if( i_used == 0 ) break;
++ continue;
++ }
++
++ /* Compute the PTS */
++#ifdef FF_API_PKT_PTS
++ mtime_t i_pts = frame->pts;
++#else
++ mtime_t i_pts = frame->pkt_pts;
++#endif
++ if (i_pts == AV_NOPTS_VALUE )
++ i_pts = frame->pkt_dts;
++
++ if( i_pts == AV_NOPTS_VALUE )
++ i_pts = date_Get( &p_sys->pts );
++
++ /* Interpolate the next PTS */
++ if( i_pts > VLC_TS_INVALID )
++ date_Set( &p_sys->pts, i_pts );
++
++ const mtime_t i_next_pts = interpolate_next_pts(p_dec, frame);
++
++ update_late_frame_count( p_dec, p_block, current_time, i_pts, i_next_pts);
++
++ if( !b_need_output_picture ||
++// ( !p_sys->p_va && !frame->linesize[0] ) ||
++ ( !frame->linesize[0] ) ||
++ ( p_dec->b_frame_drop_allowed && (frame->flags & AV_FRAME_FLAG_CORRUPT) &&
++ !p_sys->b_show_corrupted ) )
++ {
++ av_frame_free(&frame);
++// msg_Dbg(p_dec, "Bad frame");
++ continue;
++ }
++
++ if( p_context->pix_fmt == AV_PIX_FMT_PAL8
++ && !p_dec->fmt_out.video.p_palette )
++ {
++ /* See AV_PIX_FMT_PAL8 comment in avc_GetVideoFormat(): update the
++ * fmt_out palette and change the fmt_out chroma to request a new
++ * vout */
++ assert( p_dec->fmt_out.video.i_chroma != VLC_CODEC_RGBP );
++
++ video_palette_t *p_palette;
++ p_palette = p_dec->fmt_out.video.p_palette
++ = malloc( sizeof(video_palette_t) );
++ if( !p_palette )
++ {
++ *error = true;
++ av_frame_free(&frame);
++ break;
++ }
++ static_assert( sizeof(p_palette->palette) == AVPALETTE_SIZE,
++ "Palette size mismatch between vlc and libavutil" );
++ assert( frame->data[1] != NULL );
++ memcpy( p_palette->palette, frame->data[1], AVPALETTE_SIZE );
++ p_palette->i_entries = AVPALETTE_COUNT;
++ p_dec->fmt_out.video.i_chroma = VLC_CODEC_RGBP;
++ if( decoder_UpdateVideoFormat( p_dec ) )
++ {
++ av_frame_free(&frame);
++ continue;
++ }
++ }
++
++#if 1
++ {
++ cma_buf_t * const cb = av_rpi_zc_buf_v(frame->buf[0]); // CMA buffer behind the frame's first AVBufferRef; NULL is treated as fatal below
++
++ if (cb == NULL)
++ {
++ msg_Err(p_dec, "Frame has no attached CMA buffer");
++ goto fail;
++ }
++
++ if (lavc_UpdateVideoFormat(p_dec, p_context, p_context->pix_fmt,
++ p_context->pix_fmt) != 0)
++ {
++ msg_Err(p_dec, "Failed to update format");
++ goto fail;
++ }
++
++ if ((p_pic = decoder_NewPicture(p_dec)) == NULL)
++ {
++ msg_Err(p_dec, "Failed to allocate pic");
++ goto fail;
++ }
++
++ if (cma_buf_pic_attach(cma_buf_ref(cb), p_pic) != 0) // the picture gets its own reference on the buffer
++ {
++ cma_buf_unref(cb); // Attach failed: drop the reference taken just above
++ char dbuf0[5];
++ msg_Err(p_dec, "Failed to attach bufs to pic: fmt=%s", str_fourcc(dbuf0, p_pic->format.i_chroma));
++ goto fail;
++ }
++
++ // ****** Set planes etc.
++ set_pic_from_frame(p_pic, frame);
++ }
++#else
++ picture_t *p_pic = frame->opaque;
++ if( p_pic == NULL )
++ { /* When direct rendering is not used, get_format() and get_buffer()
++ * might not be called. The output video format must be set here
++ * then picture buffer can be allocated. */
++ if (p_sys->p_va == NULL
++ && lavc_UpdateVideoFormat(p_dec, p_context, p_context->pix_fmt,
++ p_context->pix_fmt) == 0)
++ p_pic = decoder_NewPicture(p_dec);
++
++ if( !p_pic )
++ {
++ av_frame_free(&frame);
++ break;
++ }
++
++ /* Fill picture_t from AVFrame */
++ if( lavc_CopyPicture( p_dec, p_pic, frame ) != VLC_SUCCESS )
++ {
++ av_frame_free(&frame);
++ picture_Release( p_pic );
++ break;
++ }
++ }
++ else
++ {
++ /* Some codecs can return the same frame multiple times. By the
++ * time that the same frame is returned a second time, it will be
++ * too late to clone the underlying picture. So clone proactively.
++ * A single picture CANNOT be queued multiple times.
++ */
++ p_pic = picture_Clone( p_pic );
++ if( unlikely(p_pic == NULL) )
++ {
++ av_frame_free(&frame);
++ break;
++ }
++ }
++#endif
++
++ if( !p_dec->fmt_in.video.i_sar_num || !p_dec->fmt_in.video.i_sar_den )
++ {
++ /* Fetch again the aspect ratio in case it changed */
++ p_dec->fmt_out.video.i_sar_num
++ = p_context->sample_aspect_ratio.num;
++ p_dec->fmt_out.video.i_sar_den
++ = p_context->sample_aspect_ratio.den;
++
++ if( !p_dec->fmt_out.video.i_sar_num || !p_dec->fmt_out.video.i_sar_den )
++ {
++ p_dec->fmt_out.video.i_sar_num = 1;
++ p_dec->fmt_out.video.i_sar_den = 1;
++ }
++ }
++
++ p_pic->date = i_pts;
++ /* Hack to force display of still pictures */
++ p_pic->b_force = p_sys->b_first_frame;
++ p_pic->i_nb_fields = 2 + frame->repeat_pict;
++ p_pic->b_progressive = !frame->interlaced_frame;
++ p_pic->b_top_field_first = frame->top_field_first;
++
++ if (DecodeSidedata(p_dec, frame, p_pic))
++ i_pts = VLC_TS_INVALID; // side data handling failed: the picture is released below instead of returned
++
++ av_frame_free(&frame);
++
++ /* Send decoded frame to vout */
++ if (i_pts > VLC_TS_INVALID)
++ {
++ p_sys->b_first_frame = false;
++#if TRACE_ALL
++ msg_Dbg(p_dec, ">>> %s: Got pic", __func__);
++#endif
++ return p_pic; // the block is not released here: DecodeVideo calls again with the same pp_block
++ }
++ else
++ picture_Release( p_pic );
++ }
++
++ if( p_block )
++ block_Release( p_block );
++
++#if TRACE_ALL
++ msg_Dbg(p_dec, ">>> %s: NULL", __func__);
++#endif
++ return NULL;
++
++fail:
++#if TRACE_ALL
++ msg_Dbg(p_dec, ">>> %s: FAIL", __func__);
++#endif
++ av_frame_free(&frame);
++ if (p_pic != NULL)
++ picture_Release(p_pic);
++ if (p_block != NULL)
++ block_Release(p_block);
++ *error = true;
++ return NULL;
++}
++
++static int DecodeVideo( decoder_t *p_dec, block_t *p_block )
++{
++    // Decode p_block (NULL means drain) and queue every picture it yields.
++    // Errors are deliberately ignored: returning a real error (VLCDEC_ECRITICAL)
++    // seems to kill output forever, so success is always reported.
++    bool error = false;
++    block_t **pp_block = ( p_block != NULL ) ? &p_block : NULL;
++
++    for( picture_t *p_pic; ( p_pic = DecodeBlock( p_dec, pp_block, &error ) ) != NULL; )
++        decoder_QueueVideo( p_dec, p_pic );
++    return VLCDEC_SUCCESS;
++}
++
++/*****************************************************************************
++ * MmalAvcodecCloseDecoder: decoder destruction
++ *****************************************************************************
++ * This function is called when the thread ends after a successful
++ * initialization.
++ *****************************************************************************/
++static void MmalAvcodecCloseDecoder( vlc_object_t *obj )
++{
++ decoder_t *p_dec = (decoder_t *)obj;
++ decoder_sys_t *p_sys = p_dec->p_sys;
++ AVCodecContext *ctx = p_sys->p_context;
++// void *hwaccel_context;
++ // Teardown order: cancel pending buffer allocs, flush and free the codec context, leave vcsm, then free p_sys
++ msg_Dbg(obj, "<<< %s", __func__);
++
++ post_mt( p_sys );
++
++ cma_buf_pool_cancel(p_sys->cma_pool); // Abort any pending frame allocs
++
++ /* do not flush buffers if codec hasn't been opened (theora/vorbis/VC1) */
++ if( avcodec_is_open( ctx ) )
++ avcodec_flush_buffers( ctx );
++
++ av_rpi_zc_uninit2(ctx); // NOTE(review): presumably this is what releases p_sys->cma_pool, which is never deleted here - confirm
++
++ wait_mt( p_sys );
++
++ cc_Flush( &p_sys->cc );
++
++// hwaccel_context = ctx->hwaccel_context;
++ avcodec_free_context( &ctx );
++
++// if( p_sys->p_va )
++// vlc_va_Delete( p_sys->p_va, &hwaccel_context );
++
++ cma_vcsm_exit(p_sys->vcsm_init_type); // NOTE(review): presumably balances a cma_vcsm_init made when the decoder was opened - confirm
++
++ vlc_sem_destroy( &p_sys->sem_mt );
++ free( p_sys );
++}
++
++/*****************************************************************************
++ * ffmpeg_InitCodec: setup codec extra initialization data for ffmpeg
++ *****************************************************************************/
++static void ffmpeg_InitCodec( decoder_t *p_dec )
++{
++    /* Install fmt_in.p_extra as the codec context's extradata (SVQ3 gets a 12
++     * byte "SVQ3" header and has the atoms before "SMI " stripped). The buffer
++     * is zero padded and extradata_size is only published once the allocation
++     * succeeded, so a failed av_malloc never leaves a size without a buffer. */
++    decoder_sys_t *p_sys = p_dec->p_sys;
++    AVCodecContext *p_context = p_sys->p_context;
++    const size_t i_size = p_dec->fmt_in.i_extra;
++
++    if( !i_size ) return;
++
++    if( p_sys->p_codec->id == AV_CODEC_ID_SVQ3 )
++    {
++        const size_t i_total = i_size + 12;
++        uint8_t *p = av_malloc( i_total + FF_INPUT_BUFFER_PADDING_SIZE );
++        if( !p )
++            return;
++
++        memcpy( &p[0], "SVQ3", 4 );
++        memset( &p[4], 0, 8 );
++        memcpy( &p[12], p_dec->fmt_in.p_extra, i_size );
++        memset( &p[i_total], 0, FF_INPUT_BUFFER_PADDING_SIZE );
++        p_context->extradata = p;
++        p_context->extradata_size = i_total;
++
++        /* Now remove all atoms before the SMI one */
++        if( i_total > 0x5a && strncmp( (char*)&p[0x56], "SMI ", 4 ) )
++        {
++            uint8_t *psz = &p[0x52];
++
++            while( psz < &p[i_total - 8] )
++            {
++                uint_fast32_t atom_size = GetDWBE( psz );
++                if( atom_size <= 1 )
++                {
++                    /* FIXME handle 1 as long size */
++                    break;
++                }
++                if( !strncmp( (char*)&psz[4], "SMI ", 4 ) )
++                {
++                    memmove( &p[0x52], psz, &p[i_total] - psz );
++                    break;
++                }
++                if( atom_size > (size_t)( &p[i_total] - psz ) )
++                    break; /* atom claims more than is left: stop, never step outside p */
++                psz += atom_size;
++            }
++        }
++    }
++    else
++    {
++        uint8_t *p = av_malloc( i_size + FF_INPUT_BUFFER_PADDING_SIZE );
++        if( !p )
++            return;
++        memcpy( p, p_dec->fmt_in.p_extra, i_size );
++        memset( p + i_size, 0, FF_INPUT_BUFFER_PADDING_SIZE );
++        p_context->extradata = p;
++        p_context->extradata_size = i_size;
++    }
++}
++
++// Module descriptor: registers this file's open/close pair as a "video decoder" module
++vlc_module_begin()
++ set_category( CAT_INPUT )
++ set_subcategory( SUBCAT_INPUT_VCODEC )
++ set_shortname(N_("MMAL avcodec"))
++ set_description(N_("MMAL buffered avcodec "))
++ set_capability("video decoder", 80) // capability name and its score
++ add_shortcut("mmal_avcodec") // name by which the module can be requested explicitly
++ add_integer(MMAL_AVCODEC_BUFFERS, -1, MMAL_AVCODEC_BUFFERS_TEXT, // integer option, default -1
++ MMAL_AVCODEC_BUFFERS_LONGTEXT, true)
++ set_callbacks(MmalAvcodecOpenDecoder, MmalAvcodecCloseDecoder)
++vlc_module_end()
++
+--- /dev/null
++++ b/modules/hw/mmal/mmal_cma.c
+@@ -0,0 +1,668 @@
++#ifdef HAVE_CONFIG_H
++# include "config.h"
++#endif
++
++#include <stdatomic.h>
++#include <unistd.h>
++#include <fcntl.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++
++#include <interface/vcsm/user-vcsm.h>
++
++#include <vlc_common.h>
++#include <vlc_picture.h>
++
++#include "mmal_cma.h"
++#include "mmal_picture.h"
++
++#include <assert.h>
++
++#define TRACE_ALL 0
++
++//-----------------------------------------------------------------------------
++//
++// Generic pool functions
++// Knows nothing about pool entries
++
++typedef void * cma_pool_alloc_fn(void * v, size_t size); // allocate one element of `size` bytes; v is the pool's alloc_v
++typedef void cma_pool_free_fn(void * v, void * el, size_t size); // destroy element el; v is the pool's alloc_v
++
++#if TRACE_ALL
++static atomic_int pool_seq;
++#endif
++
++// Pool structure: a ring of idle elements plus an "in flight" counter
++// Ref count is held by pool owner and pool els that have been got
++// Els in the pool do not count towards its ref count
++struct cma_pool_fixed_s
++{
++ atomic_int ref_count;
++
++ vlc_mutex_t lock; // taken around every update of the fields below
++ unsigned int n_in; // index of the slot the next get() takes from
++ unsigned int n_out; // index of the slot the next put() stores into
++ unsigned int pool_size; // number of slots in pool[]
++ int flight_size; // in_flight is initialised to -flight_size
++ size_t el_size; // size of the elements currently pooled; a get() of another size flushes the pool
++ void ** pool; // slot array, created lazily by put(); a NULL slot is empty
++
++ bool cancel; // while set, get() returns NULL instead of waiting or allocating
++ int in_flight; // get() waits on flight_cond while this is > 0 and no pooled element is available
++ vlc_cond_t flight_cond;
++
++ void * alloc_v; // opaque argument handed to the three callbacks below
++ cma_pool_alloc_fn * el_alloc_fn;
++ cma_pool_free_fn * el_free_fn;
++ cma_pool_on_delete_fn * on_delete_fn; // may be NULL; called after the pool has been freed
++
++ const char * name; // strdup'ed copy made at creation (may be NULL), freed on delete
++#if TRACE_ALL
++ int seq;
++#endif
++};
++
++static inline unsigned int inc_mod(const unsigned int n, const unsigned int m)
++{
++    return (n + 1 < m) ? n + 1 : 0; // next ring index: step forward, wrapping to 0 on reaching m
++}
++
++static void free_pool(const cma_pool_fixed_t * const p, void ** const pool,
++                      const unsigned int pool_size, const size_t el_size)
++{
++    if (pool == NULL) return; // no slot array to free
++
++    // Hand every occupied slot to the element destructor, then drop the array
++    for (void ** slot = pool; slot != pool + pool_size; ++slot)
++        if (*slot != NULL)
++            p->el_free_fn(p->alloc_v, *slot, el_size);
++    free(pool);
++}
++
++// Unconditionally destroy the pool: no ref-count or in-flight checks are made
++static void cma_pool_fixed_delete(cma_pool_fixed_t * const p)
++{
++    // Capture what the owner notification needs before p is freed
++    void * const owner_v = p->alloc_v;
++    cma_pool_on_delete_fn * const notify = p->on_delete_fn;
++
++    free_pool(p, p->pool, p->pool_size, p->el_size);
++    free((void *)p->name); // Discards const; free(NULL) is a no-op
++
++    vlc_cond_destroy(&p->flight_cond);
++    vlc_mutex_destroy(&p->lock);
++    free(p);
++
++    // Tell our container that we are gone, if it asked to be told
++    if (notify == NULL)
++        return;
++    notify(owner_v);
++}
++
++static void cma_pool_fixed_unref(cma_pool_fixed_t * const p)
++{
++    const int refs_before = atomic_fetch_sub(&p->ref_count, 1);
++    if (refs_before <= 1) cma_pool_fixed_delete(p); // that was the last reference
++}
++
++static void cma_pool_fixed_ref(cma_pool_fixed_t * const p)
++{
++    (void)atomic_fetch_add(&p->ref_count, 1); // take one more reference on the pool
++}
++
++static void cma_pool_fixed_inc_in_flight(cma_pool_fixed_t * const p)
++{
++    vlc_mutex_lock(&p->lock);
++    p->in_flight += 1; // counted under the lock; get() waits while the count is > 0
++    vlc_mutex_unlock(&p->lock);
++}
++
++static void cma_pool_fixed_dec_in_flight(cma_pool_fixed_t * const p)
++{
++    vlc_mutex_lock(&p->lock);
++    p->in_flight -= 1;
++    if (p->in_flight == 0) // count has come back to exactly zero: wake one waiter in get()
++        vlc_cond_signal(&p->flight_cond);
++    vlc_mutex_unlock(&p->lock);
++}
++
++static void * cma_pool_fixed_get(cma_pool_fixed_t * const p, const size_t req_el_size, const bool inc_flight, const bool no_pool)
++{
++ void * v = NULL;
++ // Returns a pooled or freshly allocated element of req_el_size (plus a pool ref for it), or NULL on flush (size 0), cancel or alloc failure
++ vlc_mutex_lock(&p->lock);
++
++ for (;;)
++ {
++ if (req_el_size != p->el_size) // size change: everything pooled has the wrong size, so detach and free it
++ {
++ void ** const deadpool = p->pool;
++ const size_t dead_size = p->el_size;
++ const unsigned int dead_n = p->pool_size;
++
++ p->pool = NULL;
++ p->n_in = 0;
++ p->n_out = 0;
++ p->el_size = req_el_size;
++
++ if (deadpool != NULL)
++ {
++ vlc_mutex_unlock(&p->lock);
++ // Do the free old op outside the mutex in case the free is slow
++ free_pool(p, deadpool, dead_n, dead_size);
++ vlc_mutex_lock(&p->lock);
++ continue; // state may have changed while unlocked: start the checks again
++ }
++ }
++
++ // Late abort if flush or cancel so we can still kill the pool
++ if (req_el_size == 0 || p->cancel)
++ {
++ vlc_mutex_unlock(&p->lock);
++ return NULL;
++ }
++
++ if (p->pool != NULL && !no_pool) // no_pool forces a fresh allocation (used by cma_pool_fixed_fill)
++ {
++ v = p->pool[p->n_in];
++ if (v != NULL)
++ {
++ p->pool[p->n_in] = NULL;
++ p->n_in = inc_mod(p->n_in, p->pool_size);
++ break;
++ }
++ }
++
++ if (p->in_flight <= 0) // flight allowance not used up: leave the loop and allocate below
++ break;
++
++ vlc_cond_wait(&p->flight_cond, &p->lock); // woken by put, dec_in_flight, resize or cancel
++ }
++
++ if (inc_flight)
++ ++p->in_flight;
++
++ vlc_mutex_unlock(&p->lock);
++
++ if (v == NULL && req_el_size != 0) // nothing taken from the pool: allocate outside the lock
++ v = p->el_alloc_fn(p->alloc_v, req_el_size);
++
++ // Tag ref
++ if (v != NULL)
++ cma_pool_fixed_ref(p);
++ // Remove flight if we set it and error
++ else if (inc_flight)
++ cma_pool_fixed_dec_in_flight(p);
++
++ return v;
++}
++
++static void cma_pool_fixed_put(cma_pool_fixed_t * const p, void * v, const size_t el_size, const bool was_in_flight)
++{
++    // Return v to the pool (or free it when it cannot be kept) and drop the
++    // pool reference that cma_pool_fixed_get took for it.
++    vlc_mutex_lock(&p->lock);
++
++    // The slot array is created lazily; if calloc fails v is simply freed below
++    if (el_size == p->el_size && p->pool == NULL)
++        p->pool = calloc(p->pool_size, sizeof(void*));
++
++    if (el_size == p->el_size && p->pool != NULL && p->pool[p->n_out] == NULL)
++    {
++        p->pool[p->n_out] = v;
++        p->n_out = inc_mod(p->n_out, p->pool_size);
++        v = NULL;
++    }
++
++    if (was_in_flight)
++        --p->in_flight;
++    vlc_mutex_unlock(&p->lock);
++
++    vlc_cond_signal(&p->flight_cond);
++    if (v != NULL)
++        p->el_free_fn(p->alloc_v, v, el_size); // not pooled: wrong size, pool full or no memory
++    cma_pool_fixed_unref(p);
++}
++
++static int cma_pool_fixed_resize(cma_pool_fixed_t * const p,
++                                 const unsigned int new_pool_size, const int new_flight_size)
++{
++    void ** dead_pool = NULL;
++    size_t dead_size = 0;
++    unsigned int dead_n = 0;
++    // Change the slot count and/or the flight allowance. Returns 0, or -1 if the new slot array cannot be allocated.
++    // This makes this non-reentrant but saves us a lot of time in the normal
++    // "nothing happens" case
++    if (p->pool_size == new_pool_size && p->flight_size == new_flight_size)
++        return 0;
++
++    vlc_mutex_lock(&p->lock);
++
++    if (p->pool != NULL && new_pool_size != p->pool_size)
++    {
++        void ** const new_pool = calloc(new_pool_size, sizeof(void*));
++        unsigned int d, s;
++        dead_pool = p->pool;
++        dead_size = p->el_size;
++        dead_n = p->pool_size;
++
++        if (new_pool == NULL)
++        {
++            vlc_mutex_unlock(&p->lock);
++            return -1;
++        }
++        // Move pooled elements, oldest first, into slots 0..d-1; leftovers stay in dead_pool and are freed below
++        for (d = 0, s = p->n_in; d != new_pool_size && (new_pool[d] = dead_pool[s]) != NULL; ++d, s = inc_mod(s, dead_n))
++            dead_pool[s] = NULL;
++        // get() reads at n_in and put() writes at n_out, so read from slot 0 and write after the last survivor
++        p->n_in = 0;
++        p->n_out = (d != new_pool_size) ? d : 0;
++        p->pool = new_pool;
++    }
++
++    p->pool_size = new_pool_size;
++    if (new_flight_size > p->flight_size)
++        vlc_cond_broadcast(&p->flight_cond); // Lock still active so nothing happens till we release it
++    p->in_flight += p->flight_size - new_flight_size;
++    p->flight_size = new_flight_size;
++
++    vlc_mutex_unlock(&p->lock);
++
++    free_pool(p, dead_pool, dead_n, dead_size);
++    return 0;
++}
++
++static int cma_pool_fixed_fill(cma_pool_fixed_t * const p, const size_t el_size)
++{
++    // Allocate fresh elements (bypassing the pool) and put them back until put()'s next slot is occupied
++    bool full;
++    do {
++        vlc_mutex_lock(&p->lock);
++        full = el_size == p->el_size && p->pool != NULL && p->pool[p->n_out] != NULL;
++        vlc_mutex_unlock(&p->lock);
++        if (!full) {
++            void * const el = cma_pool_fixed_get(p, el_size, false, true);
++            if (el == NULL) return -ENOMEM;
++            cma_pool_fixed_put(p, el, el_size, false);
++        }
++    } while (!full);
++    return 0;
++}
++
++static void cma_pool_fixed_cancel(cma_pool_fixed_t * const p)
++{
++ vlc_mutex_lock(&p->lock);
++ p->cancel = true; // from now on cma_pool_fixed_get returns NULL
++ vlc_cond_broadcast(&p->flight_cond); // wake every thread waiting in get so it sees the flag
++ vlc_mutex_unlock(&p->lock);
++}
++
++static void cma_pool_fixed_uncancel(cma_pool_fixed_t * const p)
++{
++ vlc_mutex_lock(&p->lock);
++ p->cancel = false; // re-enable cma_pool_fixed_get after a cancel
++ vlc_mutex_unlock(&p->lock);
++}
++
++
++// Flush whatever the pool still holds, then drop the owner's reference
++static void cma_pool_fixed_kill(cma_pool_fixed_t * const p)
++{
++    if (p != NULL)
++    {
++        // A zero-size get purges the pooled elements and returns NULL; not strictly needed but reclaims memory asap
++        (void)cma_pool_fixed_get(p, 0, false, false);
++        cma_pool_fixed_unref(p);
++    }
++}
++
++// Allocate an empty pool owned by the caller (ref count 1); NULL if out of memory
++static cma_pool_fixed_t*
++cma_pool_fixed_new(const unsigned int pool_size,
++                   const int flight_size,
++                   void * const alloc_v,
++                   cma_pool_alloc_fn * const alloc_fn, cma_pool_free_fn * const free_fn,
++                   cma_pool_on_delete_fn * const on_delete_fn,
++                   const char * const name)
++{
++    cma_pool_fixed_t * const fp = calloc(1, sizeof(*fp));
++    if (fp == NULL)
++        return NULL;
++
++    // Callbacks and the opaque argument they receive
++    fp->el_alloc_fn = alloc_fn;
++    fp->el_free_fn = free_fn;
++    fp->on_delete_fn = on_delete_fn;
++    fp->alloc_v = alloc_v;
++
++    // in_flight starts at -flight_size; get() only waits once it has risen above 0
++    fp->pool_size = pool_size;
++    fp->flight_size = flight_size;
++    fp->in_flight = -flight_size;
++    fp->name = (name != NULL) ? strdup(name) : NULL;
++
++    vlc_mutex_init(&fp->lock);
++    vlc_cond_init(&fp->flight_cond);
++    atomic_store(&fp->ref_count, 1);
++#if TRACE_ALL
++    fp->seq = atomic_fetch_add(&pool_seq, 1);
++#endif
++    return fp;
++}
++
++// ---------------------------------------------------------------------------
++//
++// CMA buffer functions - uses cma_pool_fixed for pooling
++
++struct cma_buf_pool_s {
++ cma_pool_fixed_t * pool; // generic pool holding the cma_buf_t elements
++ vcsm_init_type_t init_type; // result of cma_vcsm_init(); selects the mapping path and is passed to cma_vcsm_exit()
++
++ bool all_in_flight; // true: every allocated buffer counts as in flight; false: only after cma_buf_in_flight()
++#if TRACE_ALL
++ size_t alloc_n; // trace only: number of live buffers
++ size_t alloc_size; // trace only: total bytes of live buffers
++#endif
++};
++
++typedef struct cma_buf_s {
++ atomic_int ref_count; // 0 while idle in the pool or freshly allocated
++ cma_buf_pool_t * cbp; // owning buffer pool
++ bool in_flight; // true while this buffer is counted in the pool's in_flight
++ size_t size; // size in bytes requested at allocation
++ unsigned int vcsm_h; // VCSM handle from initial alloc
++ unsigned int vc_h; // VC handle for ZC mmal buffers
++ unsigned int vc_addr; // VC addr - unused by us but wanted by FFmpeg
++ int fd; // dmabuf handle for GL (-1 when not exported)
++ void * mmap; // ARM mapped address (MAP_FAILED when not mapped)
++ picture_context_t *ctx2; // optional secondary context, destroyed together with the buffer
++} cma_buf_t;
++
++static void cma_pool_delete(cma_buf_t * const cb)
++{
++    assert(atomic_load(&cb->ref_count) == 0);
++#if TRACE_ALL
++    cb->cbp->alloc_size -= cb->size;
++    --cb->cbp->alloc_n;
++    fprintf(stderr, "%s[%d:%s]: N=%zu, Total=%zu\n", __func__, cb->cbp->pool->seq, cb->cbp->pool->name, cb->cbp->alloc_n, cb->cbp->alloc_size);
++#endif
++    // Destroy one buffer; also used on the alloc failure path, so every step checks its "never acquired" sentinel
++    if (cb->ctx2 != NULL)
++        cb->ctx2->destroy(cb->ctx2);
++    // Undo the mapping the same way it was made: munmap for CMA, vcsm unlock otherwise
++    if (cb->mmap != MAP_FAILED)
++    {
++        if (cb->cbp->init_type == VCSM_INIT_CMA)
++            munmap(cb->mmap, cb->size);
++        else
++            vcsm_unlock_hdl(cb->vcsm_h);
++    }
++    if (cb->fd != -1)
++        close(cb->fd);
++    if (cb->vcsm_h != 0)
++        vcsm_free(cb->vcsm_h);
++    free(cb);
++}
++
++static void cma_pool_free_cb(void * v, void * el, size_t size)
++{
++    // Pool element destructor: el is a cma_buf_t; the pool argument and the size are not needed
++    VLC_UNUSED(size);
++    VLC_UNUSED(v);
++    cma_pool_delete(el);
++}
++
++static void * cma_pool_alloc_cb(void * v, size_t size)
++{
++    cma_buf_pool_t * const cbp = v;
++    // Pool element constructor: allocate, export and map one buffer of `size` bytes; NULL on any failure
++    cma_buf_t * const cb = malloc(sizeof(cma_buf_t));
++    if (cb == NULL)
++        return NULL;
++    // Start from "nothing acquired" sentinels so that cma_pool_delete can clean up a partial buffer
++    *cb = (cma_buf_t){
++        .ref_count = ATOMIC_VAR_INIT(0),
++        .cbp = cbp,
++        .in_flight = 0,
++        .size = size,
++        .vcsm_h = 0,
++        .vc_h = 0,
++        .fd = -1,
++        .mmap = MAP_FAILED,
++        .ctx2 = NULL
++    };
++#if TRACE_ALL
++    cb->cbp->alloc_size += cb->size;
++    ++cb->cbp->alloc_n;
++    fprintf(stderr, "%s[%d:%s]: N=%zu, Total=%zu\n", __func__, cbp->pool->seq, cbp->pool->name, cbp->alloc_n, cbp->alloc_size);
++#endif
++
++    // 0x80 is magic value to force full ARM-side mapping - otherwise
++    // cache requests can cause kernel crashes
++    if ((cb->vcsm_h = vcsm_malloc_cache(size, VCSM_CACHE_TYPE_HOST | 0x80, "VLC frame")) == 0)
++    {
++#if TRACE_ALL
++        fprintf(stderr, "vcsm_malloc_cache fail\n");
++#endif
++        goto fail;
++    }
++
++    if ((cb->vc_h = vcsm_vc_hdl_from_hdl(cb->vcsm_h)) == 0)
++    {
++#if TRACE_ALL
++        fprintf(stderr, "vcsm_vc_hdl_from_hdl fail\n");
++#endif
++        goto fail;
++    }
++    // CMA mode maps the exported dmabuf fd; otherwise the ARM address comes from vcsm_lock
++    if (cbp->init_type == VCSM_INIT_CMA)
++    {
++        if ((cb->fd = vcsm_export_dmabuf(cb->vcsm_h)) == -1)
++        {
++#if TRACE_ALL
++            fprintf(stderr, "vcsm_export_dmabuf fail\n");
++#endif
++            goto fail;
++        }
++
++        if ((cb->mmap = mmap(NULL, cb->size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, cb->fd, 0)) == MAP_FAILED)
++            goto fail;
++    }
++    else
++    {
++        void * arm_addr;
++        if ((arm_addr = vcsm_lock(cb->vcsm_h)) == NULL)
++        {
++#if TRACE_ALL
++            fprintf(stderr, "vcsm_lock fail\n");
++#endif
++            goto fail;
++        }
++        cb->mmap = arm_addr;
++    }
++
++    cb->vc_addr = vcsm_vc_addr_from_hdl(cb->vcsm_h);
++
++    return cb;
++
++fail:
++    cma_pool_delete(cb);
++    return NULL;
++}
++
++// Pool-death notification: the pool is gone, so vcsm can be left and the wrapper freed
++static void cma_buf_pool_on_delete_cb(void * v)
++{
++    cma_buf_pool_t * const dead_cbp = v;
++    const vcsm_init_type_t init_type = dead_cbp->init_type;
++    cma_vcsm_exit(init_type);
++    free(dead_cbp);
++}
++
++void cma_buf_pool_cancel(cma_buf_pool_t * const cbp)
++{
++    // Forward to the underlying pool; a NULL or pool-less cbp is silently ignored
++    const bool has_pool = cbp != NULL && cbp->pool != NULL;
++    if (has_pool)
++        cma_pool_fixed_cancel(cbp->pool);
++}
++
++void cma_buf_pool_uncancel(cma_buf_pool_t * const cbp)
++{
++    // Forward to the underlying pool; a NULL or pool-less cbp is silently ignored
++    const bool has_pool = cbp != NULL && cbp->pool != NULL;
++    if (has_pool)
++        cma_pool_fixed_uncancel(cbp->pool);
++}
++
++// User finished with pool: release the owner's hold on it.
++// Safe to call with NULL.
++void cma_buf_pool_delete(cma_buf_pool_t * const cbp)
++{
++    if (cbp == NULL)
++        return;
++
++    if (cbp->pool == NULL)
++    {
++        // No pool was ever created (error path): finish the cleanup ourselves
++        cma_buf_pool_on_delete_cb(cbp);
++        return;
++    }
++
++    // Drop our reference. cma_buf_pool_on_delete_cb runs when the pool finally
++    // dies (possibly right now) and frees cbp for us.
++    cma_pool_fixed_kill(cbp->pool);
++}
++
++int cma_buf_pool_fill(cma_buf_pool_t * const cbp, const size_t el_size)
++{
++ return cma_pool_fixed_fill(cbp->pool, el_size); // 0 on success, -ENOMEM if a buffer could not be obtained; cbp must not be NULL
++}
++
++int cma_buf_pool_resize(cma_buf_pool_t * const cbp,
++ const unsigned int new_pool_size, const int new_flight_size)
++{
++ return cma_pool_fixed_resize(cbp->pool, new_pool_size, new_flight_size); // 0 on success, -1 on allocation failure; cbp must not be NULL
++}
++
++// Create a CMA buffer pool; NULL if vcsm cannot be initialised or memory runs out
++cma_buf_pool_t * cma_buf_pool_new(const unsigned int pool_size, const unsigned int flight_size, const bool all_in_flight, const char * const name)
++{
++    vcsm_init_type_t const init_type = cma_vcsm_init();
++    if (init_type == VCSM_INIT_NONE)
++        return NULL;
++
++    cma_buf_pool_t * const cbp = calloc(1, sizeof(cma_buf_pool_t));
++    if (cbp == NULL) {
++        cma_vcsm_exit(init_type); // balance the cma_vcsm_init above, nobody else will
++        return NULL;
++    }
++    cbp->init_type = init_type;
++    cbp->all_in_flight = all_in_flight;
++
++    if ((cbp->pool = cma_pool_fixed_new(pool_size, flight_size, cbp, cma_pool_alloc_cb, cma_pool_free_cb, cma_buf_pool_on_delete_cb, name)) == NULL) {
++        cma_buf_pool_delete(cbp); // no pool yet: this exits vcsm and frees cbp directly
++        return NULL;
++    }
++    return cbp;
++}
++
++
++void cma_buf_in_flight(cma_buf_t * const cb)
++{
++    // Pools that count every buffer as in flight at alloc time need no per-buffer marking
++    if (cb->cbp->all_in_flight)
++        return;
++    assert(!cb->in_flight);
++    cb->in_flight = true;
++    cma_pool_fixed_inc_in_flight(cb->cbp->pool);
++}
++
++void cma_buf_end_flight(cma_buf_t * const cb)
++{
++    // No-op for NULL, for all_in_flight pools, and for buffers not currently marked in flight
++    if (cb == NULL || cb->cbp->all_in_flight || !cb->in_flight)
++        return;
++    cb->in_flight = false;
++    cma_pool_fixed_dec_in_flight(cb->cbp->pool);
++}
++
++
++// Return vcsm handle (cb must not be NULL)
++unsigned int cma_buf_vcsm_handle(const cma_buf_t * const cb)
++{
++ return cb->vcsm_h;
++}
++// Return the buffer size in bytes as requested at allocation (cb must not be NULL)
++size_t cma_buf_size(const cma_buf_t * const cb)
++{
++ return cb->size;
++}
++
++int cma_buf_add_context2(cma_buf_t *const cb, picture_context_t * const ctx2)
++{
++    // Only one secondary context may be attached; a second attempt is refused
++    const bool slot_free = (cb->ctx2 == NULL);
++    if (slot_free)
++        cb->ctx2 = ctx2;
++    return slot_free ? VLC_SUCCESS : VLC_EGENERIC;
++}
++
++unsigned int cma_buf_vc_handle(const cma_buf_t *const cb)
++{
++ return cb->vc_h; // VC handle obtained from the vcsm handle at allocation
++}
++
++int cma_buf_fd(const cma_buf_t *const cb)
++{
++ return cb->fd; // dmabuf fd, or -1 if none was exported
++}
++
++void * cma_buf_addr(const cma_buf_t *const cb)
++{
++ return cb->mmap; // ARM-side mapped address
++}
++
++unsigned int cma_buf_vc_addr(const cma_buf_t *const cb)
++{
++ return cb->vc_addr; // VC-side address obtained from the vcsm handle at allocation
++}
++
++// Return the secondary context attached with cma_buf_add_context2, or NULL
++picture_context_t * cma_buf_context2(const cma_buf_t *const cb)
++{
++ return cb->ctx2;
++}
++
++
++void cma_buf_unref(cma_buf_t * const cb)
++{
++    // Drop one reference (NULL is ignored); the last one hands the buffer back to its pool
++    if (cb == NULL || atomic_fetch_sub(&cb->ref_count, 1) > 1)
++        return;
++
++    // Clear the per-buffer flag first; the put adjusts the pool's own count
++    const bool flight_state = cb->in_flight;
++    cb->in_flight = false;
++    cma_pool_fixed_put(cb->cbp->pool, cb, cb->size, flight_state);
++}
++
++cma_buf_t * cma_buf_ref(cma_buf_t * const cb)
++{
++    // NULL-tolerant; returns its argument so that calls can be nested
++    if (cb != NULL)
++        atomic_fetch_add(&cb->ref_count, 1);
++    return cb;
++}
++
++cma_buf_t * cma_buf_pool_alloc_buf(cma_buf_pool_t * const cbp, const size_t size)
++{
++    const bool track_flight = cbp->all_in_flight;
++    cma_buf_t * const buf = cma_pool_fixed_get(cbp->pool, size, track_flight, false);
++
++    if (buf != NULL)
++        buf->in_flight = track_flight;
++
++    // Fresh or pooled buffers arrive with a ref count of 0, so take the first
++    // reference here; cma_buf_ref passes a NULL result straight through
++    return cma_buf_ref(buf);
++}
++
+--- /dev/null
++++ b/modules/hw/mmal/mmal_cma.h
+@@ -0,0 +1,71 @@
++#ifndef VLC_MMAL_MMAL_CMA_H_
++#define VLC_MMAL_MMAL_CMA_H_
++
++
++struct cma_pool_fixed_s;
++typedef struct cma_pool_fixed_s cma_pool_fixed_t;
++
++typedef void * cma_pool_alloc_fn(void * v, size_t size);
++typedef void cma_pool_free_fn(void * v, void * el, size_t size);
++typedef void cma_pool_on_delete_fn(void * v); // called with the pool's alloc_v once the pool has been freed
++
++#if 0
++void cma_pool_fixed_unref(cma_pool_fixed_t * const p);
++void cma_pool_fixed_ref(cma_pool_fixed_t * const p);
++void * cma_pool_fixed_get(cma_pool_fixed_t * const p, const size_t req_el_size, const bool in_flight); // NOTE: stale - the definition in mmal_cma.c also takes a no_pool argument
++void cma_pool_fixed_put(cma_pool_fixed_t * const p, void * v, const size_t el_size, const bool was_in_flight);
++void cma_pool_fixed_inc_in_flight(cma_pool_fixed_t * const p);
++void cma_pool_fixed_dec_in_flight(cma_pool_fixed_t * const p);
++void cma_pool_fixed_cancel(cma_pool_fixed_t * const p);
++void cma_pool_fixed_uncancel(cma_pool_fixed_t * const p);
++void cma_pool_fixed_kill(cma_pool_fixed_t * const p);
++int cma_pool_fixed_resize(cma_pool_fixed_t * const p,
++ const unsigned int new_pool_size, const int new_flight_size);
++cma_pool_fixed_t * cma_pool_fixed_new(const unsigned int pool_size,
++ const int flight_size,
++ void * const alloc_v,
++ cma_pool_alloc_fn * const alloc_fn, cma_pool_free_fn * const free_fn,
++ cma_pool_on_delete_fn * const on_delete_fn,
++ const char * const name);
++#endif
++
++struct cma_buf_s;
++typedef struct cma_buf_s cma_buf_t;
++
++void cma_buf_in_flight(cma_buf_t * const cb); // mark cb as in flight (no-op for all_in_flight pools)
++void cma_buf_end_flight(cma_buf_t * const cb); // undo cma_buf_in_flight; NULL is accepted
++unsigned int cma_buf_vcsm_handle(const cma_buf_t * const cb);
++size_t cma_buf_size(const cma_buf_t * const cb);
++int cma_buf_add_context2(cma_buf_t *const cb, picture_context_t * const ctx2); // VLC_EGENERIC if a context is already attached
++unsigned int cma_buf_vc_handle(const cma_buf_t *const cb);
++int cma_buf_fd(const cma_buf_t *const cb);
++void * cma_buf_addr(const cma_buf_t *const cb);
++unsigned int cma_buf_vc_addr(const cma_buf_t *const cb);
++picture_context_t * cma_buf_context2(const cma_buf_t *const cb);
++
++void cma_buf_unref(cma_buf_t * const cb); // NULL is accepted; the last unref returns the buffer to its pool
++cma_buf_t * cma_buf_ref(cma_buf_t * const cb); // NULL is accepted; returns cb
++
++struct cma_buf_pool_s;
++typedef struct cma_buf_pool_s cma_buf_pool_t;
++
++cma_buf_t * cma_buf_pool_alloc_buf(cma_buf_pool_t * const p, const size_t size); // returns a buffer holding one reference, or NULL
++void cma_buf_pool_cancel(cma_buf_pool_t * const cbp);
++void cma_buf_pool_uncancel(cma_buf_pool_t * const cbp);
++void cma_buf_pool_delete(cma_buf_pool_t * const p); // NULL is accepted
++int cma_buf_pool_fill(cma_buf_pool_t * const cbp, const size_t el_size);
++int cma_buf_pool_resize(cma_buf_pool_t * const cbp,
++ const unsigned int new_pool_size, const int new_flight_size);
++cma_buf_pool_t * cma_buf_pool_new(const unsigned int pool_size, const unsigned int flight_size,
++ const bool all_in_flight, const char * const name);
++
++static inline void cma_buf_pool_deletez(cma_buf_pool_t ** const pp)
++{
++    cma_buf_pool_t * const pool = *pp;
++    if (pool == NULL)
++        return; // Slot already empty - nothing to delete
++    *pp = NULL; // Clear the caller's pointer before deleting the pool
++    cma_buf_pool_delete(pool);
++}
++
++#endif // VLC_MMAL_MMAL_CMA_H_
+--- /dev/null
++++ b/modules/hw/mmal/mmal_gl.h
+@@ -0,0 +1,45 @@
++// Trim this include list!
++
++#include <libdrm/drm.h>
++#include <libdrm/drm_mode.h>
++#include <libdrm/drm_fourcc.h>
++//#include <xf86drm.h>
++//#include <xf86drmMode.h>
++#include <X11/Xlib.h>
++#include <X11/Xutil.h>
++#include <X11/Xlib-xcb.h>
++#include <epoxy/gl.h>
++#include <epoxy/egl.h>
++#include <xcb/xcb.h>
++#include <xcb/dri3.h>
++
++struct mmal_gl_converter_s;
++
++typedef struct cma_buf_s {
++ struct mmal_gl_converter_s * sys;
++
++ size_t size;
++ __u32 h_dumb;
++ int fd;
++ unsigned int h_vcsm;
++ void * mapped_addr;
++ GLuint texture;
++} cma_buf_t;
++
++typedef struct cma_pic_sys_s {
++ cma_buf_t * cmabuf;
++} cma_pic_sys_t;
++
++static inline unsigned int
++hw_mmal_h_vcsm(const picture_t * const pic)
++{
++    const cma_pic_sys_t *const sys = (cma_pic_sys_t *)pic->p_sys;
++
++    // A handle exists only for MMAL GL RGB32 pics with a CMA buffer attached
++    if (pic->format.i_chroma == VLC_CODEC_MMAL_GL_RGB32 &&
++        sys != NULL && sys->cmabuf != NULL)
++        return sys->cmabuf->h_vcsm;
++
++    return 0; // No handle available
++}
++
+--- /dev/null
++++ b/modules/hw/mmal/mmal_piccpy_neon.S
+@@ -0,0 +1,105 @@
++// Copy pix
++
++ .syntax unified
++ .arm
++// .thumb
++ .text
++ .balign 16 @ 16-byte alignment (on ARM .align N means 2^N bytes, so .align 16 was 64KiB)
++ .arch armv7-a
++ .fpu neon-vfpv4
++
++
++.macro function name
++ .global \name
++#ifdef __ELF__
++ .type \name, %function
++#endif
++\name:
++.endm
++
++
++.macro piccpy_to_8, bit_depth
++    subs        r2, #128                @ r0 = dest (8-bit), r1 = src (16-bit), r2 = pels
++    vpush       {q4-q7}                 @ q4-q7 are used as scratch below and restored at exit
++    blt         2f                      @ fewer than 128 pels: tail handling only
++1:                                      @ main loop: 128 pels per pass
++    vldm        r1!, {q0-q7}
++    subs        r2, #128
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vqrshrn.u16 d1, q1, #\bit_depth - 8
++    vqrshrn.u16 d2, q2, #\bit_depth - 8
++    vqrshrn.u16 d3, q3, #\bit_depth - 8
++    vldm        r1!, {q8-q15}
++    vqrshrn.u16 d4, q4, #\bit_depth - 8
++    vqrshrn.u16 d5, q5, #\bit_depth - 8
++    vqrshrn.u16 d6, q6, #\bit_depth - 8
++    vqrshrn.u16 d7, q7, #\bit_depth - 8
++    vqrshrn.u16 d8, q8, #\bit_depth - 8
++    vqrshrn.u16 d9, q9, #\bit_depth - 8
++    vqrshrn.u16 d10, q10, #\bit_depth - 8
++    vqrshrn.u16 d11, q11, #\bit_depth - 8
++    vqrshrn.u16 d12, q12, #\bit_depth - 8
++    vqrshrn.u16 d13, q13, #\bit_depth - 8
++    vqrshrn.u16 d14, q14, #\bit_depth - 8
++    vqrshrn.u16 d15, q15, #\bit_depth - 8
++    vstm        r0!, {q0-q7}
++    bge         1b
++2:                                      @ r2 is negative here; its low 7 bits equal the leftover pel count
++    tst         r2, #64                 @ each tail chunk is taken only if its bit is set in the leftover
++    beq         1f
++    @ 64-pel chunk
++    vldm        r1!, {q0-q7}
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vqrshrn.u16 d1, q1, #\bit_depth - 8
++    vqrshrn.u16 d2, q2, #\bit_depth - 8
++    vqrshrn.u16 d3, q3, #\bit_depth - 8
++    vqrshrn.u16 d4, q4, #\bit_depth - 8
++    vqrshrn.u16 d5, q5, #\bit_depth - 8
++    vqrshrn.u16 d6, q6, #\bit_depth - 8
++    vqrshrn.u16 d7, q7, #\bit_depth - 8
++    vstm        r0!, {q0-q3}
++1:
++    tst         r2, #32
++    beq         1f
++    @ 32-pel chunk
++    vldm        r1!, {q0-q3}
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vqrshrn.u16 d1, q1, #\bit_depth - 8
++    vqrshrn.u16 d2, q2, #\bit_depth - 8
++    vqrshrn.u16 d3, q3, #\bit_depth - 8
++    vstm        r0!, {q0-q1}
++1:
++    tst         r2, #16
++    beq         1f
++    @ 16-pel chunk
++    vldm        r1!, {q0-q1}
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vqrshrn.u16 d1, q1, #\bit_depth - 8
++    vstm        r0!, {q0}
++1:
++    tst         r2, #8
++    beq         1f
++    @ 8-pel chunk
++    vldm        r1!, {q0}
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vstr        d0, [r0]
++    add         r0, #8
++1:
++    tst         r2, #4
++    beq         1f
++    @ final 4-pel chunk; a leftover of 1-3 pels is not copied
++    vldr        d0, [r1]
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vstr        s0, [r0]
++1:
++    vpop        {q4-q7}
++    bx          lr
++.endm
++
++
++@ [r0] Dest
++@ [r1] Src
++@ r2 Pels
++function mmal_piccpy_10_to_8_neon
++ piccpy_to_8 10
++
+--- a/modules/hw/mmal/mmal_picture.c
++++ b/modules/hw/mmal/mmal_picture.c
+@@ -21,25 +21,1542 @@
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
++// We would really like to use vlc_thread.h but the detach thread stuff can't be
++// used here :-(
++#include <pthread.h>
++
++#include <stdatomic.h>
++#include <unistd.h>
++#include <fcntl.h>
++
+ #include <vlc_common.h>
++#include <vlc_cpu.h>
+ #include <vlc_picture.h>
++
++#pragma GCC diagnostic push
++#pragma GCC diagnostic ignored "-Wbad-function-cast"
++#include <bcm_host.h>
++#pragma GCC diagnostic pop
+ #include <interface/mmal/mmal.h>
++#include <interface/mmal/util/mmal_util.h>
++#include <interface/mmal/util/mmal_default_components.h>
++#include <interface/vmcs_host/vcgencmd.h>
++#include <interface/vcsm/user-vcsm.h>
+
++#include "mmal_cma.h"
+ #include "mmal_picture.h"
++#include "transform_ops.h"
++
++#define TRACE_TRANSFORMS 0
++
++#define UINT64_SIZE(s) (((s) + sizeof(uint64_t) - 1)/sizeof(uint64_t))
++
++static inline char safe_char(const unsigned int c0)
++{
++    const unsigned int low = c0 & 0xff; // Only the bottom byte is examined
++    return (low <= ' ' || low >= 0x7f) ? '.' : low; // Space & non-printables become '.'
++}
++
++const char * str_fourcc(char * const buf, const unsigned int fcc)
++{
++    unsigned int i;
++    if (fcc == 0)
++        return "----"; // Zero code: fixed placeholder, buf untouched
++    // Least significant byte first; buf needs room for 4 chars + terminator
++    for (i = 0; i != 4; ++i)
++        buf[i] = safe_char(fcc >> (i * 8));
++    buf[4] = '\0';
++    return buf;
++}
++
++// WB + Inv
++static inline void flush_range(void * const start, const size_t len)
++{ // Issues a single vcsm_clean_invalid2() request describing [start, start + len)
++ uint64_t buf[UINT64_SIZE(sizeof(struct vcsm_user_clean_invalid2_s) + sizeof(struct vcsm_user_clean_invalid2_block_s))];
++ struct vcsm_user_clean_invalid2_s * const b = (struct vcsm_user_clean_invalid2_s *)buf;
++ // buf is uint64_t storage sized for the request header plus one block entry
++ *b = (struct vcsm_user_clean_invalid2_s){
++ .op_count = 1
++ };
++
++ b->s[0] = (struct vcsm_user_clean_invalid2_block_s){
++ .invalidate_mode = 3, // wb + invalidate
++ .block_count = 1,
++ .start_address = start, // Rely on clean inv to fix up align & size boundaries
++ .block_size = len,
++ .inter_block_stride = 0
++ };
++
++ vcsm_clean_invalid2(b);
++}
++
++MMAL_FOURCC_T vlc_to_mmal_color_space(const video_color_space_t vlc_cs)
++{
++    // Only BT.601 and BT.709 have a direct MMAL counterpart here
++    if (vlc_cs == COLOR_SPACE_BT601)
++        return MMAL_COLOR_SPACE_ITUR_BT601;
++
++    if (vlc_cs == COLOR_SPACE_BT709)
++        return MMAL_COLOR_SPACE_ITUR_BT709;
++
++
++    // Every other VLC colour space is reported as unknown
++    return MMAL_COLOR_SPACE_UNKNOWN;
++}
++
++MMAL_FOURCC_T vlc_to_mmal_video_fourcc(const video_frame_format_t * const vf_vlc)
++{ // Maps a VLC chroma (plus RGB masks where needed) to an MMAL encoding; 0 if none matches
++ switch (vf_vlc->i_chroma) {
++ case VLC_CODEC_MMAL_ZC_RGB32:
++ case VLC_CODEC_RGB32:
++ {
++ // VLC RGB32 aka RV32 means we have to look at the mask values
++ const uint32_t r = vf_vlc->i_rmask;
++ const uint32_t g = vf_vlc->i_gmask;
++ const uint32_t b = vf_vlc->i_bmask;
++ if (r == 0xff0000 && g == 0xff00 && b == 0xff)
++ return MMAL_ENCODING_BGRA;
++ if (r == 0xff && g == 0xff00 && b == 0xff0000)
++ return MMAL_ENCODING_RGBA;
++ if (r == 0xff000000 && g == 0xff0000 && b == 0xff00)
++ return MMAL_ENCODING_ABGR;
++ if (r == 0xff00 && g == 0xff0000 && b == 0xff000000)
++ return MMAL_ENCODING_ARGB;
++ break; // Mask layout not recognised - falls out to the final return 0
++ }
++ case VLC_CODEC_RGB16:
++ {
++ // VLC RGB16 aka RV16 means we have to look at the mask values
++ const uint32_t r = vf_vlc->i_rmask;
++ const uint32_t g = vf_vlc->i_gmask;
++ const uint32_t b = vf_vlc->i_bmask;
++ if (r == 0xf800 && g == 0x7e0 && b == 0x1f)
++ return MMAL_ENCODING_RGB16;
++ break; // Only the 5-6-5 mask layout above is accepted
++ }
++ case VLC_CODEC_I420:
++ case VLC_CODEC_MMAL_ZC_I420:
++ return MMAL_ENCODING_I420;
++ case VLC_CODEC_RGBA:
++ return MMAL_ENCODING_RGBA;
++ case VLC_CODEC_BGRA:
++ return MMAL_ENCODING_BGRA;
++ case VLC_CODEC_ARGB:
++ return MMAL_ENCODING_ARGB;
++ // VLC_CODEC_ABGR does not exist in VLC
++ case VLC_CODEC_MMAL_OPAQUE:
++ return MMAL_ENCODING_OPAQUE;
++ case VLC_CODEC_MMAL_ZC_SAND8:
++ return MMAL_ENCODING_YUVUV128;
++ case VLC_CODEC_MMAL_ZC_SAND10:
++ return MMAL_ENCODING_YUVUV64_10;
++ case VLC_CODEC_MMAL_ZC_SAND30:
++ return MMAL_ENCODING_YUV10_COL;
++ default:
++ break;
++ }
++ return 0; // No matching MMAL encoding
++}
++
++static void vlc_fmt_to_video_format(MMAL_VIDEO_FORMAT_T *const vf_mmal, const video_frame_format_t * const vf_vlc)
++{
++    // I420 (plain or zero-copy) widths round up to 32, everything else to 16
++    const bool is_i420 = vf_vlc->i_chroma == VLC_CODEC_MMAL_ZC_I420 ||
++                         vf_vlc->i_chroma == VLC_CODEC_I420;
++    const unsigned int wmask = is_i420 ? 31 : 15;
++    // A zero in either SAR term falls back to square pixels (1:1)
++    const bool sar_valid = vf_vlc->i_sar_num != 0 && vf_vlc->i_sar_den != 0;
++
++    vf_mmal->width = (vf_vlc->i_width + wmask) & ~wmask;
++    vf_mmal->height = (vf_vlc->i_height + 15) & ~15; // Height always rounds up to 16
++    vf_mmal->crop.x = vf_vlc->i_x_offset;
++    vf_mmal->crop.y = vf_vlc->i_y_offset;
++    vf_mmal->crop.width = vf_vlc->i_visible_width;
++    vf_mmal->crop.height = vf_vlc->i_visible_height;
++
++    vf_mmal->par.num = sar_valid ? vf_vlc->i_sar_num : 1;
++    vf_mmal->par.den = sar_valid ? vf_vlc->i_sar_den : 1;
++    vf_mmal->frame_rate.num = vf_vlc->i_frame_rate;
++    vf_mmal->frame_rate.den = vf_vlc->i_frame_rate_base;
++    vf_mmal->color_space = vlc_to_mmal_color_space(vf_vlc->space);
++}
++
++
++void hw_mmal_vlc_fmt_to_mmal_fmt(MMAL_ES_FORMAT_T *const es_fmt, const video_frame_format_t * const vf_vlc)
++{ // Fills only es_fmt->es->video; encoding and other es_fmt fields are left untouched
++ vlc_fmt_to_video_format(&es_fmt->es->video, vf_vlc);
++}
++
++bool hw_mmal_vlc_pic_to_mmal_fmt_update(MMAL_ES_FORMAT_T *const es_fmt, const picture_t * const pic)
++{ // Returns true (after overwriting es_fmt's video format) only when a compared field differs
++ MMAL_VIDEO_FORMAT_T vf_new_ss;
++ MMAL_VIDEO_FORMAT_T *const vf_old = &es_fmt->es->video;
++ MMAL_VIDEO_FORMAT_T *const vf_new = &vf_new_ss;
++
++ vlc_fmt_to_video_format(vf_new, &pic->format);
++
++ // If we have a format that might have come from ffmpeg then rework for
++ // a better guess as to layout. All sand stuff is "special" with regards to
++ // width/height vs real layout so leave as is if that
++ if ((pic->format.i_chroma == VLC_CODEC_MMAL_ZC_I420 ||
++ pic->format.i_chroma == VLC_CODEC_MMAL_ZC_RGB32) &&
++ pic->p[0].i_pixel_pitch != 0)
++ {
++ // Now overwrite width/height with a better guess as to actual layout info
++ vf_new->height = pic->p[0].i_lines;
++ vf_new->width = pic->p[0].i_pitch / pic->p[0].i_pixel_pitch;
++ }
++
++ if (
++ vf_new->width != vf_old->width ||
++ vf_new->height != vf_old->height ||
++ vf_new->crop.x != vf_old->crop.x ||
++ vf_new->crop.y != vf_old->crop.y ||
++ vf_new->crop.width != vf_old->crop.width ||
++ vf_new->crop.height != vf_old->crop.height ||
++ vf_new->par.num != vf_old->par.num ||
++ vf_new->par.den != vf_old->par.den ||
++ // Frame rate ignored
++ vf_new->color_space != vf_old->color_space)
++ {
++#if 0
++ char dbuf0[5], dbuf1[5];
++ printf("%dx%d (%d,%d %dx%d) par:%d/%d %s -> %dx%d (%d,%d %dx%d) par:%d/%d %s\n",
++ vf_old->width ,
++ vf_old->height ,
++ vf_old->crop.x ,
++ vf_old->crop.y ,
++ vf_old->crop.width ,
++ vf_old->crop.height ,
++ vf_old->par.num ,
++ vf_old->par.den ,
++ str_fourcc(dbuf0, vf_old->color_space) ,
++ vf_new->width ,
++ vf_new->height ,
++ vf_new->crop.x ,
++ vf_new->crop.y ,
++ vf_new->crop.width ,
++ vf_new->crop.height ,
++ vf_new->par.num ,
++ vf_new->par.den ,
++ str_fourcc(dbuf1, vf_new->color_space) );
++#endif
++ *vf_old = *vf_new; // Whole struct is copied, so the (uncompared) frame rate is updated too
++ return true;
++ }
++ return false;
++}
++
++
++hw_mmal_port_pool_ref_t * hw_mmal_port_pool_ref_create(MMAL_PORT_T * const port,
++    const unsigned int headers, const uint32_t payload_size)
++{
++    // Zeroed wrapper; on success it is returned holding a single reference
++    hw_mmal_port_pool_ref_t * const ref = calloc(1, sizeof(*ref));
++    if (ref == NULL)
++        return NULL;
++    ref->pool = mmal_port_pool_create(port, headers, payload_size);
++    if (ref->pool == NULL) {
++        // No pool - discard the wrapper and report failure
++        free(ref);
++        return NULL;
++    }
++
++    ref->port = port;
++    atomic_store(&ref->refs, 1);
++    return ref;
++}
++
++static void do_detached(void *(*fn)(void *), void * v) // Run fn(v) on a new, detached thread
++{ // If the thread cannot be created fn is never run; nothing is reported to the caller
++    pthread_t dothread;
++    if (pthread_create(&dothread, NULL, fn, v) == 0) // dothread is only valid on success
++        pthread_detach(dothread);
++}
++
++// Destroy a ppr - arranged s.t. it has the correct prototype for a pthread
++static void * kill_ppr(void * v)
++{
++    hw_mmal_port_pool_ref_t * const ref = v;
++    MMAL_PORT_T * const port = ref->port;
++    if (port->is_enabled) mmal_port_disable(port); // Avoid annoyed messages from MMAL when we kill the pool
++    mmal_port_pool_destroy(port, ref->pool);
++    free(ref);
++    return NULL; // Value unused; present only to fit the thread entry signature
++}
++
++void hw_mmal_port_pool_ref_release(hw_mmal_port_pool_ref_t * const ppr, const bool in_cb)
++{
++    // NULL is tolerated; only dropping the final reference destroys the ppr
++    if (ppr == NULL || atomic_fetch_sub(&ppr->refs, 1) != 1)
++        return;
++
++    if (!in_cb)
++        kill_ppr(ppr);
++    else
++        do_detached(kill_ppr, ppr); // In a callback: tear down on a detached thread
++}
++
++// Put buffer in port if possible - if not then release to pool
++// Returns true if sent, false if recycled
++bool hw_mmal_port_pool_ref_recycle(hw_mmal_port_pool_ref_t * const ppr, MMAL_BUFFER_HEADER_T * const buf)
++{
++    // Wipe header state and any stale user data before reuse
++    mmal_buffer_header_reset(buf);
++    buf->user_data = NULL;
++    const bool sent = mmal_port_send_buffer(ppr->port, buf) == MMAL_SUCCESS;
++    if (!sent)
++        mmal_buffer_header_release(buf); // Port refused it - release the header instead
++    return sent;
++}
++
++MMAL_STATUS_T hw_mmal_port_pool_ref_fill(hw_mmal_port_pool_ref_t * const ppr)
++{
++    // Drain the pool queue into the port; stop at the first send failure
++    for (;;) {
++        MMAL_BUFFER_HEADER_T * const buf = mmal_queue_get(ppr->pool->queue);
++        if (buf == NULL)
++            return MMAL_SUCCESS; // Queue exhausted with every send accepted
++        const MMAL_STATUS_T err = mmal_port_send_buffer(ppr->port, buf);
++        if (err == MMAL_SUCCESS)
++            continue;
++        mmal_queue_put_back(ppr->pool->queue, buf); // Rejected buffer goes back on the queue
++        return err;
++    }
++}
++
++
++MMAL_STATUS_T hw_mmal_opaque_output(vlc_object_t * const obj,
++    hw_mmal_port_pool_ref_t ** pppr,
++    MMAL_PORT_T * const port,
++    const unsigned int extra_buffers, MMAL_PORT_BH_CB_T callback)
++{
++    MMAL_STATUS_T status;
++    // Sets port up for zero-copy opaque output, stores its pool in *pppr and enables it
++    port->userdata = (struct MMAL_PORT_USERDATA_T *)obj;
++
++    status = port_parameter_set_uint32(port, MMAL_PARAMETER_EXTRA_BUFFERS, extra_buffers);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(obj, "Failed to set MMAL_PARAMETER_EXTRA_BUFFERS on output port (status=%"PRIx32" %s)",
++                status, mmal_status_to_string(status));
++        return status;
++    }
++
++    status = port_parameter_set_bool(port, MMAL_PARAMETER_ZERO_COPY, 1);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(obj, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
++                port->name, status, mmal_status_to_string(status));
++        return status;
++    }
++
++    port->format->encoding = MMAL_ENCODING_OPAQUE;
++    port->format->encoding_variant = 0;
++    if ((status = mmal_port_format_commit(port)) != MMAL_SUCCESS)
++    {
++        msg_Err(obj, "Failed to commit format on port %s (status=%"PRIx32" %s)",
++                port->name, status, mmal_status_to_string(status));
++        return status;
++    }
++
++    port->buffer_num = 30;
++    port->buffer_size = port->buffer_size_recommended;
++    // status is MMAL_SUCCESS at this point, so a pool failure needs its own error code
++    if ((*pppr = hw_mmal_port_pool_ref_create(port, port->buffer_num, port->buffer_size)) == NULL) {
++        msg_Err(obj, "Failed to create output pool");
++        return MMAL_ENOMEM;
++    }
++
++    status = mmal_port_enable(port, callback);
++    if (status != MMAL_SUCCESS) {
++        hw_mmal_port_pool_ref_release(*pppr, false); // Not in a callback here
++        *pppr = NULL;
++        msg_Err(obj, "Failed to enable output port %s (status=%"PRIx32" %s)",
++                port->name, status, mmal_status_to_string(status));
++        return status;
++    }
++
++    return MMAL_SUCCESS;
++}
++
++
++void hw_mmal_pic_ctx_destroy(picture_context_t * pic_ctx_cmn)
++{
++    pic_ctx_mmal_t * const mctx = (pic_ctx_mmal_t *)pic_ctx_cmn;
++
++    // Drop our hold on every MMAL buffer header that is actually present
++    for (unsigned int n = 0; n != mctx->buf_count; ++n) {
++        MMAL_BUFFER_HEADER_T * const hdr = mctx->bufs[n];
++        if (hdr != NULL)
++            mmal_buffer_header_release(hdr);
++    }
++    // The CMA buffer is taken out of flight before our reference to it goes
++    cma_buf_end_flight(mctx->cb);
++    cma_buf_unref(mctx->cb);
++    free(mctx);
++}
++
++picture_context_t * hw_mmal_pic_ctx_copy(picture_context_t * pic_ctx_cmn)
++{
++    const pic_ctx_mmal_t * const from = (pic_ctx_mmal_t *)pic_ctx_cmn;
++    pic_ctx_mmal_t * const to = calloc(1, sizeof(*to));
++
++    if (to == NULL)
++        return NULL;
++
++    // The common part is copied by value; the CMA buffer gains a reference
++    to->cmn = from->cmn;
++    to->cb = cma_buf_ref(from->cb);
++
++    // Share the same MMAL headers, taking a reference on each one present
++    to->buf_count = from->buf_count;
++    for (unsigned int n = 0; n != from->buf_count; ++n) {
++        MMAL_BUFFER_HEADER_T * const hdr = from->bufs[n];
++        to->bufs[n] = hdr;
++        if (hdr != NULL)
++            mmal_buffer_header_acquire(hdr);
++    }
++
++    return &to->cmn;
++}
++
++static MMAL_BOOL_T
++buf_pre_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata)
++{ // Pre-release hook installed by hw_mmal_gen_context(); userdata is the ppr ref taken there
++ hw_mmal_port_pool_ref_t * const ppr = userdata;
++
++ // Kill the callback - otherwise we will go in circles!
++ mmal_buffer_header_pre_release_cb_set(buf, (MMAL_BH_PRE_RELEASE_CB_T)0, NULL);
++ mmal_buffer_header_acquire(buf); // Ref it again
++
++ // As we have re-acquired the buffer we need a full release
++ // (not continue) to zap the ref count back to zero
++ // This is "safe" 'cos we have already reset the cb
++ hw_mmal_port_pool_ref_recycle(ppr, buf);
++ hw_mmal_port_pool_ref_release(ppr, true); // Assume in callback
++
++ return MMAL_TRUE;
++}
++
++// Buffer belongs to context on successful return from this fn
++// is still valid on failure
++picture_context_t *
++hw_mmal_gen_context(MMAL_BUFFER_HEADER_T * buf, hw_mmal_port_pool_ref_t * const ppr)
++{
++    pic_ctx_mmal_t * const mctx = calloc(1, sizeof(*mctx));
++
++    if (mctx == NULL)
++        return NULL;
++
++    // A fresh context wraps exactly one buffer header
++    mctx->cmn.copy = hw_mmal_pic_ctx_copy;
++    mctx->cmn.destroy = hw_mmal_pic_ctx_destroy;
++    mctx->buf_count = 1;
++    mctx->bufs[0] = buf;
++
++    // With a ppr: hold a ref on it and route buffer release via our callback
++    if (ppr == NULL)
++        return &mctx->cmn;
++
++    hw_mmal_port_pool_ref_acquire(ppr);
++    mmal_buffer_header_pre_release_cb_set(buf, buf_pre_release_cb, ppr);
++    buf->user_data = NULL;
++    return &mctx->cmn;
++}
++
++// n is els
++// * Make NEON!
++typedef void piccpy_fn(void * dest, const void * src, size_t n);
++
++extern piccpy_fn mmal_piccpy_10_to_8_neon;
++
++static void piccpy_10_to_8_c(void * dest, const void * src, size_t n)
++{
++    uint8_t * const out = dest;
++    const uint16_t * const in = src;
++    for (size_t i = 0; i != n; ++i)
++        out[i] = in[i] >> 2; // Drop the 2 low bits (truncates, no rounding)
++}
++
++// Do a stride converting copy - if the strides are the same and line_len is
++// close then do a single block copy - we don't expect to have to preserve
++// pixels in the output frame
++static void mem_copy_2d(uint8_t * d_ptr, const size_t d_stride,
++    const uint8_t * s_ptr, const size_t s_stride,
++    size_t lines, const size_t line_len)
++{
++    // Matching strides with under 32 bytes of padding: copy as one block
++    if (s_stride == d_stride && d_stride < line_len + 32) {
++        memcpy(d_ptr, s_ptr, d_stride * lines);
++        return;
++    }
++
++    // Otherwise go a row at a time, stepping each side by its own stride
++    for (; lines != 0; --lines) {
++        memcpy(d_ptr, s_ptr, line_len);
++        d_ptr += d_stride;
++        s_ptr += s_stride;
++    }
++}
++
++// line_len in D units
++static void mem_copy_2d_10_to_8(uint8_t * d_ptr, const size_t d_stride,
++    const uint8_t * s_ptr, const size_t s_stride,
++    size_t lines, const size_t line_len)
++{
++    // NEON routine when the CPU reports NEON, plain C otherwise
++    piccpy_fn * const convert = vlc_CPU_ARM_NEON() ? mmal_piccpy_10_to_8_neon : piccpy_10_to_8_c;
++
++    // Source stride exactly twice the dest stride and little padding: one pass
++    if (s_stride == d_stride * 2 && d_stride < line_len + 32) {
++        convert(d_ptr, s_ptr, d_stride * lines);
++        return;
++    }
++    for (; lines != 0; --lines) {
++        convert(d_ptr, s_ptr, line_len);
++        d_ptr += d_stride;
++        s_ptr += s_stride;
++    }
++}
++
++
++int hw_mmal_copy_pic_to_buf(void * const buf_data,
++ uint32_t * const pLength,
++ const MMAL_ES_FORMAT_T * const fmt,
++ const picture_t * const pic)
++{ // Copies pic's three planes into buf_data as packed I420; *pLength (if given) gets the byte count
++ const MMAL_VIDEO_FORMAT_T *const video = &fmt->es->video;
++ uint8_t * const dest = buf_data;
++ size_t length = 0;
++
++ //**** Worry about x/y_offsets
++
++ assert(fmt->encoding == MMAL_ENCODING_I420);
++
++ switch (pic->format.i_chroma) {
++ case VLC_CODEC_I420:
++ {
++ const size_t y_size = video->width * video->height;
++ mem_copy_2d(dest, video->width,
++ pic->p[0].p_pixels, pic->p[0].i_pitch,
++ video->crop.height,
++ video->crop.width);
++ // U plane follows the Y plane at half width / half height
++ mem_copy_2d(dest + y_size, video->width / 2,
++ pic->p[1].p_pixels, pic->p[1].i_pitch,
++ video->crop.height / 2,
++ video->crop.width / 2);
++ // V plane follows the U plane (a quarter of y_size further on)
++ mem_copy_2d(dest + y_size + y_size / 4, video->width / 2,
++ pic->p[2].p_pixels, pic->p[2].i_pitch,
++ video->crop.height / 2,
++ video->crop.width / 2);
++
++ // Total payload: Y plus two quarter-size chroma planes
++ length = y_size + y_size / 2;
++ break;
++ }
++
++ case VLC_CODEC_I420_10L:
++ {
++ const size_t y_size = video->width * video->height;
++ mem_copy_2d_10_to_8(dest, video->width,
++ pic->p[0].p_pixels, pic->p[0].i_pitch,
++ video->crop.height,
++ video->crop.width);
++ // Same layout as the 8-bit case; each 16-bit sample is narrowed on the way
++ mem_copy_2d_10_to_8(dest + y_size, video->width / 2,
++ pic->p[1].p_pixels, pic->p[1].i_pitch,
++ video->crop.height / 2,
++ video->crop.width / 2);
++
++ mem_copy_2d_10_to_8(dest + y_size + y_size / 4, video->width / 2,
++ pic->p[2].p_pixels, pic->p[2].i_pitch,
++ video->crop.height / 2,
++ video->crop.width / 2);
++
++ // Total payload: Y plus two quarter-size chroma planes
++ length = y_size + y_size / 2;
++ break;
++ }
++
++ default: // Any other chroma is rejected with a zero length
++ if (pLength != NULL)
++ *pLength = 0;
++ return VLC_EBADVAR;
++ }
++ // And make sure it is actually in memory
++ if (cma_vcsm_type() == VCSM_INIT_LEGACY) { // ** CMA is currently always uncached
++ flush_range(dest, length);
++ }
++
++ if (pLength != NULL)
++ *pLength = (uint32_t)length;
++
++ return VLC_SUCCESS;
++}
++
++
++static MMAL_BOOL_T rep_buf_free_cb(MMAL_BUFFER_HEADER_T *header, void *userdata)
++{
++    // userdata carries the cma_buf ref taken when this callback was installed
++    VLC_UNUSED(header);
++    cma_buf_unref((cma_buf_t *)userdata);
++
++    return MMAL_FALSE;
++}
++
++static int cma_buf_buf_attach(MMAL_BUFFER_HEADER_T * const buf, cma_buf_t * const cb)
++{
++    // The buffer's VC handle becomes the header's data pointer; 0 means unusable
++    const uintptr_t handle = cma_buf_vc_handle(cb);
++
++    if (handle != 0) {
++        mmal_buffer_header_reset(buf);
++        buf->data = (uint8_t *)handle;
++        buf->length = buf->alloc_size = cma_buf_size(cb); // Whole buffer counts as payload
++        // The extra ref keeps cb valid until the header's pre-release callback runs
++        mmal_buffer_header_pre_release_cb_set(buf, rep_buf_free_cb, cma_buf_ref(cb));
++        return VLC_SUCCESS;
++    }
++    return VLC_EGENERIC;
++}
++
++MMAL_BUFFER_HEADER_T * hw_mmal_pic_buf_copied(const picture_t *const pic,
++ MMAL_POOL_T * const rep_pool,
++ MMAL_PORT_T * const port,
++ cma_buf_pool_t * const cbp)
++{ // Returns a header from rep_pool backed by a new CMA buffer holding a copy of pic, or NULL
++ MMAL_BUFFER_HEADER_T *const buf = mmal_queue_wait(rep_pool->queue);
++ if (buf == NULL)
++ goto fail0;
++
++ cma_buf_t * const cb = cma_buf_pool_alloc_buf(cbp, port->buffer_size);
++ if (cb == NULL)
++ goto fail1;
++
++ if (cma_buf_buf_attach(buf, cb) != VLC_SUCCESS)
++ goto fail2;
++
++ pic_to_buf_copy_props(buf, pic);
++
++ if (hw_mmal_copy_pic_to_buf(cma_buf_addr(cb), &buf->length, port->format, pic) != VLC_SUCCESS)
++ goto fail2;
++ buf->flags = MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++ // Drop the local ref; the header keeps its own (taken in cma_buf_buf_attach)
++ cma_buf_unref(cb);
++ return buf;
++
++fail2:
++ cma_buf_unref(cb);
++fail1:
++ mmal_buffer_header_release(buf);
++fail0:
++ return NULL;
++}
++
++MMAL_BUFFER_HEADER_T * hw_mmal_pic_buf_replicated(const picture_t *const pic, MMAL_POOL_T * const rep_pool)
++{ // Returns a header from rep_pool that refers to pic's existing buffer (no pixel copy), or NULL
++ pic_ctx_mmal_t *const ctx = (pic_ctx_mmal_t *)pic->context;
++ MMAL_BUFFER_HEADER_T *const rep_buf = mmal_queue_wait(rep_pool->queue);
++
++ if (rep_buf == NULL)
++ return NULL;
++ // NOTE(review): ctx is dereferenced without a NULL check - assumes pic->context is always set; confirm against callers
++ if (ctx->bufs[0] != NULL)
++ {
++ // Existing buffer - replicate it
++ if (mmal_buffer_header_replicate(rep_buf, ctx->bufs[0]) != MMAL_SUCCESS)
++ goto fail;
++ }
++ else if (ctx->cb != NULL)
++ {
++ // Just a CMA buffer - fill in new buffer
++ if (cma_buf_buf_attach(rep_buf, ctx->cb) != 0)
++ goto fail;
++ }
++ else // Neither an MMAL header nor a CMA buffer to refer to
++ goto fail;
++
++ pic_to_buf_copy_props(rep_buf, pic);
++ return rep_buf;
++
++fail:
++ mmal_buffer_header_release(rep_buf);
++ return NULL;
++}
++
++
++
++
++int hw_mmal_get_gpu_mem(void) {
++    static int stashed_val = -2; // -2: not yet queried, -1: query failed, else bytes
++    VCHI_INSTANCE_T vchi_instance;
++    VCHI_CONNECTION_T *vchi_connection = NULL;
++    char rbuf[1024] = { 0 };
++    // The firmware is asked once only; later calls return the stashed answer
++    if (stashed_val >= -1)
++        return stashed_val;
++
++    if (vchi_initialise(&vchi_instance) != 0)
++        goto fail0;
++
++    //create a vchi connection
++    if (vchi_connect(NULL, 0, vchi_instance) != 0)
++        goto fail0;
++
++    vc_vchi_gencmd_init(vchi_instance, &vchi_connection, 1);
++
++    //send the gencmd for the argument
++    if (vc_gencmd_send("get_mem gpu") != 0)
++        goto fail;
++
++    if (vc_gencmd_read_response(rbuf, sizeof(rbuf) - 1) != 0)
++        goto fail;
++    // Expected reply has the form "gpu=<n>M"
++    if (strncmp(rbuf, "gpu=", 4) != 0)
++        goto fail;
++
++    char *p;
++    unsigned long m = strtoul(rbuf + 4, &p, 10);
++    // 2048M or more cannot be expressed in bytes in an int - treat as failure
++    if (p[0] != 'M' || p[1] != '\0' || m > 2047)
++        stashed_val = -1;
++    else
++        stashed_val = (int)(m << 20);
++
++    vc_gencmd_stop();
++
++    //close the vchi connection
++    vchi_disconnect(vchi_instance);
++
++    return stashed_val;
++
++fail:
++    vc_gencmd_stop();
++    vchi_disconnect(vchi_instance);
++fail0:
++    stashed_val = -1;
++    return -1;
++}
++
++// ===========================================================================
++
++typedef struct pool_ent_s
++{
++ struct pool_ent_s * next;
++ struct pool_ent_s * prev;
++
++ atomic_int ref_count;
++ unsigned int seq;
++
++ size_t size;
++
++ int vcsm_hdl;
++ int vc_hdl;
++ void * buf;
++
++ unsigned int width;
++ unsigned int height;
++ MMAL_FOURCC_T enc_type;
++
++ picture_t * pic;
++} pool_ent_t;
++
++
++typedef struct ent_list_hdr_s
++{
++ pool_ent_t * ents;
++ pool_ent_t * tail;
++ unsigned int n;
++} ent_list_hdr_t;
++
++#define ENT_LIST_HDR_INIT (ent_list_hdr_t){ \
++ .ents = NULL, \
++ .tail = NULL, \
++ .n = 0 \
++}
++
++struct vzc_pool_ctl_s
++{
++ atomic_int ref_count;
++
++ ent_list_hdr_t ent_pool;
++ ent_list_hdr_t ents_cur;
++ ent_list_hdr_t ents_prev;
++
++ unsigned int max_n;
++ unsigned int seq;
++
++ vlc_mutex_t lock;
++
++ MMAL_POOL_T * buf_pool;
++
++ vcsm_init_type_t vcsm_init_type;
++};
++
++typedef struct vzc_subbuf_ent_s
++{
++ pool_ent_t * ent;
++ MMAL_RECT_T pic_rect;
++ MMAL_RECT_T orig_dest_rect;
++ MMAL_DISPLAYREGION_T dreg;
++} vzc_subbuf_ent_t;
++
++
++static pool_ent_t * ent_extract(ent_list_hdr_t * const elh, pool_ent_t * const ent)
++{
++    pool_ent_t * after;
++    pool_ent_t * before;
++
++    // Extracting "nothing" is allowed and yields nothing
++    if (ent == NULL)
++        return NULL;
++
++    after = ent->next;
++    before = ent->prev;
++
++    // Whichever neighbour is missing, the list header stands in for it
++    if (after != NULL) after->prev = before;
++    else               elh->tail = before;
++
++    if (before != NULL) before->next = after;
++    else                elh->ents = after;
++
++    ent->next = ent->prev = NULL; // ent is now fully unlinked
++    elh->n -= 1;
++    return ent; // Returned for the caller's convenience
++}
++
++static inline pool_ent_t * ent_extract_tail(ent_list_hdr_t * const elh)
++{ // Unlinks and returns the tail entry; NULL when the list is empty (ent_extract passes NULL through)
++ return ent_extract(elh, elh->tail);
++}
++
++static void ent_add_head(ent_list_hdr_t * const elh, pool_ent_t * const ent)
++{
++    pool_ent_t * const old_head = elh->ents;
++    // ent goes in front of the old head; in an empty list it is also the tail
++    ent->prev = NULL;
++    ent->next = old_head;
++    if (old_head != NULL)
++        old_head->prev = ent;
++    else
++        elh->tail = ent;
++    elh->ents = ent;
++    elh->n += 1;
++}
++
++static void ent_free(pool_ent_t * const ent)
++{
++    // Freeing NULL is a no-op
++    if (ent == NULL)
++        return;
++
++    // If we still hold a ref to a pic, let go of it now
++    if (ent->pic != NULL)
++        picture_Release(ent->pic);
++
++    // Unlock and free the vcsm allocation, then the entry itself
++    vcsm_unlock_hdl(ent->vcsm_hdl);
++    vcsm_free(ent->vcsm_hdl);
++
++    free(ent);
++}
++
++static void ent_free_list(ent_list_hdr_t * const elh)
++{
++    pool_ent_t * cur = elh->ents;
++    pool_ent_t * following;
++
++    // Detach the whole chain from the header before freeing anything
++    *elh = ENT_LIST_HDR_INIT;
++
++    // Pick up each successor before its predecessor is freed
++    for (; cur != NULL; cur = following) {
++        following = cur->next;
++        ent_free(cur);
++    }
++}
++
++static void ent_list_move(ent_list_hdr_t * const dst, ent_list_hdr_t * const src)
++{
++// Hands src's whole chain to dst by copying the header; dst's previous contents are overwritten, not freed
++
++ *dst = *src;
++ *src = ENT_LIST_HDR_INIT; // src is left as an empty list
++}
++
++// Scans "backwards" as that should give us the fastest match if we are
++// presented with pics in the same order each time
++static pool_ent_t * ent_list_extract_pic_ent(ent_list_hdr_t * const elh, picture_t * const pic)
++{
++    pool_ent_t * cand;
++
++    // Walk from the tail towards the head looking for the entry holding pic
++    for (cand = elh->tail; cand != NULL; cand = cand->prev) {
++        if (cand->pic != pic)
++            continue;
++
++        // Found - unlink it from this list and hand it to the caller
++        return ent_extract(elh, cand);
++    }
++
++    return NULL; // pic has no entry in this list
++}
++
++#define POOL_ENT_ALLOC_BLOCK 0x10000
++
++static pool_ent_t * pool_ent_alloc_new(size_t req_size)
++{ // Allocates an entry backed by a locked vcsm buffer of at least req_size bytes; NULL on failure
++ pool_ent_t * ent = calloc(1, sizeof(*ent));
++ const size_t alloc_size = (req_size + POOL_ENT_ALLOC_BLOCK - 1) & ~(POOL_ENT_ALLOC_BLOCK - 1); // Round up to a whole block
++
++ if (ent == NULL)
++ return NULL;
++
++ ent->next = ent->prev = NULL;
++
++ // Alloc from vcsm
++ if ((ent->vcsm_hdl = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST, (char *)"vlc-subpic")) == -1)
++ goto fail1;
++ if ((ent->vc_hdl = vcsm_vc_hdl_from_hdl(ent->vcsm_hdl)) == 0)
++ goto fail2;
++ if ((ent->buf = vcsm_lock(ent->vcsm_hdl)) == NULL)
++ goto fail2;
++
++ ent->size = alloc_size; // Records the rounded-up size, not req_size
++ return ent;
++
++fail2:
++ vcsm_free(ent->vcsm_hdl);
++fail1:
++ free(ent);
++ return NULL;
++}
++
++static inline pool_ent_t * pool_ent_ref(pool_ent_t * const ent)
++{
++// Adds one reference to ent and returns ent so the call can be used inline.
++// ent must not be NULL - it is dereferenced unconditionally.
++ atomic_fetch_add(&ent->ref_count, 1);
++ return ent;
++}
++
++static void pool_recycle(vzc_pool_ctl_t * const pc, pool_ent_t * const ent)
++{ // Drops one ref on ent; when that was the last ref ent is put back at the head of pc's pool
++ pool_ent_t * xs = NULL;
++ int n;
++
++ if (ent == NULL)
++ return;
++
++ n = atomic_fetch_sub(&ent->ref_count, 1) - 1; // n = count after our decrement
++
++// printf("%s: Pool: %p: Ent: %p: %d\n", __func__, &pc->ent_pool, ent, n);
++
++ if (n != 0)
++ return;
++ // Last ref gone: the entry no longer needs to hold on to its picture
++ if (ent->pic != NULL) {
++ picture_Release(ent->pic);
++ ent->pic = NULL;
++ }
++
++ vlc_mutex_lock(&pc->lock);
++
++ // If we have a full pool then extract the LRU and free it
++ // Free done outside mutex
++ if (pc->ent_pool.n >= pc->max_n)
++ xs = ent_extract_tail(&pc->ent_pool);
++
++ ent_add_head(&pc->ent_pool, ent);
++
++ vlc_mutex_unlock(&pc->lock);
++
++ ent_free(xs); // No-op when nothing was evicted (xs == NULL)
++}
++
++// * This could be made more efficient, but this is easy
++static void pool_recycle_list(vzc_pool_ctl_t * const pc, ent_list_hdr_t * const elh)
++{
++    pool_ent_t * e;
++    // Keep peeling entries off the tail until the list reports empty
++    for (e = ent_extract_tail(elh); e != NULL; e = ent_extract_tail(elh))
++        pool_recycle(pc, e);
++}
++
++static pool_ent_t * pool_best_fit(vzc_pool_ctl_t * const pc, size_t req_size)
++{
++    pool_ent_t * best = NULL;
++    // Returns an entry of at least req_size bytes with ref_count 1, or NULL if allocation fails
++    vlc_mutex_lock(&pc->lock);
++
++    {
++        pool_ent_t * ent = pc->ent_pool.ents;
++
++        // Simple scan: smallest pooled entry that fits without being wastefully large
++        while (ent != NULL) {
++            if (ent->size >= req_size && ent->size <= req_size * 2 + POOL_ENT_ALLOC_BLOCK &&
++                (best == NULL || best->size > ent->size))
++                best = ent;
++            ent = ent->next;
++        }
++
++        // extract best from chain if we've found it (NULL is passed through harmlessly)
++        ent_extract(&pc->ent_pool, best);
++    }
++
++    vlc_mutex_unlock(&pc->lock);
++    // Nothing suitable pooled: allocate afresh, and give up cleanly if that fails too
++    if (best == NULL && (best = pool_ent_alloc_new(req_size)) == NULL)
++        return NULL;
++
++    if ((best->seq = ++pc->seq) == 0)
++        best->seq = ++pc->seq; // Never allow to be zero
++
++    atomic_store(&best->ref_count, 1);
++    return best;
++}
++
++
++const vlc_fourcc_t hw_mmal_vzc_subpicture_chromas[] = { VLC_CODEC_RGBA, VLC_CODEC_BGRA, VLC_CODEC_ARGB, 0 };
++
++void hw_mmal_vzc_buf_get_wh(MMAL_BUFFER_HEADER_T * const buf, int * const pW, int * const pH)
++{
++    const vzc_subbuf_ent_t * const sb = buf->user_data; // Sub-buffer record hangs off user_data
++    *pW = sb->ent->width;
++    *pH = sb->ent->height;
++}
++
++bool hw_mmal_vzc_buf_set_format(MMAL_BUFFER_HEADER_T * const buf, MMAL_ES_FORMAT_T * const es_fmt)
++{
++    const vzc_subbuf_ent_t * const sb = buf->user_data;
++    const pool_ent_t * const src = sb->ent;
++    MMAL_VIDEO_FORMAT_T * const video = &es_fmt->es->video;
++
++    // Video ES using the entry's own encoding, no variant
++    es_fmt->type = MMAL_ES_TYPE_VIDEO;
++    es_fmt->encoding = src->enc_type;
++    es_fmt->encoding_variant = 0;
++
++    // The crop rectangle covers the full frame, anchored at the origin
++    video->width = video->crop.width = src->width;
++    video->height = video->crop.height = src->height;
++    video->crop.x = 0;
++    video->crop.y = 0;
++    return true; // Always reports success
++}
++
++void hw_mmal_vzc_buf_frame_size(MMAL_BUFFER_HEADER_T * const buf,
++    uint32_t * const pWidth, uint32_t * const pHeight)
++{
++    const vzc_subbuf_ent_t * const sb = buf->user_data; // Sub-buffer record hangs off user_data
++    *pWidth = sb->ent->width;
++    *pHeight = sb->ent->height;
++}
++
++
++MMAL_DISPLAYREGION_T * hw_mmal_vzc_buf_region(MMAL_BUFFER_HEADER_T * const buf)
++{
++    // The display region lives inside the sub-buffer record kept in user_data
++    return &((vzc_subbuf_ent_t *)buf->user_data)->dreg;
++}
++
++static inline int rescale_x(int x, int mul, int div)
++{   // x * mul / div with div/2 added before dividing; div == 0 skips the divide entirely
++    return div != 0 ? (x * mul + div/2) / div : x * mul;
++}
++
++static void rescale_rect(MMAL_RECT_T * const d, const MMAL_RECT_T * const s, const MMAL_RECT_T * mul_rect, const MMAL_RECT_T * div_rect)
++{ // Maps s from div_rect's coordinate space into mul_rect's: offset by div origin, scale by mul/div, add mul origin
++ d->x = rescale_x(s->x - div_rect->x, mul_rect->width, div_rect->width) + mul_rect->x;
++ d->y = rescale_x(s->y - div_rect->y, mul_rect->height, div_rect->height) + mul_rect->y;
++ d->width = rescale_x(s->width, mul_rect->width, div_rect->width); // Sizes are scaled only, never offset
++ d->height = rescale_x(s->height, mul_rect->height, div_rect->height);
++#if TRACE_TRANSFORMS
++ fprintf(stderr, "(%d,%d %dx%d) * (%d,%d %dx%d) / (%d,%d %dx%d) -> (%d,%d %dx%d)\n",
++ s->x, s->y, s->width, s->height,
++ mul_rect->x, mul_rect->y, mul_rect->width, mul_rect->height,
++ div_rect->x, div_rect->y, div_rect->width, div_rect->height,
++ d->x, d->y, d->width, d->height);
++#endif
++}
++
++// Apply to s, within container c, whichever of hflip, vflip and transpose
++// are part of transform t - in that order - and return the result.
++static MMAL_RECT_T
++rect_untransform(MMAL_RECT_T s, const MMAL_RECT_T c, const MMAL_DISPLAYTRANSFORM_T t)
++{
++#if TRACE_TRANSFORMS
++    fprintf(stderr, "t=%d, s=%d,%d:%dx%d, c=%d,%d:%dx%d -> ", (int)t,
++            s.x,s.y,s.width,s.height,
++            c.x,c.y,c.width,c.height);
++#endif
++    if (is_transform_hflip(t)) {
++        s = rect_hflip(s, c);
++    }
++    if (is_transform_vflip(t)) {
++        s = rect_vflip(s, c);
++    }
++    if (is_transform_transpose(t)) {
++        s = rect_transpose(s);
++    }
++#if TRACE_TRANSFORMS
++    fprintf(stderr, "s=%d,%d:%dx%d\n",
++            s.x,s.y,s.width,s.height);
++#endif
++    return s;
++}
++
++// Recompute the destination rect & transform of a subpicture buffer.
++// scale_rect == NULL restores the original (unscaled) destination with no
++// rotation; otherwise the original destination is rescaled from the
++// buffer's pic_rect number space into scale_rect and scale_transform is
++// applied.
++void hw_mmal_vzc_buf_scale_dest_rect(MMAL_BUFFER_HEADER_T * const buf, const MMAL_RECT_T * const scale_rect, const MMAL_DISPLAYTRANSFORM_T scale_transform)
++{
++    vzc_subbuf_ent_t * const sb = buf->user_data;
++
++    if (scale_rect == NULL) {
++        sb->dreg.transform = MMAL_DISPLAY_ROT0;
++        sb->dreg.dest_rect = sb->orig_dest_rect;
++        return;
++    }
++
++    // The scale rect has been transposed if we have a transposing
++    // transform (bit 2 set) - untranspose so we are the same way up as
++    // the source
++    MMAL_RECT_T c = *scale_rect;
++    if ((scale_transform & 4) != 0)
++        c = rect_transpose(c);
++
++    rescale_rect(&sb->dreg.dest_rect, &sb->orig_dest_rect, &c, &sb->pic_rect);
++    sb->dreg.dest_rect = rect_untransform(sb->dreg.dest_rect, c, scale_transform);
++    sb->dreg.transform = scale_transform;
++}
++
++// Returns the sequence number of the pool entry behind buf
++unsigned int hw_mmal_vzc_buf_seq(MMAL_BUFFER_HEADER_T * const buf)
++{
++    const vzc_subbuf_ent_t * const sb = buf->user_data;
++    const pool_ent_t * const pe = sb->ent;
++
++    return pe->seq;
++}
++
++
++// The intent with the ents_cur & ents_prev lists is to remember the buffers
++// we used on the last frame and reuse them on the current one if they are the
++// same. Unfortunately detection of "is_first" is only a heuristic (there are
++// no rules governing the order in which things are blended) so we must deal
++// (fairly) gracefully with it never (or always) being set.
++//
++// Wraps a subpicture in an MMAL buffer header taken from pc's header pool.
++// If a pool entry already holding pic is found on the previous or current
++// frame list it is reused as is, otherwise a new entry is obtained and the
++// visible area of pic is copied into it.
++//
++// pc             pool control supplying header & entry
++// pic            subpicture to wrap
++// dst_pic_rect   number space in which the destination pixels are
++//                specified; stored and used as the divisor when the
++//                destination rect is rescaled later
++// x_offset,
++// y_offset       destination position of the subpicture
++// alpha          blend alpha; only the low 8 bits are used
++// is_first       hint that this is the first subpicture of a frame
++//
++// Returns the buffer header, or NULL if no header is available, on
++// allocation failure, or if pic's format has no MMAL encoding.
++
++MMAL_BUFFER_HEADER_T * hw_mmal_vzc_buf_from_pic(vzc_pool_ctl_t * const pc,
++    picture_t * const pic,
++    const MMAL_RECT_T dst_pic_rect,
++    const int x_offset, const int y_offset,
++    const unsigned int alpha,
++    const bool is_first)
++{
++    MMAL_BUFFER_HEADER_T * const buf = mmal_queue_get(pc->buf_pool->queue);
++    vzc_subbuf_ent_t * sb;
++
++    if (buf == NULL)
++        return NULL;
++
++    if ((sb = calloc(1, sizeof(*sb))) == NULL)
++        goto fail1;
++
++    // If first or we've had a lot of stuff move everything to the last list
++    // (we could deal more gracefully with the "too many" case but it shouldn't
++    // really happen)
++    if (is_first || pc->ents_cur.n >= CTX_BUFS_MAX) {
++        pool_recycle_list(pc, &pc->ents_prev);
++        ent_list_move(&pc->ents_prev, &pc->ents_cur);
++    }
++
++    sb->dreg.hdr.id = MMAL_PARAMETER_DISPLAYREGION;
++    sb->dreg.hdr.size = sizeof(sb->dreg);
++    buf->user_data = sb;
++
++    {
++        // ?? Round start offset as well as length
++        const video_format_t *const fmt = &pic->format;
++
++        // Bytes per pixel, and the visible span widened to 16-pixel
++        // boundaries on both sides (16-line boundary for the height)
++        const unsigned int bpp = (fmt->i_bits_per_pixel + 7) >> 3;
++        const unsigned int xl = (fmt->i_x_offset & ~15);
++        const unsigned int xr = (fmt->i_x_offset + fmt->i_visible_width + 15) & ~15;
++        const size_t dst_stride = (xr - xl) * bpp;
++        const size_t dst_lines = ((fmt->i_visible_height + 15) & ~15);
++        const size_t dst_size = dst_stride * dst_lines;
++
++        pool_ent_t * ent = ent_list_extract_pic_ent(&pc->ents_prev, pic);
++        bool needs_copy = false;
++
++        // If we didn't find ent in last then look in cur in case is_first
++        // isn't working
++        if (ent == NULL)
++            ent = ent_list_extract_pic_ent(&pc->ents_cur, pic);
++
+
+-int mmal_picture_lock(picture_t *picture)
++        if (ent == NULL)
++        {
++            // Need a new ent. Resolve the encoding before taking an entry
++            // so that an unsupported format cannot strand a freshly
++            // acquired entry on the failure path.
++            const MMAL_FOURCC_T enc_type = vlc_to_mmal_video_fourcc(&pic->format);
++
++            needs_copy = true;
++
++            if (enc_type == 0)
++                goto fail2;
++            if ((ent = pool_best_fit(pc, dst_size)) == NULL)
++                goto fail2;
++
++            ent->enc_type = enc_type;
++            ent->pic = picture_Hold(pic);
++        }
++
++        ent_add_head(&pc->ents_cur, ent);
++
++        // The buffer holds a ref on both the entry and the pool control;
++        // both are dropped in the pool release callback
++        sb->ent = pool_ent_ref(ent);
++        hw_mmal_vzc_pool_ref(pc);
++
++        // Fill in the header
++        buf->next = NULL;
++        buf->cmd = 0;
++        buf->data = (uint8_t *)(ent->vc_hdl);
++        buf->alloc_size = buf->length = dst_size;
++        buf->offset = 0;
++        buf->flags = MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++        buf->pts = buf->dts = pic->date != VLC_TICK_INVALID ? pic->date : MMAL_TIME_UNKNOWN;
++        buf->type->video = (MMAL_BUFFER_HEADER_VIDEO_SPECIFIC_T){
++            .planes = 1,
++            .pitch = { dst_stride }
++        };
++
++        // Remember offsets
++        sb->dreg.set = MMAL_DISPLAY_SET_SRC_RECT |
++            MMAL_DISPLAY_SET_DEST_RECT |
++            MMAL_DISPLAY_SET_FULLSCREEN |
++            MMAL_DISPLAY_SET_TRANSFORM |
++            MMAL_DISPLAY_SET_ALPHA;
++
++        sb->dreg.fullscreen = 0;
++
++        // Will be set later - zero now to avoid any confusion
++        sb->dreg.transform = MMAL_DISPLAY_ROT0;
++        sb->dreg.dest_rect = (MMAL_RECT_T){0, 0, 0, 0};
++
++        sb->dreg.alpha = (uint32_t)(alpha & 0xff) | MMAL_DISPLAY_ALPHA_FLAGS_MIX;
++
++        // The copy starts at xl, so the visible area begins this far in
++        sb->dreg.src_rect = (MMAL_RECT_T){
++            .x = (fmt->i_x_offset - xl),
++            .y = 0,
++            .width = fmt->i_visible_width,
++            .height = fmt->i_visible_height
++        };
++
++        sb->pic_rect = dst_pic_rect;
++
++        sb->orig_dest_rect = (MMAL_RECT_T){
++            .x = x_offset,
++            .y = y_offset,
++            .width = fmt->i_visible_width,
++            .height = fmt->i_visible_height
++        };
++
++        if (needs_copy)
++        {
++            ent->width = dst_stride / bpp;
++            ent->height = dst_lines;
++
++            // 2D copy
++            {
++                uint8_t *d = ent->buf;
++                const uint8_t *s = pic->p[0].p_pixels + xl * bpp + fmt->i_y_offset * pic->p[0].i_pitch;
++
++                mem_copy_2d(d, dst_stride, s, pic->p[0].i_pitch, fmt->i_visible_height, dst_stride);
++
++                // And make sure it is actually in memory
++                if (pc->vcsm_init_type != VCSM_INIT_CMA) { // ** CMA is currently always uncached
++                    flush_range(ent->buf, dst_stride * fmt->i_visible_height);
++                }
++            }
++        }
++    }
++
++    return buf;
++
++fail2:
++    // buf->user_data still points at sb. Clear it before releasing the
++    // header, otherwise the pool release callback (vcz_pool_release_cb)
++    // would read the freed sb, drop a pool ref that was never taken and
++    // free sb a second time.
++    buf->user_data = NULL;
++    free(sb);
++fail1:
++    mmal_buffer_header_release(buf);
++    return NULL;
++}
++
++// Recycle every entry on both the previous-frame and current-frame lists
++void hw_mmal_vzc_pool_flush(vzc_pool_ctl_t * const pc)
++{
++    pool_recycle_list(pc, &pc->ents_prev);
++    pool_recycle_list(pc, &pc->ents_cur);
++}
++
++// Tear down a pool control and free it. Called when the last reference is
++// dropped, and from hw_mmal_vzc_pool_new() when construction fails part way.
++static void hw_mmal_vzc_pool_delete(vzc_pool_ctl_t * const pc)
++{
++    // Flush both frame lists, then free the entry pool itself
++    hw_mmal_vzc_pool_flush(pc);
++    ent_free_list(&pc->ent_pool);
++
++    // buf_pool is still NULL if its creation is what failed
++    if (pc->buf_pool != NULL) {
++        mmal_pool_destroy(pc->buf_pool);
++    }
++
++    vlc_mutex_destroy(&pc->lock);
++    cma_vcsm_exit(pc->vcsm_init_type);
++
++    free(pc);
++}
++
++// Drop one reference on pc (NULL is ignored); the pool control is deleted
++// when the count reaches zero.
++void hw_mmal_vzc_pool_release(vzc_pool_ctl_t * const pc)
++{
++    if (pc == NULL)
++        return;
++
++    // atomic_fetch_sub yields the value before the decrement, so 1 means
++    // that we have just dropped the last reference
++    if (atomic_fetch_sub(&pc->ref_count, 1) == 1)
++        hw_mmal_vzc_pool_delete(pc);
++}
++
++// Take an additional reference on pc; pair with hw_mmal_vzc_pool_release()
++void hw_mmal_vzc_pool_ref(vzc_pool_ctl_t * const pc)
++{
++    (void)atomic_fetch_add(&pc->ref_count, 1);
++}
++
++// Release callback installed on the header pool by hw_mmal_vzc_pool_new().
++// If the header carries a vzc_subbuf_ent_t, detach it, recycle its pool
++// entry, drop the pool ref taken for the buffer and free it.
++// Always returns MMAL_TRUE.
++static MMAL_BOOL_T vcz_pool_release_cb(MMAL_POOL_T * buf_pool, MMAL_BUFFER_HEADER_T *buf, void *userdata)
++{
++    vzc_pool_ctl_t * const pc = userdata;
++    vzc_subbuf_ent_t * const sb = buf->user_data;
++
++    VLC_UNUSED(buf_pool);
++
++    if (sb == NULL)
++        return MMAL_TRUE;
++
++    // Detach first so the header never points at freed memory
++    buf->user_data = NULL;
++
++    pool_recycle(pc, sb->ent);
++    hw_mmal_vzc_pool_release(pc);
++    free(sb);
++
++    return MMAL_TRUE;
++}
++
++// Create a pool control with a single reference held by the caller.
++// Initialises vcsm, then creates a pool of 64 payload-less buffer headers
++// whose release callback is vcz_pool_release_cb().
++// Returns NULL on allocation failure, if vcsm cannot be initialised or if
++// the header pool cannot be created.
++// Release with hw_mmal_vzc_pool_release().
++vzc_pool_ctl_t * hw_mmal_vzc_pool_new(void)
++{
++    vzc_pool_ctl_t * const pc = calloc(1, sizeof(*pc));
++
++    if (pc == NULL)
++        return NULL;
++
++    if ((pc->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE)
++    {
++        free(pc);
++        return NULL;
++    }
++
++    pc->max_n = 8;
++    vlc_mutex_init(&pc->lock);  // Must init before potential destruction
++
++    if ((pc->buf_pool = mmal_pool_create(64, 0)) == NULL)
++    {
++        // Full teardown: also undoes the vcsm init & destroys the mutex
++        hw_mmal_vzc_pool_delete(pc);
++        return NULL;
++    }
++
++    atomic_store(&pc->ref_count, 1);
++
++    mmal_pool_callback_set(pc->buf_pool, vcz_pool_release_cb, pc);
++
++    return pc;
++}
++
++//----------------------------------------------------------------------------
++
++
++// Per-plane right-shift tables (indexed by plane number) applied by
++// cma_pic_set_data() to plane widths & heights:
++// shift_00 leaves every plane unshifted, shift_01 halves every plane
++// except plane 0
++static const uint8_t shift_00[] = {0,0,0,0};
++static const uint8_t shift_01[] = {0,1,1,1};
++
++// Point the plane descriptors of pic at the memory of its attached cma buf.
++//
++// pic        picture to set up; its cma buf is found via cma_buf_pic_get()
++// mm_esfmt   MMAL format giving encoding, frame size, crop and PAR
++// buf        optional; if non-NULL its video-specific offset[] & pitch[]
++//            are used for each plane, otherwise planes are laid out
++//            contiguously with a pitch of width * bytes-per-pixel
++//
++// Returns VLC_SUCCESS, VLC_ENOMEM if cma_buf_addr() yields NULL, or
++// VLC_EGENERIC for an encoding that is not handled below.
++int cma_pic_set_data(picture_t * const pic,
++ const MMAL_ES_FORMAT_T * const mm_esfmt,
++ const MMAL_BUFFER_HEADER_T * const buf)
+ {
+- picture_sys_t *pic_sys = picture->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer = pic_sys->buffer;
++ const MMAL_VIDEO_FORMAT_T * const mm_fmt = &mm_esfmt->es->video;
++ const MMAL_BUFFER_HEADER_VIDEO_SPECIFIC_T *const buf_vid = (buf == NULL) ? NULL : &buf->type->video;
++ cma_buf_t *const cb = cma_buf_pic_get(pic);
++ unsigned int planes = 1;
++
++ // NOTE(review): cb may be NULL here (see cma_buf_pic_get) - presumably
++ // cma_buf_addr() copes with that; confirm
++ uint8_t * const data = cma_buf_addr(cb);
++ if (data == NULL) {
++ return VLC_ENOMEM;
++ }
++
++ // ws / hs: per-plane right shifts for visible width / height
++ // pb: bytes per pixel (becomes i_pixel_pitch of every plane)
++ const uint8_t * ws = shift_00;
++ const uint8_t * hs = shift_00;
++ int pb = 1;
++
++ switch (mm_esfmt->encoding)
++ {
++ case MMAL_ENCODING_ARGB:
++ case MMAL_ENCODING_ABGR:
++ case MMAL_ENCODING_RGBA:
++ case MMAL_ENCODING_BGRA:
++ case MMAL_ENCODING_RGB32:
++ case MMAL_ENCODING_BGR32:
++ pb = 4;
++ break;
++ case MMAL_ENCODING_RGB16:
++ pb = 2;
++ break;
+
+- int offset = 0;
+- picture->p[0].p_pixels = buffer->data;
+- for (int i = 1; i < picture->i_planes; i++) {
+- offset = offset + picture->p[i - 1].i_pitch * picture->p[i - 1].i_lines;
+- picture->p[i].p_pixels = (ptrdiff_t)buffer->data + offset;
++ case MMAL_ENCODING_I420:
++ ws = shift_01;
++ hs = shift_01;
++ planes = 3;
++ break;
++
++ case MMAL_ENCODING_YUVUV128:
++ hs = shift_01;
++ planes = 2;
++ break;
++
++ default:
++// msg_Err(p_filter, "%s: Unexpected format", __func__);
++ return VLC_EGENERIC;
+ }
+
+- pic_sys->displayed = false;
++ // Fix up SAR if unset
++ if (pic->format.i_sar_den == 0 || pic->format.i_sar_num == 0) {
++ pic->format.i_sar_den = mm_fmt->par.den;
++ pic->format.i_sar_num = mm_fmt->par.num;
++ }
+
++ pic->i_planes = planes;
++ // Running byte offset of the next plane; only used when buf is NULL
++ // NOTE(review): without buf every plane gets pitch width * pb - ws[] is
++ // not applied to the pitch; confirm that is intended for I420
++ unsigned int offset = 0;
++ for (unsigned int i = 0; i != planes; ++i) {
++ pic->p[i] = (plane_t){
++ .p_pixels = data + (buf_vid != NULL ? buf_vid->offset[i] : offset),
++ .i_lines = mm_fmt->height >> hs[i],
++ .i_pitch = buf_vid != NULL ? buf_vid->pitch[i] : mm_fmt->width * pb,
++ .i_pixel_pitch = pb,
++ .i_visible_lines = mm_fmt->crop.height >> hs[i],
++ .i_visible_pitch = mm_fmt->crop.width >> ws[i]
++ };
++ offset += pic->p[i].i_pitch * pic->p[i].i_lines;
++ }
+ return VLC_SUCCESS;
+ }
++
++// Attach cma buf cb to pic by giving pic a freshly allocated MMAL picture
++// context, and mark cb as in flight.
++// Returns VLC_SUCCESS, VLC_EGENERIC if pic's chroma is not a cma buf
++// chroma, VLC_EBADVAR if pic already has a context, VLC_ENOMEM on
++// allocation failure.
++int cma_buf_pic_attach(cma_buf_t * const cb, picture_t * const pic)
++{
++    pic_ctx_mmal_t * ctx;
++
++    if (!is_cma_buf_pic_chroma(pic->format.i_chroma))
++        return VLC_EGENERIC;
++    if (pic->context != NULL)
++        return VLC_EBADVAR;
++
++    if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
++        return VLC_ENOMEM;
++
++    ctx->cb = cb;
++    ctx->buf_count = 1;  // cb takes the place of the 1st buf
++    ctx->cmn.destroy = hw_mmal_pic_ctx_destroy;
++    ctx->cmn.copy = hw_mmal_pic_ctx_copy;
++
++    cma_buf_in_flight(cb);
++
++    pic->context = &ctx->cmn;
++    return VLC_SUCCESS;
++}
++
++// Returns the cma buf attached to pic, or NULL if pic has no context or
++// its chroma is not a cma buf chroma. No reference is added.
++cma_buf_t * cma_buf_pic_get(picture_t * const pic)
++{
++    pic_ctx_mmal_t * const ctx = (pic_ctx_mmal_t *)pic->context;
++
++    if (ctx == NULL || !is_cma_buf_pic_chroma(pic->format.i_chroma))
++        return NULL;
++
++    return ctx->cb;
++}
++
++
++//----------------------------------------------------------------------------
++
++/* Returns true if bcm_host reports that we are running on a Pi4
++*/
++bool rpi_is_model_pi4(void)
++{
++    return bcm_host_is_model_pi4() != 0;
++}
++
++// Result of the most recent cma_vcsm_init(). It doubles as the type that
++// the next cma_vcsm_init() tries first; while it is VCSM_INIT_NONE the
++// first choice is made from whether fkms is active.
++static volatile vcsm_init_type_t last_vcsm_type = VCSM_INIT_NONE;
++
++// Returns the type established by the last cma_vcsm_init()
++vcsm_init_type_t cma_vcsm_type(void)
++{
++    const vcsm_init_type_t current = last_vcsm_type;
++    return current;
++}
++
++// Initialise vcsm, trying the preferred backend first and falling back to
++// the other one. The preference is the result of the previous call or, if
++// there is none, CMA when fkms is active and legacy otherwise.
++// On success bcm_host_init() is called as well.
++// Returns the backend that was initialised, VCSM_INIT_NONE if both failed.
++vcsm_init_type_t cma_vcsm_init(void)
++{
++    // We don't bother locking - taking a copy here should be good enough
++    vcsm_init_type_t preferred = last_vcsm_type;
++    vcsm_init_type_t result = VCSM_INIT_NONE;
++
++    if (preferred == VCSM_INIT_NONE)
++        preferred = bcm_host_is_fkms_active() ? VCSM_INIT_CMA : VCSM_INIT_LEGACY;
++
++    {
++        const bool cma_first = (preferred == VCSM_INIT_CMA);
++        const vcsm_init_type_t first  = cma_first ? VCSM_INIT_CMA : VCSM_INIT_LEGACY;
++        const vcsm_init_type_t second = cma_first ? VCSM_INIT_LEGACY : VCSM_INIT_CMA;
++
++        // 1st arg of vcsm_init_ex: 1 requests CMA, 0 legacy
++        if (vcsm_init_ex(cma_first ? 1 : 0, -1) == 0)
++            result = first;
++        else if (vcsm_init_ex(cma_first ? 0 : 1, -1) == 0)
++            result = second;
++    }
++
++    // Just in case this affects vcsm init do after that
++    if (result != VCSM_INIT_NONE)
++        bcm_host_init();
++
++    last_vcsm_type = result;
++    return result;
++}
++
++// Undo a successful cma_vcsm_init(); does nothing for VCSM_INIT_NONE
++void cma_vcsm_exit(const vcsm_init_type_t init_mode)
++{
++    if (init_mode == VCSM_INIT_NONE)
++        return;
++
++    vcsm_exit();
++    bcm_host_deinit();  // Does nothing but add in case it ever does
++}
++
++// Printable name for init_mode; "???" for a value outside the enum
++const char * cma_vcsm_init_str(const vcsm_init_type_t init_mode)
++{
++    if (init_mode == VCSM_INIT_CMA)
++        return "CMA";
++    if (init_mode == VCSM_INIT_LEGACY)
++        return "Legacy";
++    if (init_mode == VCSM_INIT_NONE)
++        return "none";
++
++    return "???";
++}
++
++
+--- a/modules/hw/mmal/mmal_picture.h
++++ b/modules/hw/mmal/mmal_picture.h
+@@ -24,19 +24,298 @@
+ #ifndef VLC_MMAL_MMAL_PICTURE_H_
+ #define VLC_MMAL_MMAL_PICTURE_H_
+
++#include <stdatomic.h>
++
+ #include <vlc_common.h>
+ #include <interface/mmal/mmal.h>
+
++#include "mmal_cma.h"
++
+ /* Think twice before changing this. Incorrect values cause havoc. */
+ #define NUM_ACTUAL_OPAQUE_BUFFERS 30
+
+-struct picture_sys_t {
+- vlc_object_t *owner;
++// Compatibility shim: when VLC_TICK_INVALID is not provided by the VLC
++// headers, alias it to VLC_TS_INVALID and set VLC_VER_3 to 1 (presumably
++// this identifies a VLC 3.x tree - confirm); otherwise VLC_VER_3 is 0
++#ifndef VLC_TICK_INVALID
++#define VLC_TICK_INVALID VLC_TS_INVALID
++#define VLC_VER_3 1
++#else
++#define VLC_VER_3 0
++#endif
++
++// Reference-counted pairing of an MMAL port with a buffer pool.
++// refs is incremented by hw_mmal_port_pool_ref_acquire(); presumably
++// hw_mmal_port_pool_ref_release() drops it - confirm in mmal_picture.c
++typedef struct mmal_port_pool_ref_s
++{
++ atomic_uint refs;
++ MMAL_POOL_T * pool;
++ MMAL_PORT_T * port;
++} hw_mmal_port_pool_ref_t;
++
++typedef struct pic_ctx_subpic_s {
++ picture_t * subpic;
++ int x, y;
++ int alpha;
++} pic_ctx_subpic_t;
++
++
++// Number of slots in pic_ctx_mmal_t::bufs[]
++#define CTX_BUFS_MAX 4
++// Picture context hung off picture_t::context for MMAL pictures
++typedef struct pic_ctx_mmal_s {
++ picture_context_t cmn; // PARENT: Common els at start
++
++ // cma buf set by cma_buf_pic_attach(), read back by cma_buf_pic_get()
++ cma_buf_t * cb;
++
++ // Number of bufs[] slots in use. cma_buf_pic_attach() starts this at 1
++ // (the cma buf stands in for slot 0); hw_mmal_pic_sub_buf_add() appends
++ // subpicture buffers from slot 1 upwards
++ unsigned int buf_count;
++ MMAL_BUFFER_HEADER_T * bufs[CTX_BUFS_MAX];
++
++} pic_ctx_mmal_t;
++
++const char * str_fourcc(char * const buf, const unsigned int fcc);
++
++MMAL_FOURCC_T vlc_to_mmal_video_fourcc(const video_frame_format_t * const vf_vlc);
++MMAL_FOURCC_T vlc_to_mmal_color_space(const video_color_space_t vlc_cs);
++void hw_mmal_vlc_fmt_to_mmal_fmt(MMAL_ES_FORMAT_T *const es_fmt, const video_frame_format_t * const vf_vlc);
++// Returns true if fmt_changed
++// frame_rate ignored for compare, but is set if something else is updated
++bool hw_mmal_vlc_pic_to_mmal_fmt_update(MMAL_ES_FORMAT_T *const es_fmt, const picture_t * const pic);
++
++// Copy pic contents into an existing buffer
++int hw_mmal_copy_pic_to_buf(void * const buf_data, uint32_t * const pLength,
++ const MMAL_ES_FORMAT_T * const fmt, const picture_t * const pic);
++
++hw_mmal_port_pool_ref_t * hw_mmal_port_pool_ref_create(MMAL_PORT_T * const port,
++ const unsigned int headers, const uint32_t payload_size);
++void hw_mmal_port_pool_ref_release(hw_mmal_port_pool_ref_t * const ppr, const bool in_cb);
++bool hw_mmal_port_pool_ref_recycle(hw_mmal_port_pool_ref_t * const ppr, MMAL_BUFFER_HEADER_T * const buf);
++MMAL_STATUS_T hw_mmal_port_pool_ref_fill(hw_mmal_port_pool_ref_t * const ppr);
++// Take an additional reference on ppr
++static inline void hw_mmal_port_pool_ref_acquire(hw_mmal_port_pool_ref_t * const ppr)
++{
++    (void)atomic_fetch_add(&ppr->refs, 1);
++}
++MMAL_STATUS_T hw_mmal_opaque_output(vlc_object_t * const obj,
++ hw_mmal_port_pool_ref_t ** pppr,
++ MMAL_PORT_T * const port,
++ const unsigned int extra_buffers, MMAL_PORT_BH_CB_T callback);
++
++// Non-zero if pic carries at least one subpicture buffer, i.e. its context
++// holds more than the primary buffer. pic must have an MMAL context.
++static inline int hw_mmal_pic_has_sub_bufs(picture_t * const pic)
++{
++    const pic_ctx_mmal_t * const ctx = (pic_ctx_mmal_t *)pic->context;
++
++    return ctx->buf_count > 1;
++}
++
++// Append subpicture buffer sub to pic's context. If the context is already
++// full the buffer is released instead of being stored.
++static inline void hw_mmal_pic_sub_buf_add(picture_t * const pic, MMAL_BUFFER_HEADER_T * const sub)
++{
++    pic_ctx_mmal_t * const ctx = (pic_ctx_mmal_t *)pic->context;
++
++    if (ctx->buf_count < CTX_BUFS_MAX) {
++        ctx->bufs[ctx->buf_count] = sub;
++        ctx->buf_count += 1;
++    }
++    else {
++        mmal_buffer_header_release(sub);
++    }
++}
++
++// Returns the n-th (0-based) subpicture buffer attached to pic, or NULL if
++// there is no such buffer. Subpicture n lives in slot n + 1 of the
++// context, slot 0 being the primary buffer, so it exists only while
++// n + 1 < buf_count.
++static inline MMAL_BUFFER_HEADER_T * hw_mmal_pic_sub_buf_get(picture_t * const pic, const unsigned int n)
++{
++    pic_ctx_mmal_t * const ctx = (pic_ctx_mmal_t *)pic->context;
++
++    // The first test bounds the index by the array size (and stops n + 1
++    // wrapping); the second rejects slots that have not been filled
++    if (n >= CTX_BUFS_MAX - 1 || n + 1 >= ctx->buf_count)
++        return NULL;
++
++    return ctx->bufs[n + 1];
++}
++
++// True if chroma is the MMAL opaque chroma or one of the MMAL zero-copy
++// (ZC) chromas
++static inline bool hw_mmal_chroma_is_mmal(const vlc_fourcc_t chroma)
++{
++    if (chroma == VLC_CODEC_MMAL_OPAQUE)
++        return true;
++    if (chroma == VLC_CODEC_MMAL_ZC_SAND8)
++        return true;
++    if (chroma == VLC_CODEC_MMAL_ZC_SAND10)
++        return true;
++    if (chroma == VLC_CODEC_MMAL_ZC_SAND30)
++        return true;
++    if (chroma == VLC_CODEC_MMAL_ZC_I420)
++        return true;
++    if (chroma == VLC_CODEC_MMAL_ZC_RGB32)
++        return true;
++    return false;
++}
++
++// True if the chroma of pic's format is an MMAL chroma
++static inline bool hw_mmal_pic_is_mmal(const picture_t * const pic)
++{
++    const vlc_fourcc_t chroma = pic->format.i_chroma;
++
++    return hw_mmal_chroma_is_mmal(chroma);
++}
++
++picture_context_t * hw_mmal_pic_ctx_copy(picture_context_t * pic_ctx_cmn);
++void hw_mmal_pic_ctx_destroy(picture_context_t * pic_ctx_cmn);
++picture_context_t * hw_mmal_gen_context(
++ MMAL_BUFFER_HEADER_T * buf, hw_mmal_port_pool_ref_t * const ppr);
++
++int hw_mmal_get_gpu_mem(void);
++
++
++// Set the uint32 parameter id on port to val; returns the status of
++// mmal_port_parameter_set()
++static inline MMAL_STATUS_T port_parameter_set_uint32(MMAL_PORT_T * port, uint32_t id, uint32_t val)
++{
++    const MMAL_PARAMETER_UINT32_T param = {
++        .hdr = {
++            .id = id,
++            .size = sizeof(param)
++        },
++        .value = val
++    };
++
++    return mmal_port_parameter_set(port, &param.hdr);
++}
++
++// Set the boolean parameter id on port to val; returns the status of
++// mmal_port_parameter_set()
++static inline MMAL_STATUS_T port_parameter_set_bool(MMAL_PORT_T * const port, const uint32_t id, const bool val)
++{
++    const MMAL_PARAMETER_BOOLEAN_T param = {
++        .hdr = {
++            .id = id,
++            .size = sizeof(param)
++        },
++        .enable = val
++    };
++
++    return mmal_port_parameter_set(port, &param.hdr);
++}
++
++// Send a replica of src_buf to port.
++// A header is taken from rep_pool (blocking until one is available), made
++// to replicate src_buf, has its pts overwritten with seq and is sent.
++// Returns MMAL_SUCCESS, MMAL_ENOSPC if no header could be obtained, or the
++// error from replicate / send; on any error the replica header has been
++// released again.
++static inline MMAL_STATUS_T port_send_replicated(MMAL_PORT_T * const port, MMAL_POOL_T * const rep_pool,
++    MMAL_BUFFER_HEADER_T * const src_buf,
++    const uint64_t seq)
++{
++    MMAL_STATUS_T err;
++    MMAL_BUFFER_HEADER_T *const rep_buf = mmal_queue_wait(rep_pool->queue);
++
++    if (rep_buf == NULL)
++        return MMAL_ENOSPC;
++
++    if ((err = mmal_buffer_header_replicate(rep_buf, src_buf)) == MMAL_SUCCESS)
++    {
++        rep_buf->pts = seq;
++        err = mmal_port_send_buffer(port, rep_buf);
++    }
++
++    // Whichever step failed the header is still ours: hand it back rather
++    // than leak it from the pool
++    if (err != MMAL_SUCCESS)
++        mmal_buffer_header_release(rep_buf);
++
++    return err;
++}
++
++
++// Copy interlace / field order and timestamp from pic to buf.
++// The INTERLACED and TOP_FIELD_FIRST bits are set or cleared in both the
++// header flags and the video-specific flags; pts & dts both become
++// pic->date, or MMAL_TIME_UNKNOWN if the date is invalid.
++static inline void pic_to_buf_copy_props(MMAL_BUFFER_HEADER_T * const buf, const picture_t * const pic)
++{
++    const uint32_t field_mask =
++        MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED |
++        MMAL_BUFFER_HEADER_VIDEO_FLAG_TOP_FIELD_FIRST;
++    uint32_t field_bits = 0;
++
++    if (!pic->b_progressive)
++        field_bits |= MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED;
++    if (pic->b_top_field_first)
++        field_bits |= MMAL_BUFFER_HEADER_VIDEO_FLAG_TOP_FIELD_FIRST;
++
++    buf->flags = (buf->flags & ~field_mask) | field_bits;
++    buf->type->video.flags = (buf->type->video.flags & ~field_mask) | field_bits;
++
++    buf->pts = pic->date != VLC_TICK_INVALID ? pic->date : MMAL_TIME_UNKNOWN;
++    buf->dts = buf->pts;
++}
++
++// Copy interlace / field order and timestamp from buf to pic.
++// The date is taken from pts, failing that dts, failing that it is invalid.
++static inline void buf_to_pic_copy_props(picture_t * const pic, const MMAL_BUFFER_HEADER_T * const buf)
++{
++    // Contrary to the documentation the interlace & tff flags turn up in
++    // the header flags rather than the video specific flags (which appear
++    // to be currently unused).
++    const uint32_t hdr_flags = buf->flags;
++
++    pic->b_progressive = (hdr_flags & MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED) == 0;
++    pic->b_top_field_first = (hdr_flags & MMAL_BUFFER_HEADER_VIDEO_FLAG_TOP_FIELD_FIRST) != 0;
++
++    if (buf->pts != MMAL_TIME_UNKNOWN)
++        pic->date = buf->pts;
++    else if (buf->dts != MMAL_TIME_UNKNOWN)
++        pic->date = buf->dts;
++    else
++        pic->date = VLC_TICK_INVALID;
++}
++
++MMAL_BUFFER_HEADER_T * hw_mmal_pic_buf_copied(const picture_t *const pic,
++ MMAL_POOL_T * const rep_pool,
++ MMAL_PORT_T * const port,
++ cma_buf_pool_t * const cbp);
++
++MMAL_BUFFER_HEADER_T * hw_mmal_pic_buf_replicated(const picture_t *const pic, MMAL_POOL_T * const rep_pool);
++
++struct vzc_pool_ctl_s;
++typedef struct vzc_pool_ctl_s vzc_pool_ctl_t;
++
++// At the moment we cope with any mono-planar RGBA thing
++// We could cope with many other things but they currently don't occur
++extern const vlc_fourcc_t hw_mmal_vzc_subpicture_chromas[];
++// True if the chroma of vf_vlc appears in the zero-terminated
++// hw_mmal_vzc_subpicture_chromas list
++static inline bool hw_mmal_vzc_subpic_fmt_valid(const video_frame_format_t * const vf_vlc)
++{
++    const vlc_fourcc_t wanted = vf_vlc->i_chroma;
++    const vlc_fourcc_t * cand = hw_mmal_vzc_subpicture_chromas;
++
++    while (*cand != 0) {
++        if (*cand == wanted)
++            return true;
++        ++cand;
++    }
++
++    return false;
++}
++
++bool hw_mmal_vzc_buf_set_format(MMAL_BUFFER_HEADER_T * const buf, MMAL_ES_FORMAT_T * const es_fmt);
++MMAL_DISPLAYREGION_T * hw_mmal_vzc_buf_region(MMAL_BUFFER_HEADER_T * const buf);
++void hw_mmal_vzc_buf_scale_dest_rect(MMAL_BUFFER_HEADER_T * const buf, const MMAL_RECT_T * const scale_rect, const MMAL_DISPLAYTRANSFORM_T scale_transform);
++void hw_mmal_vzc_buf_get_wh(MMAL_BUFFER_HEADER_T * const buf, int * const pW, int * const pH);
++unsigned int hw_mmal_vzc_buf_seq(MMAL_BUFFER_HEADER_T * const buf);
++MMAL_BUFFER_HEADER_T * hw_mmal_vzc_buf_from_pic(vzc_pool_ctl_t * const pc, picture_t * const pic,
++ const MMAL_RECT_T dst_pic_rect,
++ const int x_offset, const int y_offset,
++ const unsigned int alpha, const bool is_first);
++void hw_mmal_vzc_buf_frame_size(MMAL_BUFFER_HEADER_T * const buf,
++ uint32_t * const pWidth, uint32_t * const pHeight);
++
++void hw_mmal_vzc_pool_flush(vzc_pool_ctl_t * const pc);
++void hw_mmal_vzc_pool_release(vzc_pool_ctl_t * const pc);
++void hw_mmal_vzc_pool_ref(vzc_pool_ctl_t * const pc);
++vzc_pool_ctl_t * hw_mmal_vzc_pool_new(void);
++
++
++// Returns the visible area of fmt (offset & visible size) as an MMAL rect
++static inline MMAL_RECT_T vis_mmal_rect(const video_format_t * const fmt)
++{
++    const MMAL_RECT_T visible = {
++        .x = fmt->i_x_offset,
++        .y = fmt->i_y_offset,
++        .width = fmt->i_visible_width,
++        .height = fmt->i_visible_height
++    };
++
++    return visible;
++}
++
++int cma_pic_set_data(picture_t * const pic,
++ const MMAL_ES_FORMAT_T * const mm_esfmt,
++ const MMAL_BUFFER_HEADER_T * const buf);
++
++// Attaches cma buf to pic
++// Marks in_flight if not all_in_flight anyway
++int cma_buf_pic_attach(cma_buf_t * const cb, picture_t * const pic);
++// Returns a pointer to the cma_buf attached to the pic
++// Just a pointer - doesn't add a ref
++cma_buf_t * cma_buf_pic_get(picture_t * const pic);
++
++// True if chroma is one of the MMAL zero-copy (ZC) chromas, i.e. the ones
++// a cma buf may be attached to (the opaque chroma is not included)
++static inline bool is_cma_buf_pic_chroma(const uint32_t chroma)
++{
++    if (chroma == VLC_CODEC_MMAL_ZC_RGB32)
++        return true;
++    if (chroma == VLC_CODEC_MMAL_ZC_SAND8)
++        return true;
++    if (chroma == VLC_CODEC_MMAL_ZC_SAND10)
++        return true;
++    if (chroma == VLC_CODEC_MMAL_ZC_SAND30)
++        return true;
++    if (chroma == VLC_CODEC_MMAL_ZC_I420)
++        return true;
++    return false;
++}
++
++
++int rpi_get_model_type(void);
++bool rpi_is_model_pi4(void);
++bool rpi_is_fkms_active(void);
++
++// Which vcsm backend cma_vcsm_init() managed to initialise:
++// NONE - neither (init failed or not yet attempted)
++// LEGACY - vcsm_init_ex(0, -1) succeeded
++// CMA - vcsm_init_ex(1, -1) succeeded
++typedef enum vcsm_init_type_e {
++ VCSM_INIT_NONE = 0,
++ VCSM_INIT_LEGACY,
++ VCSM_INIT_CMA
++} vcsm_init_type_t;
++
++vcsm_init_type_t cma_vcsm_init(void);
++void cma_vcsm_exit(const vcsm_init_type_t init_mode);
++vcsm_init_type_t cma_vcsm_type(void);
++const char * cma_vcsm_init_str(const vcsm_init_type_t init_mode);
++
+
+- MMAL_BUFFER_HEADER_T *buffer;
+- bool displayed;
+-};
++#define VOUT_DISPLAY_CHANGE_MMAL_BASE 1024
++#define VOUT_DISPLAY_CHANGE_MMAL_HIDE (VOUT_DISPLAY_CHANGE_MMAL_BASE + 0)
+
+-int mmal_picture_lock(picture_t *picture);
++#define MMAL_COMPONENT_DEFAULT_RESIZER "vc.ril.resize"
++#define MMAL_COMPONENT_ISP_RESIZER "vc.ril.isp"
++#define MMAL_COMPONENT_HVS "vc.ril.hvs"
+
+ #endif
+--- /dev/null
++++ b/modules/hw/mmal/rpi_prof.h
+@@ -0,0 +1,110 @@
++#ifndef RPI_PROFILE_H
++#define RPI_PROFILE_H
++
++#include <stdint.h>
++#include <inttypes.h>
++
++#ifndef RPI_PROFILE
++#define RPI_PROFILE 0
++#endif
++
++#if RPI_PROFILE
++
++#include "v7_pmu.h"
++
++// Profiling counters. X / Z turn the lines below into definitions when
++// RPI_PROC_ALLOC is defined (plain volatile, scalars initialised to 0) and
++// into extern declarations otherwise. Both helper macros are undefined
++// again once the counters have been declared.
++// Each RPI_<name>_MAX_DURATION is the limit at or above which a sample is
++// discarded by PROFILE_ACC / PROFILE_ACC_N.
++#ifdef RPI_PROC_ALLOC
++#define X volatile
++#define Z =0
++#else
++#define X extern volatile
++#define Z
++#endif
++
++X uint64_t av_rpi_prof0_cycles Z;
++X unsigned int av_rpi_prof0_cnt Z;
++#define RPI_prof0_MAX_DURATION 100000
++
++X uint64_t av_rpi_prof1_cycles Z;
++X unsigned int av_rpi_prof1_cnt Z;
++#define RPI_prof1_MAX_DURATION 100000
++
++X uint64_t av_rpi_prof2_cycles Z;
++X unsigned int av_rpi_prof2_cnt Z;
++#define RPI_prof2_MAX_DURATION 10000
++
++// Indexed counters used by the _N macro variants
++X uint64_t av_rpi_prof_n_cycles[128];
++X unsigned int av_rpi_prof_n_cnt[128];
++#define RPI_prof_n_MAX_DURATION 10000
++
++
++#undef X
++#undef Z
++
++#define PROFILE_INIT()\
++do {\
++ enable_pmu();\
++ enable_ccnt();\
++} while (0)
++
++// Start a timed section: opens a "do {" scope and samples the cycle
++// counter into perf_1. The scope is left open and MUST be closed by a
++// matching PROFILE_ACC() or PROFILE_ACC_N(), which supply "} while(0)".
++#define PROFILE_START()\
++do {\
++ volatile uint32_t perf_1 = read_ccnt();\
++ volatile uint32_t perf_2
++
++
++// End a timed section started by PROFILE_START() and add the elapsed
++// cycles to counter x (prof0, prof1, prof2). Samples of
++// RPI_<x>_MAX_DURATION cycles or more are discarded.
++#define PROFILE_ACC(x)\
++ perf_2 = read_ccnt();\
++ {\
++ const uint32_t duration = perf_2 - perf_1;\
++ if (duration < RPI_##x##_MAX_DURATION)\
++ {\
++ av_rpi_##x##_cycles += duration;\
++ av_rpi_##x##_cnt += 1;\
++ }\
++ }\
++} while(0)
++
++
++// As PROFILE_ACC() but accumulates into slot n of the indexed counters.
++// A negative n records nothing (the section is still closed).
++#define PROFILE_ACC_N(n)\
++ if ((n) >= 0) {\
++ perf_2 = read_ccnt();\
++ {\
++ const uint32_t duration = perf_2 - perf_1;\
++ if (duration < RPI_prof_n_MAX_DURATION)\
++ {\
++ av_rpi_prof_n_cycles[n] += duration;\
++ av_rpi_prof_n_cnt[n] += 1;\
++ }\
++ }\
++ }\
++} while(0)
++
++#define PROFILE_PRINTF(x)\
++ printf("%-20s cycles=%14" PRIu64 "; cnt=%8u; avg=%5" PRIu64 "\n", #x, av_rpi_##x##_cycles, av_rpi_##x##_cnt,\
++ av_rpi_##x##_cnt == 0 ? (uint64_t)0 : av_rpi_##x##_cycles / (uint64_t)av_rpi_##x##_cnt)
++
++#define PROFILE_PRINTF_N(n)\
++ printf("prof[%d] cycles=%14" PRIu64 "; cnt=%8u; avg=%5" PRIu64 "\n", (n), av_rpi_prof_n_cycles[n], av_rpi_prof_n_cnt[n],\
++ av_rpi_prof_n_cnt[n] == 0 ? (uint64_t)0 : av_rpi_prof_n_cycles[n] / (uint64_t)av_rpi_prof_n_cnt[n])
++
++#define PROFILE_CLEAR_N(n) \
++do {\
++ av_rpi_prof_n_cycles[n] = 0;\
++ av_rpi_prof_n_cnt[n] = 0;\
++} while(0)
++
++#else
++
++// No profile
++#define PROFILE_INIT()
++#define PROFILE_START()
++#define PROFILE_ACC(x)
++#define PROFILE_ACC_N(x)
++#define PROFILE_PRINTF(x)
++#define PROFILE_PRINTF_N(x)
++#define PROFILE_CLEAR_N(n)
++
++#endif
++
++#endif
++
+--- /dev/null
++++ b/modules/hw/mmal/subpic.c
+@@ -0,0 +1,257 @@
++/*****************************************************************************
++ * subpic.c: MMAL subpicture port helpers for Raspberry Pi
++ *****************************************************************************
++ * Authors: jc@kynesim.co.uk
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU Lesser General Public License as published by
++ * the Free Software Foundation; either version 2.1 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public License
++ * along with this program; if not, write to the Free Software Foundation,
++ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
++ *****************************************************************************/
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <stdatomic.h>
++
++#include <vlc_common.h>
++#include <vlc_plugin.h>
++#include <vlc_codec.h>
++#include <vlc_filter.h>
++#include <vlc_threads.h>
++
++#include <bcm_host.h>
++#include <interface/mmal/mmal.h>
++#include <interface/mmal/util/mmal_util.h>
++#include <interface/mmal/util/mmal_default_components.h>
++
++#include "mmal_picture.h"
++#include "subpic.h"
++
++
++#define TRACE_ALL 0
++
++// True if rects a and b are identical (same origin and same size)
++static inline bool cmp_rect(const MMAL_RECT_T * const a, const MMAL_RECT_T * const b)
++{
++    if (a->x != b->x || a->y != b->y)
++        return false;
++
++    return a->width == b->width && a->height == b->height;
++}
++
++// Disable the subpicture port if it is enabled and forget the sequence
++// number of the last subpicture sent. p_filter is unused.
++void hw_mmal_subpic_flush(vlc_object_t * const p_filter, subpic_reg_stash_t * const sub)
++{
++    MMAL_PORT_T * const port = sub->port;
++
++    VLC_UNUSED(p_filter);
++
++    if (port != NULL && port->is_enabled) {
++        mmal_port_disable(port);
++    }
++
++    sub->seq = 0;
++}
++
++// Flush the stash, destroy its header pool (if any) and reset every field
++void hw_mmal_subpic_close(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe)
++{
++    hw_mmal_subpic_flush(p_filter, spe);
++
++    if (spe->pool != NULL) {
++        mmal_pool_destroy(spe->pool);
++    }
++
++    // Zap to avoid any accidental reuse
++    *spe = (subpic_reg_stash_t){NULL};
++}
++
++// Initialise stash spe for subpicture port port: the stash is zeroed, zero
++// copy is enabled on the port and a pool of 30 payload-less headers is
++// created. p_filter is stored as the port's userdata and used for logging.
++// Returns MMAL_SUCCESS, the error from setting zero copy, or MMAL_ENOMEM
++// if the pool cannot be created.
++MMAL_STATUS_T hw_mmal_subpic_open(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe, MMAL_PORT_T * const port,
++    const int display_id, const unsigned int layer)
++{
++    MMAL_STATUS_T err;
++
++    // Start by zapping all to zero
++    *spe = (subpic_reg_stash_t){NULL};
++
++    err = port_parameter_set_bool(port, MMAL_PARAMETER_ZERO_COPY, true);
++    if (err != MMAL_SUCCESS)
++    {
++        msg_Err(p_filter, "Failed to set sub port zero copy");
++        return err;
++    }
++
++    spe->pool = mmal_pool_create(30, 0);
++    if (spe->pool == NULL)
++    {
++        msg_Err(p_filter, "Failed to create sub pool");
++        return MMAL_ENOMEM;
++    }
++
++    port->userdata = (void *)p_filter;
++
++    spe->port = port;
++    spe->display_id = display_id;
++    spe->layer = layer;
++
++    return MMAL_SUCCESS;
++}
++
++// Buffer callback passed to mmal_port_enable() for the subpicture port:
++// simply releases the returned header
++static void conv_subpic_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
++{
++#if !TRACE_ALL
++    VLC_UNUSED(port);
++#else
++    msg_Dbg((filter_t *)port->userdata, "<<< %s cmd=%d, user=%p, buf=%p, flags=%#x, len=%d/%d, pts=%lld",
++            __func__, buf->cmd, buf->user_data, buf, buf->flags, buf->length, buf->alloc_size, (long long)buf->pts);
++#endif
++
++    // Will extract & release pic in pool callback
++    mmal_buffer_header_release(buf);
++}
++
++// Send a header with no data to the subpicture port, stamped with pts.
++// Blocks until a header is available from the stash's pool.
++// Returns 0 on success, -1 if no header could be obtained or the send
++// failed (the header is released again in that case).
++static int
++subpic_send_empty(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe, const uint64_t pts)
++{
++    MMAL_BUFFER_HEADER_T * const empty = mmal_queue_wait(spe->pool->queue);
++
++    if (empty == NULL) {
++        msg_Err(p_filter, "Buffer get for subpic failed");
++        return -1;
++    }
++#if TRACE_ALL
++    msg_Dbg(p_filter, "Remove pic for sub %d", spe->seq);
++#endif
++    empty->cmd = 0;
++    empty->data = NULL;
++    empty->alloc_size = 0;
++    empty->offset = 0;
++    empty->flags = MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++    empty->pts = pts;
++    empty->dts = MMAL_TIME_UNKNOWN;
++    empty->user_data = NULL;
++
++    if (mmal_port_send_buffer(spe->port, empty) == MMAL_SUCCESS)
++        return 0;
++
++    msg_Err(p_filter, "Send buffer to subput failed");
++    mmal_buffer_header_release(empty);
++    return -1;
++}
++
++// Returns: < 0 Error
++// 0 Done & stop (no path in the function below returns 0 as written)
++// 1 Done & continue
++
++int hw_mmal_subpic_update(vlc_object_t * const p_filter,
++ MMAL_BUFFER_HEADER_T * const sub_buf, // NULL => no subpic buffer for this update
++ subpic_reg_stash_t * const spe, // per-port state; shadow fields are updated in place
++ const video_format_t * const fmt, // supplies frame rate and SAR for the port format
++ const MMAL_RECT_T * const scale_out, // passed to hw_mmal_vzc_buf_scale_dest_rect()
++ const MMAL_DISPLAYTRANSFORM_T transform_out, // passed to hw_mmal_vzc_buf_scale_dest_rect()
++ const uint64_t pts) // handed on with whichever buffer is sent
++{
++ MMAL_STATUS_T err;
++
++ if (sub_buf == NULL)
++ {
++ if (spe->port->is_enabled && spe->seq != 0) // only when the port is live and a non-zero seq is recorded
++ {
++ subpic_send_empty(p_filter, spe, pts); // result ignored; seq is reset regardless
++ spe->seq = 0;
++ }
++ }
++ else
++ {
++ const unsigned int seq = hw_mmal_vzc_buf_seq(sub_buf);
++ bool needs_update = (spe->seq != seq); // buffer's seq differs from the one last recorded
++
++ hw_mmal_vzc_buf_scale_dest_rect(sub_buf, scale_out, transform_out);
++
++ if (hw_mmal_vzc_buf_set_format(sub_buf, spe->port->format)) // NOTE(review): presumably non-zero when the port format was changed - confirm
++ {
++ MMAL_DISPLAYREGION_T * const dreg = hw_mmal_vzc_buf_region(sub_buf);
++ MMAL_VIDEO_FORMAT_T *const v_fmt = &spe->port->format->es->video;
++
++ v_fmt->frame_rate.den = fmt->i_frame_rate_base;
++ v_fmt->frame_rate.num = fmt->i_frame_rate;
++ v_fmt->par.den = fmt->i_sar_den;
++ v_fmt->par.num = fmt->i_sar_num;
++ v_fmt->color_space = MMAL_COLOR_SPACE_UNKNOWN;
++
++ if (needs_update || dreg->alpha != spe->alpha || !cmp_rect(&dreg->dest_rect, &spe->dest_rect)) { // NOTE(review): assumes cmp_rect() is true when the rects match - confirm
++
++ spe->alpha = dreg->alpha; // refresh the shadow copies used by the test above
++ spe->dest_rect = dreg->dest_rect;
++ needs_update = true; // a region change also forces the buffer to be sent
++
++ if (spe->display_id >= 0) // negative id: leave the display number unset
++ {
++ dreg->display_num = spe->display_id;
++ dreg->set |= MMAL_DISPLAY_SET_NUM;
++ }
++ dreg->layer = spe->layer;
++ dreg->set |= MMAL_DISPLAY_SET_LAYER;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "%s: Update region: Set=%x, dest=%dx%d @ (%d,%d), src=%dx%d @ (%d,%d), layer=%d, alpha=%#x",
++ __func__, dreg->set,
++ dreg->dest_rect.width, dreg->dest_rect.height, dreg->dest_rect.x, dreg->dest_rect.y,
++ dreg->src_rect.width, dreg->src_rect.height, dreg->src_rect.x, dreg->src_rect.y,
++ dreg->layer, dreg->alpha);
++#endif
++
++ // If now completely offscreen just flush this & return
++ // We only do -ve as (a) that is easy and (b) it seems to be
++ // something that can confuse mmal
++ if (dreg->dest_rect.y + dreg->dest_rect.height <= 0 ||
++ dreg->dest_rect.x + dreg->dest_rect.width <= 0)
++ {
++ if (spe->port->is_enabled)
++ subpic_send_empty(p_filter, spe, pts); // result ignored
++ spe->seq = seq; // record this buffer's seq even though it was not sent
++ return 1; // skips the parameter set, format commit and send below
++ }
++
++ if ((err = mmal_port_parameter_set(spe->port, &dreg->hdr)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Set display region on subput failed");
++ return -1; // spe->seq is left unchanged on this path
++ }
++
++ if ((err = mmal_port_format_commit(spe->port)) != MMAL_SUCCESS)
++ {
++ msg_Dbg(p_filter, "%s: Subpic commit fail: %d", __func__, err);
++ return -1;
++ }
++ }
++ }
++
++ if (!spe->port->is_enabled) // port is enabled here, on first use, rather than up front
++ {
++ spe->port->buffer_num = 30;
++ spe->port->buffer_size = spe->port->buffer_size_recommended; // Not used but shuts up the error checking
++
++ if ((err = mmal_port_enable(spe->port, conv_subpic_cb)) != MMAL_SUCCESS)
++ {
++ msg_Dbg(p_filter, "%s: Subpic enable fail: %d", __func__, err);
++ return -1;
++ }
++ }
++
++ if (needs_update)
++ {
++#if TRACE_ALL
++ msg_Dbg(p_filter, "Update pic for sub %d", spe->seq);
++#endif
++ if ((err = port_send_replicated(spe->port, spe->pool, sub_buf, pts)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to subput failed");
++ return -1;
++ }
++
++ spe->seq = seq; // recorded only after a successful send
++ }
++ }
++ return 1;
++}
++
++
++
+--- /dev/null
++++ b/modules/hw/mmal/subpic.h
+@@ -0,0 +1,33 @@
++#ifndef VLC_HW_MMAL_SUBPIC_H_
++#define VLC_HW_MMAL_SUBPIC_H_
++
++typedef struct subpic_reg_stash_s
++{
++ MMAL_PORT_T * port; // Port that subpic buffers and display-region settings are sent to
++ MMAL_POOL_T * pool; // Pool whose queue supplies the buffer headers sent to port
++ int display_id; // -1 => do not set
++ unsigned int layer; // Copied into the display region's layer on each region update
++ // Shadow vars so we can tell if stuff has changed
++ MMAL_RECT_T dest_rect;
++ unsigned int alpha;
++ unsigned int seq; // seq of the last buffer recorded by hw_mmal_subpic_update(); set to 0 when a NULL update clears it
++} subpic_reg_stash_t;
++
++int hw_mmal_subpic_update(vlc_object_t * const p_filter,
++ MMAL_BUFFER_HEADER_T * const sub_buf,
++ subpic_reg_stash_t * const spe,
++ const video_format_t * const fmt,
++ const MMAL_RECT_T * const scale_out,
++ const MMAL_DISPLAYTRANSFORM_T transform_out,
++ const uint64_t pts);
++
++void hw_mmal_subpic_flush(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe);
++
++void hw_mmal_subpic_close(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe);
++
++// If display id is -1 it will be unset
++MMAL_STATUS_T hw_mmal_subpic_open(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe, MMAL_PORT_T * const port,
++ const int display_id, const unsigned int layer);
++
++#endif
++
+--- /dev/null
++++ b/modules/hw/mmal/transform_ops.h
+@@ -0,0 +1,99 @@
++#ifndef VLC_MMAL_TRANSFORM_OPS_H
++#define VLC_MMAL_TRANSFORM_OPS_H
++
++#include <vlc_common.h>
++#include <vlc_picture.h>
++#include <interface/mmal/mmal.h>
++
++
++// These are enums with the same order so simply coerce
++static inline MMAL_DISPLAYTRANSFORM_T vlc_to_mmal_transform(const video_orientation_t orientation){
++ return (MMAL_DISPLAYTRANSFORM_T)orientation;
++}
++
++// MMAL headers comment these (getting 2 a bit wrong) but do not give
++// defines
++#define XFORM_H_SHIFT 0 // Hflip
++#define XFORM_V_SHIFT 1 // Vflip
++#define XFORM_T_SHIFT 2 // Transpose
++#define XFORM_H_BIT (1 << XFORM_H_SHIFT)
++#define XFORM_V_BIT (1 << XFORM_V_SHIFT)
++#define XFORM_T_BIT (1 << XFORM_T_SHIFT)
++
++static inline bool
++is_transform_transpose(const MMAL_DISPLAYTRANSFORM_T t)
++{
++    return (((unsigned int)t >> XFORM_T_SHIFT) & 1u) != 0u; // true when the transpose flag is set
++}
++
++static inline bool
++is_transform_hflip(const MMAL_DISPLAYTRANSFORM_T t)
++{
++    return (((unsigned int)t >> XFORM_H_SHIFT) & 1u) != 0u; // true when the hflip flag is set
++}
++
++static inline bool
++is_transform_vflip(const MMAL_DISPLAYTRANSFORM_T t)
++{
++    return (((unsigned int)t >> XFORM_V_SHIFT) & 1u) != 0u; // true when the vflip flag is set
++}
++
++static inline MMAL_DISPLAYTRANSFORM_T
++swap_transform_hv(const MMAL_DISPLAYTRANSFORM_T x)
++{
++    const int h_flag = (x >> XFORM_H_SHIFT) & 1; // current hflip flag, re-emitted in the vflip position
++    const int v_flag = (x >> XFORM_V_SHIFT) & 1; // current vflip flag, re-emitted in the hflip position
++    return (h_flag << XFORM_V_SHIFT) | (v_flag << XFORM_H_SHIFT) | (x & XFORM_T_BIT); // transpose flag passes through
++}
++
++static inline MMAL_DISPLAYTRANSFORM_T
++transform_inverse(const MMAL_DISPLAYTRANSFORM_T x)
++{
++    return !is_transform_transpose(x) ? x : swap_transform_hv(x); // without a transpose x is returned unchanged
++}
++
++// Compose two transforms: the result stands for applying a, then b.
++// Every op is its own inverse, so the flag bits combine with XOR,
++// except that when a transposes, the H and V flips of b trade places first.
++static inline MMAL_DISPLAYTRANSFORM_T
++combine_transform(const MMAL_DISPLAYTRANSFORM_T a, const MMAL_DISPLAYTRANSFORM_T b)
++{
++    return is_transform_transpose(a) ? a ^ swap_transform_hv(b) : a ^ b;
++}
++
++static inline MMAL_RECT_T
++rect_transpose(const MMAL_RECT_T s)
++{
++    MMAL_RECT_T swapped; // s with its axes exchanged: x<->y, width<->height
++    swapped.x = s.y;
++    swapped.y = s.x;
++    swapped.width = s.height;
++    swapped.height = s.width;
++    return swapped;
++}
++
++// Mirror s horizontally inside the containing rect c
++static inline MMAL_RECT_T rect_hflip(const MMAL_RECT_T s, const MMAL_RECT_T c)
++{
++    // Start from s: y, width and height are carried over untouched
++    MMAL_RECT_T flipped = s;
++
++    // The gap between the right edges of s and c becomes the new left gap
++    flipped.x = c.x + (c.x + c.width) - (s.x + s.width);
++    return flipped;
++}
++
++// Mirror s vertically inside the containing rect c
++static inline MMAL_RECT_T rect_vflip(const MMAL_RECT_T s, const MMAL_RECT_T c)
++{
++    // Start from s: x, width and height are carried over untouched
++    MMAL_RECT_T flipped = s;
++
++    // The gap between the bottom edges of s and c becomes the new top gap
++    flipped.y = (c.y + c.height) - (s.y - c.y) - s.height;
++    return flipped;
++}
++
++
++#endif
++
+--- /dev/null
++++ b/modules/hw/mmal/v7_pmu.S
+@@ -0,0 +1,263 @@
++/*------------------------------------------------------------
++Performance Monitor Block
++------------------------------------------------------------*/
++ .arm @ Make sure we are in ARM mode.
++ .text
++ .align 2
++ .global getPMN @ export this function for the linker
++
++/* Returns the number of programmable counters uint32_t getPMN(void) */
++
++getPMN:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC Register */
++ MOV r0, r0, LSR #11 /* Shift N field down to bit 0 */
++ AND r0, r0, #0x1F /* Mask to leave just the 5 N bits */
++ BX lr
++
++
++
++ .global pmn_config @ export this function for the linker
++ /* Sets the event for a programmable counter to record */
++ /* void pmn_config(unsigned counter, uint32_t event) */
++ /* counter = r0 = Which counter to program (e.g. 0 for PMN0, 1 for PMN1 */
++ /* event = r1 = The event code */
++pmn_config:
++ AND r0, r0, #0x1F /* Mask to leave only bits 4:0 */
++ MCR p15, 0, r0, c9, c12, 5 /* Write PMNXSEL Register */
++ MCR p15, 0, r1, c9, c13, 1 /* Write EVTSELx Register */
++ BX lr
++
++
++
++ .global ccnt_divider @ export this function for the linker
++ /* Enables/disables the divider (1/64) on CCNT */
++ /* void ccnt_divider(int divider) */
++ /* divider = r0 = If 0 disable divider, else enable divider */
++ccnt_divider:
++ MRC p15, 0, r1, c9, c12, 0 /* Read PMNC */
++
++ CMP r0, #0x0 /* IF (r0 == 0) */
++ BICEQ r1, r1, #0x08 /* THEN: Clear the D bit (disables the divider) */
++ ORRNE r1, r1, #0x08 /* ELSE: Set the D bit (enables the divider) */
++
++ MCR p15, 0, r1, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++ /* --------------------------------------------------------------- */
++ /* Enable/Disable */
++ /* --------------------------------------------------------------- */
++
++ .global enable_pmu @ export this function for the linker
++ /* Global PMU enable */
++ /* void enable_pmu(void) */
++enable_pmu:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC */
++ ORR r0, r0, #0x01 /* Set E bit */
++ MCR p15, 0, r0, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++
++ .global disable_pmu @ export this function for the linker
++ /* Global PMU disable */
++ /* void disable_pmu(void) */
++disable_pmu:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC */
++ BIC r0, r0, #0x01 /* Clear E bit */
++ MCR p15, 0, r0, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++
++ .global enable_ccnt @ export this function for the linker
++ /* Enable the CCNT */
++ /* void enable_ccnt(void) */
++enable_ccnt:
++ MOV r0, #0x80000000 /* Set C bit */
++ MCR p15, 0, r0, c9, c12, 1 /* Write CNTENS Register */
++ BX lr
++
++
++
++ .global disable_ccnt @ export this function for the linker
++ /* Disable the CCNT */
++ /* void disable_ccnt(void) */
++disable_ccnt:
++ MOV r0, #0x80000000 /* Clear C bit */
++ MCR p15, 0, r0, c9, c12, 2 /* Write CNTENC Register */
++ BX lr
++
++
++
++ .global enable_pmn @ export this function for the linker
++ /* Enable PMN{n} */
++ /* void enable_pmn(uint32_t counter) */
++ /* counter = r0 = The counter to enable (e.g. 0 for PMN0, 1 for PMN1) */
++enable_pmn:
++ MOV r1, #0x1 /* Use arg (r0) to set which counter to enable */
++ MOV r1, r1, LSL r0 /* r1 = 1 << counter */
++
++ MCR p15, 0, r1, c9, c12, 1 /* Write CNTENS Register */
++ BX lr
++
++
++
++ .global disable_pmn @ export this function for the linker
++ /* Disable PMN{n} */
++ /* void disable_pmn(uint32_t counter) */
++ /* counter = r0 = The counter to disable (e.g. 0 for PMN0, 1 for PMN1) */
++disable_pmn:
++ MOV r1, #0x1 /* Use arg (r0) to set which counter to disable */
++ MOV r1, r1, LSL r0 /* r1 = 1 << counter */
++
++ MCR p15, 0, r1, c9, c12, 2 /* Write CNTENC Register (as disable_ccnt does) */
++ BX lr
++
++
++
++ .global enable_pmu_user_access @ export this function for the linker
++ /* Enables User mode access to the PMU (must be called in a privileged mode) */
++ /* void enable_pmu_user_access(void) */
++enable_pmu_user_access:
++ MRC p15, 0, r0, c9, c14, 0 /* Read PMUSERENR Register */
++ ORR r0, r0, #0x01 /* Set EN bit (bit 0) */
++ MCR p15, 0, r0, c9, c14, 0 /* Write PMUSERENR Register */
++ BX lr
++
++
++
++ .global disable_pmu_user_access @ export this function for the linke
++ /* Disables User mode access to the PMU (must be called in a privileged mode) */
++ /* void disable_pmu_user_access(void) */
++disable_pmu_user_access:
++ MRC p15, 0, r0, c9, c14, 0 /* Read PMUSERENR Register */
++ BIC r0, r0, #0x01 /* Clear EN bit (bit 0) */
++ MCR p15, 0, r0, c9, c14, 0 /* Write PMUSERENR Register */
++ BX lr
++
++
++ /* --------------------------------------------------------------- */
++ /* Counter read registers */
++ /* --------------------------------------------------------------- */
++
++ .global read_ccnt @ export this function for the linker
++ /* Returns the value of CCNT */
++ /* uint32_t read_ccnt(void) */
++read_ccnt:
++ MRC p15, 0, r0, c9, c13, 0 /* Read CCNT Register */
++ BX lr
++
++
++ .global read_pmn @ export this function for the linker
++ /* Returns the value of PMN{n} */
++ /* uint32_t read_pmn(uint32_t counter) */
++ /* counter = r0 = The counter to read (e.g. 0 for PMN0, 1 for PMN1) */
++read_pmn:
++ AND r0, r0, #0x1F /* Mask to leave only bits 4:0 */
++ MCR p15, 0, r0, c9, c12, 5 /* Write PMNXSEL Register */
++ MRC p15, 0, r0, c9, c13, 2 /* Read current PMNx Register */
++ BX lr
++
++
++ /* --------------------------------------------------------------- */
++ /* Software Increment */
++ /* --------------------------------------------------------------- */
++
++ .global pmu_software_increment @ export this function for the linker
++ /* Writes to software increment register */
++ /* void pmu_software_increment(uint32_t counter) */
++ /* counter = r0 = The counter to increment (e.g. 0 for PMN0, 1 for PMN1) */
++pmu_software_increment:
++ MOV r1, #0x01
++ MOV r1, r1, LSL r0 /* r1 = 1 << counter */
++ MCR p15, 0, r1, c9, c12, 4 /* Write SWINCR Register */
++ BX lr
++
++ /* --------------------------------------------------------------- */
++ /* Overflow & Interrupt Generation */
++ /* --------------------------------------------------------------- */
++
++ .global read_flags @ export this function for the linker
++ /* Returns the value of the overflow flags */
++ /* uint32_t read_flags(void) */
++read_flags:
++ MRC p15, 0, r0, c9, c12, 3 /* Read FLAG Register */
++ BX lr
++
++
++ .global write_flags @ export this function for the linker
++ /* Writes the overflow flags */
++ /* void write_flags(uint32_t flags) */
++write_flags:
++ MCR p15, 0, r0, c9, c12, 3 /* Write FLAG Register */
++ BX lr
++
++
++ .global enable_ccnt_irq @ export this function for the linker
++ /* Enables interrupt generation on overflow of the CCNT */
++ /* void enable_ccnt_irq(void) */
++enable_ccnt_irq:
++ MOV r0, #0x80000000
++ MCR p15, 0, r0, c9, c14, 1 /* Write INTENS Register */
++ BX lr
++
++ .global disable_ccnt_irq @ export this function for the linker
++ /* Disables interrupt generation on overflow of the CCNT */
++ /* void disable_ccnt_irq(void) */
++disable_ccnt_irq:
++ MOV r0, #0x80000000
++ MCR p15, 0, r0, c9, c14, 2 /* Write INTENC Register */
++ BX lr
++
++
++ .global enable_pmn_irq @ export this function for the linker
++ /* Enables interrupt generation on overflow of PMN{x} */
++ /* void enable_pmn_irq(uint32_t counter) */
++ /* counter = r0 = The counter to enable the interrupt for (e.g. 0 for PMN0, 1 for PMN1) */
++enable_pmn_irq:
++ MOV r1, #0x1 /* Use arg (r0) to set which counter */
++ MOV r0, r1, LSL r0 /* r0 = 1 << counter */
++ MCR p15, 0, r0, c9, c14, 1 /* Write INTENS Register */
++ BX lr
++
++ .global disable_pmn_irq @ export this function for the linker
++ /* Disables interrupt generation on overflow of PMN{x} */
++ /* void disable_pmn_irq(uint32_t counter) */
++ /* counter = r0 = The counter to disable the interrupt for (e.g. 0 for PMN0, 1 for PMN1) */
++disable_pmn_irq:
++ MOV r1, #0x1 /* Use arg (r0) to set which counter to disable */
++ MOV r0, r1, LSL r0 /* r0 = 1 << counter */
++ MCR p15, 0, r0, c9, c14, 2 /* Write INTENC Register */
++ BX lr
++
++ /* --------------------------------------------------------------- */
++ /* Reset Functions */
++ /* --------------------------------------------------------------- */
++
++ .global reset_pmn @ export this function for the linker
++ /* Resets the programmable counters */
++ /* void reset_pmn(void) */
++reset_pmn:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC */
++ ORR r0, r0, #0x02 /* Set P bit (Event Counter Reset) */
++ MCR p15, 0, r0, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++ .global reset_ccnt @ export this function for the linker
++ /* Resets the CCNT */
++ /* void reset_ccnt(void) */
++reset_ccnt:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC */
++ ORR r0, r0, #0x04 /* Set C bit (Cycle Counter Reset) */
++ MCR p15, 0, r0, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++ .end @end of code, this line is optional.
++/* ------------------------------------------------------------ */
++/* End of v7_pmu.s */
++/* ------------------------------------------------------------ */
++
++
+--- /dev/null
++++ b/modules/hw/mmal/v7_pmu.h
+@@ -0,0 +1,113 @@
++// ------------------------------------------------------------
++// PMU for Cortex-A/R (v7-A/R)
++// ------------------------------------------------------------
++
++#ifndef _V7_PMU_H
++#define _V7_PMU_H
++
++// Returns the number of programmable counters
++unsigned int getPMN(void);
++
++// Sets the event for a programmable counter to record
++// counter = r0 = Which counter to program (e.g. 0 for PMN0, 1 for PMN1)
++// event = r1 = The event code (from appropriate TRM or ARM Architecture Reference Manual)
++void pmn_config(unsigned int counter, unsigned int event);
++
++// Enables/disables the divider (1/64) on CCNT
++// divider = r0 = If 0 disable divider, else enable divider
++void ccnt_divider(int divider);
++
++//
++// Enables and disables
++//
++
++// Global PMU enable
++// On ARM11 this enables the PMU, and the counters start immediately
++// On Cortex this enables the PMU, there are individual enables for the counters
++void enable_pmu(void);
++
++// Global PMU disable
++// On Cortex, this overrides the enable state of the individual counters
++void disable_pmu(void);
++
++// Enable the CCNT
++void enable_ccnt(void);
++
++// Disable the CCNT
++void disable_ccnt(void);
++
++// Enable PMN{n}
++// counter = The counter to enable (e.g. 0 for PMN0, 1 for PMN1)
++void enable_pmn(unsigned int counter);
++
++// Disable PMN{n}
++// counter = The counter to disable (e.g. 0 for PMN0, 1 for PMN1)
++void disable_pmn(unsigned int counter);
++
++//
++// Read counter values
++//
++
++// Returns the value of CCNT
++unsigned int read_ccnt(void);
++
++// Returns the value of PMN{n}
++// counter = The counter to read (e.g. 0 for PMN0, 1 for PMN1)
++unsigned int read_pmn(unsigned int counter);
++
++//
++// Overflow and interrupts
++//
++
++// Returns the value of the overflow flags
++unsigned int read_flags(void);
++
++// Writes the overflow flags
++void write_flags(unsigned int flags);
++
++// Enables interrupt generation on overflow of the CCNT
++void enable_ccnt_irq(void);
++
++// Disables interrupt generation on overflow of the CCNT
++void disable_ccnt_irq(void);
++
++// Enables interrupt generation on overflow of PMN{x}
++// counter = The counter to enable the interrupt for (e.g. 0 for PMN0, 1 for PMN1)
++void enable_pmn_irq(unsigned int counter);
++
++// Disables interrupt generation on overflow of PMN{x}
++// counter = r0 = The counter to disable the interrupt for (e.g. 0 for PMN0, 1 for PMN1)
++void disable_pmn_irq(unsigned int counter);
++
++//
++// Counter reset functions
++//
++
++// Resets the programmable counters
++void reset_pmn(void);
++
++// Resets the CCNT
++void reset_ccnt(void);
++
++//
++// Software Increment
++
++// Writes to software increment register
++// counter = The counter to increment (e.g. 0 for PMN0, 1 for PMN1)
++void pmu_software_increment(unsigned int counter);
++
++//
++// User mode access
++//
++
++// Enables User mode access to the PMU (must be called in a privileged mode)
++void enable_pmu_user_access(void);
++
++// Disables User mode access to the PMU (must be called in a privileged mode)
++void disable_pmu_user_access(void);
++
++#endif
++// ------------------------------------------------------------
++// End of v7_pmu.h
++// ------------------------------------------------------------
++
+--- a/modules/hw/mmal/vout.c
++++ b/modules/hw/mmal/vout.c
+@@ -27,21 +27,28 @@
+ #endif
+
+ #include <math.h>
++#include <stdatomic.h>
+
+ #include <vlc_common.h>
+-#include <vlc_atomic.h>
+ #include <vlc_plugin.h>
+ #include <vlc_threads.h>
+ #include <vlc_vout_display.h>
++#include <vlc_modules.h>
+
+-#include "mmal_picture.h"
+-
++#pragma GCC diagnostic push
++#pragma GCC diagnostic ignored "-Wbad-function-cast"
+ #include <bcm_host.h>
++#pragma GCC diagnostic pop
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/util/mmal_util.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+ #include <interface/vmcs_host/vc_tvservice.h>
+-#include <interface/vmcs_host/vc_dispmanx.h>
++
++#include "mmal_picture.h"
++#include "subpic.h"
++#include "transform_ops.h"
++
++#define TRACE_ALL 0
+
+ #define MAX_BUFFERS_IN_TRANSIT 1
+ #define VC_TV_MAX_MODE_IDS 127
+@@ -50,10 +57,28 @@
+ #define MMAL_LAYER_TEXT N_("VideoCore layer where the video is displayed.")
+ #define MMAL_LAYER_LONGTEXT N_("VideoCore layer where the video is displayed. Subpictures are displayed directly above and a black background directly below.")
+
+-#define MMAL_BLANK_BACKGROUND_NAME "mmal-blank-background"
+-#define MMAL_BLANK_BACKGROUND_TEXT N_("Blank screen below video.")
+-#define MMAL_BLANK_BACKGROUND_LONGTEXT N_("Render blank screen below video. " \
+- "Increases VideoCore load.")
++#define MMAL_DISPLAY_NAME "mmal-display"
++#define MMAL_DISPLAY_TEXT N_("Output device for Rpi fullscreen.")
++#define MMAL_DISPLAY_LONGTEXT N_("Output device for Rpi fullscreen. " \
++"Valid values are HDMI-1,HDMI-2. By default if qt-fullscreen-screennumber " \
++"is specified (or set by Fullscreen Output Device in Preferences) " \
++"HDMI-<qt-fullscreen-screennumber+1> will be used, otherwise HDMI-1.")
++
++#define MMAL_VOUT_TRANSFORM_NAME "mmal-vout-transform" /* NOTE(review): presumably the config variable name, like MMAL_LAYER_NAME - confirm */
++#define MMAL_VOUT_TRANSFORM_TEXT N_("Video transform for Rpi fullscreen.")
++#define MMAL_VOUT_TRANSFORM_LONGTEXT N_("Video transform for Rpi fullscreen. "\
++"Transforms available: auto, 0, 90, 180, 270, hflip, vflip, transpose, antitranspose")
++
++#define MMAL_VOUT_WINDOW_NAME "mmal-vout-window" /* NOTE(review): presumably the config variable name, like MMAL_LAYER_NAME - confirm */
++#define MMAL_VOUT_WINDOW_TEXT N_("Display window for Rpi fullscreen")
++#define MMAL_VOUT_WINDOW_LONGTEXT N_("Display window for Rpi fullscreen. "\
++"fullscreen|<width>x<height>+<x>+<y>")
++
++#define MMAL_VOUT_TRANSPARENT_NAME "mmal-vout-transparent" /* NOTE(review): presumably the config variable name, like MMAL_LAYER_NAME - confirm */
++#define MMAL_VOUT_TRANSPARENT_TEXT N_("Enable layers beneath the video layer.")
++#define MMAL_VOUT_TRANSPARENT_LONGTEXT N_("Enable layers beneath the video layer."\
++" By default these are disabled."\
++" Having the lower layers enabled can impact video performance")
+
+ #define MMAL_ADJUST_REFRESHRATE_NAME "mmal-adjust-refreshrate"
+ #define MMAL_ADJUST_REFRESHRATE_TEXT N_("Adjust HDMI refresh rate to the video.")
+@@ -68,332 +93,628 @@
+ #define PHASE_OFFSET_TARGET ((double)0.25)
+ #define PHASE_CHECK_INTERVAL 100
+
+-static int Open(vlc_object_t *);
+-static void Close(vlc_object_t *);
+-
+-vlc_module_begin()
+- set_shortname(N_("MMAL vout"))
+- set_description(N_("MMAL-based vout plugin for Raspberry Pi"))
+- set_capability("vout display", 90)
+- add_shortcut("mmal_vout")
+- add_integer(MMAL_LAYER_NAME, 1, MMAL_LAYER_TEXT, MMAL_LAYER_LONGTEXT, false)
+- add_bool(MMAL_BLANK_BACKGROUND_NAME, true, MMAL_BLANK_BACKGROUND_TEXT,
+- MMAL_BLANK_BACKGROUND_LONGTEXT, true);
+- add_bool(MMAL_ADJUST_REFRESHRATE_NAME, false, MMAL_ADJUST_REFRESHRATE_TEXT,
+- MMAL_ADJUST_REFRESHRATE_LONGTEXT, false)
+- add_bool(MMAL_NATIVE_INTERLACED, false, MMAL_NATIVE_INTERLACE_TEXT,
+- MMAL_NATIVE_INTERLACE_LONGTEXT, false)
+- set_callbacks(Open, Close)
+-vlc_module_end()
++#define SUBS_MAX 4
+
+-struct dmx_region_t {
+- struct dmx_region_t *next;
+- picture_t *picture;
+- VC_RECT_T bmp_rect;
+- VC_RECT_T src_rect;
+- VC_RECT_T dst_rect;
+- VC_DISPMANX_ALPHA_T alpha;
+- DISPMANX_ELEMENT_HANDLE_T element;
+- DISPMANX_RESOURCE_HANDLE_T resource;
+- int32_t pos_x;
+- int32_t pos_y;
+-};
++typedef struct vout_subpic_s {
++ MMAL_COMPONENT_T *component;
++ subpic_reg_stash_t sub;
++} vout_subpic_t;
+
+ struct vout_display_sys_t {
+- vlc_cond_t buffer_cond;
+- vlc_mutex_t buffer_mutex;
+ vlc_mutex_t manage_mutex;
+
+- plane_t planes[3]; /* Depending on video format up to 3 planes are used */
+- picture_t **pictures; /* Actual list of alloced pictures passed into picture_pool */
+- picture_pool_t *picture_pool;
+-
++ vcsm_init_type_t init_type;
+ MMAL_COMPONENT_T *component;
+ MMAL_PORT_T *input;
+ MMAL_POOL_T *pool; /* mmal buffer headers, used for pushing pictures to component*/
+- struct dmx_region_t *dmx_region;
+ int i_planes; /* Number of actually used planes, 1 for opaque, 3 for i420 */
+
+- uint32_t buffer_size; /* size of actual mmal buffers */
+ int buffers_in_transit; /* number of buffers currently pushed to mmal component */
+ unsigned num_buffers; /* number of buffers allocated at mmal port */
+
+- DISPMANX_DISPLAY_HANDLE_T dmx_handle;
+- DISPMANX_ELEMENT_HANDLE_T bkg_element;
+- DISPMANX_RESOURCE_HANDLE_T bkg_resource;
+- unsigned display_width;
+- unsigned display_height;
++ int display_id;
++ MMAL_RECT_T win_rect; // Window rect after transform(s)
++ MMAL_RECT_T display_rect; // Actual shape of display (x, y always 0)
++ MMAL_RECT_T req_win; // User requested window (w=0 => fullscreen)
++
++ MMAL_RECT_T spu_rect; // Output rectangle in cfg coords (for subpic placement)
++ MMAL_RECT_T dest_rect; // Output rectangle in display coords
++ MMAL_DISPLAYTRANSFORM_T dest_transform; // Dest window coord transform
++ MMAL_DISPLAYTRANSFORM_T display_transform; // "Native" display transform
++ MMAL_DISPLAYTRANSFORM_T video_transform; // Combined config+native transform
+
+- int i_frame_rate_base; /* cached framerate to detect changes for rate adjustment */
+- int i_frame_rate;
++ unsigned int i_frame_rate_base; /* cached framerate to detect changes for rate adjustment */
++ unsigned int i_frame_rate;
+
+ int next_phase_check; /* lowpass for phase check frequency */
+ int phase_offset; /* currently applied offset to presentation time in ns */
+ int layer; /* the dispman layer (z-index) used for video rendering */
++ bool transparent; // Do not disable layers beneath ours
+
+ bool need_configure_display; /* indicates a required display reconfigure to main thread */
+ bool adjust_refresh_rate;
+ bool native_interlaced;
+ bool b_top_field_first; /* cached interlaced settings to detect changes for native mode */
+ bool b_progressive;
+- bool opaque; /* indicated use of opaque picture format (zerocopy) */
+-};
++ bool force_config;
+
+-static const vlc_fourcc_t subpicture_chromas[] = {
+- VLC_CODEC_RGBA,
+- 0
+-};
++ vout_subpic_t subs[SUBS_MAX];
++ // Stash for subpics derived from the passed subpicture rather than
++ // included with the main pic
++ MMAL_BUFFER_HEADER_T * subpic_bufs[SUBS_MAX];
++
++ picture_pool_t * pic_pool;
++
++ struct vout_isp_conf_s {
++ MMAL_COMPONENT_T *component;
++ MMAL_PORT_T * input;
++ MMAL_PORT_T * output;
++ MMAL_QUEUE_T * out_q;
++ MMAL_POOL_T * in_pool;
++ MMAL_POOL_T * out_pool;
++ bool pending;
++ } isp;
+
+-/* Utility functions */
+-static inline uint32_t align(uint32_t x, uint32_t y);
+-static int configure_display(vout_display_t *vd, const vout_display_cfg_t *cfg,
+- const video_format_t *fmt);
++ MMAL_POOL_T * copy_pool;
++ MMAL_BUFFER_HEADER_T * copy_buf;
+
+-/* VLC vout display callbacks */
+-static picture_pool_t *vd_pool(vout_display_t *vd, unsigned count);
+-static void vd_prepare(vout_display_t *vd, picture_t *picture,
+- subpicture_t *subpicture);
+-static void vd_display(vout_display_t *vd, picture_t *picture,
+- subpicture_t *subpicture);
+-static int vd_control(vout_display_t *vd, int query, va_list args);
+-static void vd_manage(vout_display_t *vd);
+-
+-/* MMAL callbacks */
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
++ // Subpic blend if we have to do it here
++ vzc_pool_ctl_t * vzc;
++};
+
+-/* TV service */
+-static int query_resolution(vout_display_t *vd, unsigned *width, unsigned *height);
+-static void tvservice_cb(void *callback_data, uint32_t reason, uint32_t param1,
+- uint32_t param2);
+-static void adjust_refresh_rate(vout_display_t *vd, const video_format_t *fmt);
+-static int set_latency_target(vout_display_t *vd, bool enable);
+
+-/* DispManX */
+-static void display_subpicture(vout_display_t *vd, subpicture_t *subpicture);
+-static void close_dmx(vout_display_t *vd);
+-static struct dmx_region_t *dmx_region_new(vout_display_t *vd,
+- DISPMANX_UPDATE_HANDLE_T update, subpicture_region_t *region);
+-static void dmx_region_update(struct dmx_region_t *dmx_region,
+- DISPMANX_UPDATE_HANDLE_T update, picture_t *picture);
+-static void dmx_region_delete(struct dmx_region_t *dmx_region,
+- DISPMANX_UPDATE_HANDLE_T update);
+-static void show_background(vout_display_t *vd, bool enable);
+-static void maintain_phase_sync(vout_display_t *vd);
++// ISP setup
+
+-static int Open(vlc_object_t *object)
++static inline bool want_isp(const vout_display_t * const vd)
+ {
+- vout_display_t *vd = (vout_display_t *)object;
+- vout_display_sys_t *sys;
+- uint32_t buffer_pitch, buffer_height;
+- vout_display_place_t place;
+- MMAL_DISPLAYREGION_T display_region;
+- MMAL_STATUS_T status;
+- int ret = VLC_SUCCESS;
+- unsigned i;
++ return (vd->fmt.i_chroma == VLC_CODEC_MMAL_ZC_SAND10);
++}
+
+- if (vout_display_IsWindowed(vd))
+- return VLC_EGENERIC;
++static inline bool want_copy(const vout_display_t * const vd)
++{
++ return (vd->fmt.i_chroma == VLC_CODEC_I420 || vd->fmt.i_chroma == VLC_CODEC_I420_10L);
++}
+
+- sys = calloc(1, sizeof(struct vout_display_sys_t));
+- if (!sys)
+- return VLC_ENOMEM;
+- vd->sys = sys;
++static inline vlc_fourcc_t req_chroma(const vout_display_t * const vd)
++{
++ return !hw_mmal_chroma_is_mmal(vd->fmt.i_chroma) && !want_copy(vd) ?
++ VLC_CODEC_I420 :
++ vd->fmt.i_chroma;
++}
+
+- sys->layer = var_InheritInteger(vd, MMAL_LAYER_NAME);
+- bcm_host_init();
++static MMAL_FOURCC_T vout_vlc_to_mmal_pic_fourcc(const unsigned int fcc)
++{
++ switch (fcc){
++ case VLC_CODEC_MMAL_OPAQUE:
++ return MMAL_ENCODING_OPAQUE;
++ case VLC_CODEC_MMAL_ZC_SAND8:
++ return MMAL_ENCODING_YUVUV128;
++ case VLC_CODEC_MMAL_ZC_SAND10:
++ return MMAL_ENCODING_YUVUV64_10;
++ case VLC_CODEC_MMAL_ZC_SAND30:
++ return MMAL_ENCODING_YUV10_COL;
++ case VLC_CODEC_MMAL_ZC_I420:
++ case VLC_CODEC_I420:
++ return MMAL_ENCODING_I420;
++ default:
++ break;
++ }
++ return MMAL_ENCODING_I420;
++}
+
+- sys->opaque = vd->fmt.i_chroma == VLC_CODEC_MMAL_OPAQUE;
++static void display_set_format(const vout_display_t * const vd, MMAL_ES_FORMAT_T *const es_fmt, const bool is_intermediate)
++{
++ const unsigned int w = is_intermediate ? vd->fmt.i_visible_width : vd->fmt.i_width ;
++ const unsigned int h = is_intermediate ? vd->fmt.i_visible_height : vd->fmt.i_height;
++ MMAL_VIDEO_FORMAT_T * const v_fmt = &es_fmt->es->video;
+
+- status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
+- MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ es_fmt->type = MMAL_ES_TYPE_VIDEO;
++ es_fmt->encoding = is_intermediate ? MMAL_ENCODING_I420 : vout_vlc_to_mmal_pic_fourcc(vd->fmt.i_chroma);
++ es_fmt->encoding_variant = 0;
++
++ v_fmt->width = (w + 31) & ~31;
++ v_fmt->height = (h + 15) & ~15;
++ v_fmt->crop.x = 0;
++ v_fmt->crop.y = 0;
++ v_fmt->crop.width = w;
++ v_fmt->crop.height = h;
++ if (vd->fmt.i_sar_num == 0 || vd->fmt.i_sar_den == 0) {
++ v_fmt->par.num = 1;
++ v_fmt->par.den = 1;
++ } else {
++ v_fmt->par.num = vd->fmt.i_sar_num;
++ v_fmt->par.den = vd->fmt.i_sar_den;
+ }
++ v_fmt->frame_rate.num = vd->fmt.i_frame_rate;
++ v_fmt->frame_rate.den = vd->fmt.i_frame_rate_base;
++ v_fmt->color_space = vlc_to_mmal_color_space(vd->fmt.space);
+
+- sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
+- status = mmal_port_enable(sys->component->control, control_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to enable control port %s (status=%"PRIx32" %s)",
+- sys->component->control->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ msg_Dbg(vd, "WxH: %dx%d, Crop: %dx%d", v_fmt->width, v_fmt->height, v_fmt->crop.width, v_fmt->crop.height);
++}
++
++static MMAL_RECT_T
++display_src_rect(const vout_display_t * const vd, const video_format_t * const src)
++{
++ const bool wants_isp = want_isp(vd);
++
++ // Scale source derived cropping to actual picture shape
++ return (MMAL_RECT_T){
++ .x = wants_isp ? 0 : src->i_x_offset * vd->fmt.i_width / src->i_width,
++ .y = wants_isp ? 0 : src->i_y_offset * vd->fmt.i_height / src->i_height,
++ .width = src->i_visible_width * vd->fmt.i_width / src->i_width,
++ .height = src->i_visible_height * vd->fmt.i_height / src->i_height
++ };
++}
++
++static void isp_input_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
++{
++#if TRACE_ALL
++ vout_display_t * const vd = (vout_display_t *)port->userdata;
++ pic_ctx_mmal_t * ctx = buf->user_data;
++ msg_Dbg(vd, "<<< %s: cmd=%d, ctx=%p, buf=%p, flags=%#x, pts=%lld", __func__, buf->cmd, ctx, buf,
++ buf->flags, (long long)buf->pts);
++#else
++ VLC_UNUSED(port);
++#endif
++
++ mmal_buffer_header_release(buf);
++
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s", __func__);
++#endif
++}
++
++static void isp_control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{
++ vout_display_t *vd = (vout_display_t *)port->userdata;
++ MMAL_STATUS_T status;
++
++ if (buffer->cmd == MMAL_EVENT_ERROR) {
++ status = *(uint32_t *)buffer->data;
++ msg_Err(vd, "MMAL error %"PRIx32" \"%s\"", status, mmal_status_to_string(status));
+ }
+
+- sys->input = sys->component->input[0];
+- sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
++ mmal_buffer_header_release(buffer);
++}
+
+- if (sys->opaque) {
+- sys->input->format->encoding = MMAL_ENCODING_OPAQUE;
+- sys->i_planes = 1;
+- sys->buffer_size = sys->input->buffer_size_recommended;
+- } else {
+- sys->input->format->encoding = MMAL_ENCODING_I420;
+- vd->fmt.i_chroma = VLC_CODEC_I420;
+- buffer_pitch = align(vd->fmt.i_width, 32);
+- buffer_height = align(vd->fmt.i_height, 16);
+- sys->i_planes = 3;
+- sys->buffer_size = 3 * buffer_pitch * buffer_height / 2;
+- }
+-
+- sys->input->format->es->video.width = vd->fmt.i_width;
+- sys->input->format->es->video.height = vd->fmt.i_height;
+- sys->input->format->es->video.crop.x = 0;
+- sys->input->format->es->video.crop.y = 0;
+- sys->input->format->es->video.crop.width = vd->fmt.i_width;
+- sys->input->format->es->video.crop.height = vd->fmt.i_height;
+- sys->input->format->es->video.par.num = vd->source.i_sar_num;
+- sys->input->format->es->video.par.den = vd->source.i_sar_den;
++static void isp_output_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
++{
++ if (buf->cmd == 0 && buf->length != 0)
++ {
++ // The filter structure etc. should always exist if we have contents
++ // but might not on later flushes as we shut down
++ vout_display_t * const vd = (vout_display_t *)port->userdata;
++ struct vout_isp_conf_s *const isp = &vd->sys->isp;
+
+- status = mmal_port_format_commit(sys->input);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s: cmd=%d; flags=%#x, pts=%lld", __func__, buf->cmd, buf->flags, (long long) buf->pts);
++#endif
++ mmal_queue_put(isp->out_q, buf);
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s: out Q len=%d", __func__, mmal_queue_length(isp->out_q));
++#endif
+ }
+- sys->input->buffer_size = sys->input->buffer_size_recommended;
++ else
++ {
++ mmal_buffer_header_reset(buf);
++ mmal_buffer_header_release(buf);
++ }
++}
+
+- vout_display_PlacePicture(&place, &vd->source, vd->cfg, false);
+- display_region.hdr.id = MMAL_PARAMETER_DISPLAYREGION;
+- display_region.hdr.size = sizeof(MMAL_DISPLAYREGION_T);
+- display_region.fullscreen = MMAL_FALSE;
+- display_region.src_rect.x = vd->fmt.i_x_offset;
+- display_region.src_rect.y = vd->fmt.i_y_offset;
+- display_region.src_rect.width = vd->fmt.i_visible_width;
+- display_region.src_rect.height = vd->fmt.i_visible_height;
+- display_region.dest_rect.x = place.x;
+- display_region.dest_rect.y = place.y;
+- display_region.dest_rect.width = place.width;
+- display_region.dest_rect.height = place.height;
+- display_region.layer = sys->layer;
+- display_region.set = MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_SRC_RECT |
+- MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_LAYER;
+- status = mmal_port_parameter_set(sys->input, &display_region.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to set display region (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++static void isp_empty_out_q(struct vout_isp_conf_s * const isp)
++{
++ // Error recovery may call this before the queue exists: nothing to drain then
++ if (isp->out_q != NULL) {
++ MMAL_BUFFER_HEADER_T * queued = mmal_queue_get(isp->out_q);
++ // Release every buffer still sitting in the output queue
++ for (; queued != NULL; queued = mmal_queue_get(isp->out_q))
++ mmal_buffer_header_release(queued);
++ }
++}
++
++static void isp_flush(struct vout_isp_conf_s * const isp) // disable both ISP ports and discard queued output
++{
++ if (isp->input->is_enabled) // only an enabled port can be disabled (this test was inverted)
++ mmal_port_disable(isp->input);
++
++ if (isp->output->is_enabled)
++ mmal_port_disable(isp->output);
++
++ isp_empty_out_q(isp);
++ isp->pending = false; // the output queue was just emptied, so nothing is pending
++}
++
++static MMAL_STATUS_T isp_prepare(vout_display_t * const vd, struct vout_isp_conf_s * const isp) // enable ISP ports and stuff the output port; returns first MMAL error
++{
++ MMAL_STATUS_T err;
++ MMAL_BUFFER_HEADER_T * buf;
++
++ if (!isp->output->is_enabled) {
++ if ((err = mmal_port_enable(isp->output, isp_output_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "ISP output port enable failed");
++ return err;
++ }
+ }
+
+- for (i = 0; i < sys->i_planes; ++i) {
+- sys->planes[i].i_lines = buffer_height;
+- sys->planes[i].i_pitch = buffer_pitch;
+- sys->planes[i].i_visible_lines = vd->fmt.i_visible_height;
+- sys->planes[i].i_visible_pitch = vd->fmt.i_visible_width;
++ while ((buf = mmal_queue_get(isp->out_pool->queue)) != NULL) { // hand every free pool buffer to the output port
++ if ((err = mmal_port_send_buffer(isp->output, buf)) != MMAL_SUCCESS) {
++ msg_Err(vd, "ISP output port stuff failed");
++ mmal_buffer_header_release(buf); // port did not take it: release it instead of losing it
++ return err;
++ }
++ }
+
+- if (i > 0) {
+- sys->planes[i].i_lines /= 2;
+- sys->planes[i].i_pitch /= 2;
+- sys->planes[i].i_visible_lines /= 2;
+- sys->planes[i].i_visible_pitch /= 2;
++ if (!isp->input->is_enabled) {
++ if ((err = mmal_port_enable(isp->input, isp_input_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "ISP input port enable failed");
++ return err;
+ }
+ }
++ return MMAL_SUCCESS;
++}
+
+- vlc_mutex_init(&sys->buffer_mutex);
+- vlc_cond_init(&sys->buffer_cond);
+- vlc_mutex_init(&sys->manage_mutex);
++static void isp_close(vout_display_t * const vd, vout_display_sys_t * const vd_sys)
++{
++ struct vout_isp_conf_s * const isp = &vd_sys->isp;
++ VLC_UNUSED(vd);
+
+- vd->pool = vd_pool;
+- vd->prepare = vd_prepare;
+- vd->display = vd_display;
+- vd->control = vd_control;
+- vd->manage = vd_manage;
++ if (isp->component == NULL)
++ return;
+
+- vc_tv_register_callback(tvservice_cb, vd);
++ isp_flush(isp);
+
+- if (query_resolution(vd, &sys->display_width, &sys->display_height) >= 0) {
+- vout_display_SendEventDisplaySize(vd, sys->display_width, sys->display_height);
+- } else {
+- sys->display_width = vd->cfg->display.width;
+- sys->display_height = vd->cfg->display.height;
++ if (isp->component->control->is_enabled)
++ mmal_port_disable(isp->component->control);
++
++ if (isp->out_q != NULL) {
++ // 1st junk anything lying around
++ isp_empty_out_q(isp);
++
++ mmal_queue_destroy(isp->out_q);
++ isp->out_q = NULL;
+ }
+
+- sys->dmx_handle = vc_dispmanx_display_open(0);
+- vd->info.subpicture_chromas = subpicture_chromas;
++ if (isp->out_pool != NULL) {
++ mmal_port_pool_destroy(isp->output, isp->out_pool);
++ isp->out_pool = NULL;
++ }
+
+- vout_display_DeleteWindow(vd, NULL);
++ isp->input = NULL;
++ isp->output = NULL;
+
+-out:
+- if (ret != VLC_SUCCESS)
+- Close(object);
++ mmal_component_release(isp->component);
++ isp->component = NULL;
+
+- return ret;
++ return;
+ }
+
+-static void Close(vlc_object_t *object)
++// Restuff into output rather than return to pool if we can
++static MMAL_BOOL_T isp_out_pool_cb(MMAL_POOL_T *pool, MMAL_BUFFER_HEADER_T *buffer, void *userdata)
+ {
+- vout_display_t *vd = (vout_display_t *)object;
+- vout_display_sys_t *sys = vd->sys;
+- char response[20]; /* answer is hvs_update_fields=%1d */
+- unsigned i;
++ struct vout_isp_conf_s * const isp = userdata;
++ VLC_UNUSED(pool);
++ if (isp->output->is_enabled) {
++ mmal_buffer_header_reset(buffer);
++ if (mmal_port_send_buffer(isp->output, buffer) == MMAL_SUCCESS)
++ return MMAL_FALSE;
++ }
++ return MMAL_TRUE;
++}
+
+- vc_tv_unregister_callback_full(tvservice_cb, vd);
++static MMAL_STATUS_T isp_setup(vout_display_t * const vd, vout_display_sys_t * const vd_sys) // create + configure the ISP resizer; on any failure calls isp_close() and returns the error
++{
++ struct vout_isp_conf_s * const isp = &vd_sys->isp;
++ MMAL_STATUS_T err;
+
+- if (sys->dmx_handle)
+- close_dmx(vd);
++ if ((err = mmal_component_create(MMAL_COMPONENT_ISP_RESIZER, &isp->component)) != MMAL_SUCCESS) {
++ msg_Err(vd, "Cannot create ISP component");
++ return err; // nothing has been set up yet, so no isp_close() needed
++ }
++ isp->input = isp->component->input[0];
++ isp->output = isp->component->output[0];
+
+- if (sys->component && sys->component->control->is_enabled)
+- mmal_port_disable(sys->component->control);
++ isp->component->control->userdata = (void *)vd; // isp_control_port_cb reads vd back from userdata
++ if ((err = mmal_port_enable(isp->component->control, isp_control_port_cb)) != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to enable ISP control port");
++ goto fail;
++ }
+
+- if (sys->input && sys->input->is_enabled)
+- mmal_port_disable(sys->input);
++ isp->input->userdata = (void *)vd;
++ display_set_format(vd, isp->input->format, false);
+
+- if (sys->component && sys->component->is_enabled)
+- mmal_component_disable(sys->component);
++ if ((err = port_parameter_set_bool(isp->input, MMAL_PARAMETER_ZERO_COPY, true)) != MMAL_SUCCESS)
++ goto fail;
+
+- if (sys->pool)
+- mmal_port_pool_destroy(sys->input, sys->pool);
++ if ((err = mmal_port_format_commit(isp->input)) != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to set ISP input format");
++ goto fail;
++ }
+
+- if (sys->component)
+- mmal_component_release(sys->component);
++ isp->input->buffer_size = isp->input->buffer_size_recommended;
++ isp->input->buffer_num = 30;
+
+- if (sys->picture_pool)
+- picture_pool_Release(sys->picture_pool);
+- else
+- for (i = 0; i < sys->num_buffers; ++i)
+- if (sys->pictures[i]) {
+- mmal_buffer_header_release(sys->pictures[i]->p_sys->buffer);
+- picture_Release(sys->pictures[i]);
+- }
++ if ((isp->in_pool = mmal_pool_create(isp->input->buffer_num, 0)) == NULL) {
++ msg_Err(vd, "Failed to create input pool");
++ err = MMAL_ENOMEM; // err was MMAL_SUCCESS here; without this the failure was reported as success
++ goto fail;
++ }
+
+- vlc_mutex_destroy(&sys->buffer_mutex);
+- vlc_cond_destroy(&sys->buffer_cond);
+- vlc_mutex_destroy(&sys->manage_mutex);
++ if ((isp->out_q = mmal_queue_create()) == NULL)
++ {
++ err = MMAL_ENOMEM;
++ goto fail;
++ }
+
+- if (sys->native_interlaced) {
+- if (vc_gencmd(response, sizeof(response), "hvs_update_fields 0") < 0 ||
+- response[18] != '0')
+- msg_Warn(vd, "Could not reset hvs field mode");
++ display_set_format(vd, isp->output->format, true);
++
++ if ((err = port_parameter_set_bool(isp->output, MMAL_PARAMETER_ZERO_COPY, true)) != MMAL_SUCCESS)
++ goto fail;
++
++ if ((err = mmal_port_format_commit(isp->output)) != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to set ISP output format");
++ goto fail;
+ }
+
+- free(sys->pictures);
+- free(sys);
++ isp->output->buffer_size = isp->output->buffer_size_recommended;
++ isp->output->buffer_num = 2;
++ isp->output->userdata = (void *)vd; // isp_output_cb reads vd back from userdata
++
++ if ((isp->out_pool = mmal_port_pool_create(isp->output, isp->output->buffer_num, isp->output->buffer_size)) == NULL) {
++ msg_Err(vd, "Failed to make ISP port pool");
++ err = MMAL_ENOMEM; // as above: make sure the failure is actually reported
++ goto fail;
++ }
++
++ mmal_pool_callback_set(isp->out_pool, isp_out_pool_cb, isp);
++
++ if ((err = isp_prepare(vd, isp)) != MMAL_SUCCESS)
++ goto fail;
++
++ return MMAL_SUCCESS;
+
+- bcm_host_deinit();
++fail:
++ isp_close(vd, vd_sys); // NOTE(review): isp_close() does not appear to destroy in_pool - confirm it is freed elsewhere
++ return err;
+ }
+
+-static inline uint32_t align(uint32_t x, uint32_t y) {
+- uint32_t mod = x % y;
+- if (mod == 0)
+- return x;
++static MMAL_STATUS_T isp_check(vout_display_t * const vd, vout_display_sys_t * const vd_sys)
++{
++ struct vout_isp_conf_s *const isp = &vd_sys->isp;
++ const bool has_isp = (isp->component != NULL);
++ const bool wants_isp = want_isp(vd);
++
++ if (has_isp == wants_isp)
++ {
++ // All OK - do nothing
++ }
++ else if (has_isp)
++ {
++ // ISP active but we don't want it
++ isp_flush(isp);
++
++ // Check we have everything back and then kill it
++ if (mmal_queue_length(isp->out_pool->queue) == isp->output->buffer_num)
++ isp_close(vd, vd_sys);
++ }
+ else
+- return x + y - mod;
++ {
++ // ISP closed but we want it
++ return isp_setup(vd, vd_sys);
++ }
++
++ return MMAL_SUCCESS;
++}
++
++/* TV service */
++static void tvservice_cb(void *callback_data, uint32_t reason, uint32_t param1,
++ uint32_t param2);
++static void adjust_refresh_rate(vout_display_t *vd, const video_format_t *fmt);
++static int set_latency_target(vout_display_t *vd, bool enable);
++
++// Mmal
++static void maintain_phase_sync(vout_display_t *vd);
++
++
++
++static void vd_input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf) // callback given to mmal_port_enable() for the display input port
++{
++#if TRACE_ALL
++ vout_display_t * const vd = (vout_display_t *)port->userdata;
++ pic_ctx_mmal_t * ctx = buf->user_data;
++ msg_Dbg(vd, "<<< %s: cmd=%d, ctx=%p, buf=%p, flags=%#x, pts=%lld", __func__, buf->cmd, ctx, buf,
++ buf->flags, (long long)buf->pts);
++#else
++ VLC_UNUSED(port);
++#endif
++
++ mmal_buffer_header_release(buf); // the only non-trace work: drop this reference to the returned buffer
++
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s", __func__);
++#endif
++}
++
++static int query_resolution(vout_display_t *vd, const int display_id, unsigned *width, unsigned *height) // returns 0 with *width/*height set, or -1 with both left untouched
++{
++ TV_DISPLAY_STATE_T display_state = {0};
++ int ret = 0;
++
++ if (vc_tv_get_display_state_id(display_id, &display_state) == 0) {
++ msg_Dbg(vd, "State=%#x", display_state.state);
++ if (display_state.state & 0xFF) { // any bit in the low state byte: take the size from the hdmi member
++ msg_Dbg(vd, "HDMI: %dx%d", display_state.display.hdmi.width, display_state.display.hdmi.height);
++ *width = display_state.display.hdmi.width;
++ *height = display_state.display.hdmi.height;
++ } else if (display_state.state & 0xFF00) { // any bit in the second state byte: take the size from the sdtv member
++ msg_Dbg(vd, "SDTV: %dx%d", display_state.display.sdtv.width, display_state.display.sdtv.height);
++ *width = display_state.display.sdtv.width;
++ *height = display_state.display.sdtv.height;
++ } else { // neither byte set: no size available
++ msg_Warn(vd, "Invalid display state %"PRIx32, display_state.state);
++ ret = -1;
++ }
++ } else { // the state query itself failed
++ msg_Warn(vd, "Failed to query display resolution");
++ ret = -1;
++ }
++
++ return ret;
++}
++
++static inline MMAL_RECT_T
++place_to_mmal_rect(const vout_display_place_t place)
++{
++ MMAL_RECT_T rect = {0}; // field-by-field copy of a VLC placement into an MMAL rectangle
++ rect.x = place.x;
++ rect.y = place.y;
++ rect.width = place.width;
++ rect.height = place.height;
++ return rect;
++}
++
++static MMAL_RECT_T
++place_out(const vout_display_cfg_t * cfg,
++ const video_format_t * fmt,
++ const MMAL_RECT_T r)
++{
++ video_format_t tfmt;
++ vout_display_cfg_t tcfg;
++ vout_display_place_t place;
++
++ // Fix SAR if unknown
++ if (fmt->i_sar_den == 0 || fmt->i_sar_num == 0) {
++ tfmt = *fmt;
++ tfmt.i_sar_den = 1;
++ tfmt.i_sar_num = 1;
++ fmt = &tfmt;
++ }
++
++ // Override what VLC thinks might be going on with display size
++ // if we know better
++ if (r.width != 0 && r.height != 0)
++ {
++ tcfg = *cfg;
++ tcfg.display.width = r.width;
++ tcfg.display.height = r.height;
++ cfg = &tcfg;
++ }
++
++ vout_display_PlacePicture(&place, fmt, cfg, false);
++
++ place.x += r.x;
++ place.y += r.y;
++
++ return place_to_mmal_rect(place);
++}
++
++static MMAL_RECT_T
++rect_transform(MMAL_RECT_T s, const MMAL_RECT_T c, const MMAL_DISPLAYTRANSFORM_T t)
++{
++ // Apply the components of t to s in a fixed order: transpose first,
++ // then horizontal flip, then vertical flip; c is passed to both flips
++ const MMAL_RECT_T transposed = is_transform_transpose(t) ? rect_transpose(s) : s;
++ const MMAL_RECT_T hflipped =
++ is_transform_hflip(t) ? rect_hflip(transposed, c) : transposed;
++
++ return is_transform_vflip(t) != 0 ? rect_vflip(hflipped, c) : hflipped;
++}
++
++static void
++place_dest_rect(vout_display_t * const vd,
++ const vout_display_cfg_t * const cfg,
++ const video_format_t * fmt)
++{
++ vout_display_sys_t * const sys = vd->sys;
++ sys->dest_rect = rect_transform(place_out(cfg, fmt, sys->win_rect),
++ sys->display_rect, sys->dest_transform);
++}
++
++static void
++place_spu_rect(vout_display_t * const vd,
++ const vout_display_cfg_t * const cfg,
++ const video_format_t * fmt)
++{
++ vout_display_sys_t * const sys = vd->sys;
++ static const MMAL_RECT_T r0 = {0};
++
++ sys->spu_rect = place_out(cfg, fmt, r0);
++ sys->spu_rect.x = 0;
++ sys->spu_rect.y = 0;
++
++ // Copy place override logic for spu pos from video_output.c
++ // This info doesn't appear to reside anywhere natively
++
++ if (fmt->i_width * fmt->i_height >= (unsigned int)(sys->spu_rect.width * sys->spu_rect.height)) {
++ sys->spu_rect.width = fmt->i_visible_width;
++ sys->spu_rect.height = fmt->i_visible_height;
++ }
++
++ if (ORIENT_IS_SWAP(fmt->orientation))
++ sys->spu_rect = rect_transpose(sys->spu_rect);
++}
++
++static void
++place_rects(vout_display_t * const vd,
++ const vout_display_cfg_t * const cfg,
++ const video_format_t * fmt)
++{
++ place_dest_rect(vd, cfg, fmt);
++ place_spu_rect(vd, cfg, fmt);
++}
++
++static int
++set_input_region(vout_display_t * const vd, const video_format_t * const fmt)
++{
++ const vout_display_sys_t * const sys = vd->sys;
++ MMAL_DISPLAYREGION_T display_region = {
++ .hdr = {
++ .id = MMAL_PARAMETER_DISPLAYREGION,
++ .size = sizeof(MMAL_DISPLAYREGION_T)
++ },
++ .display_num = sys->display_id,
++ .fullscreen = MMAL_FALSE,
++ .transform = sys->video_transform,
++ .dest_rect = sys->dest_rect,
++ .src_rect = display_src_rect(vd, fmt),
++ .noaspect = MMAL_TRUE,
++ .mode = MMAL_DISPLAY_MODE_FILL,
++ .layer = sys->layer,
++ .alpha = 0xff | (sys->transparent ? 0 : (1 << 29)),
++ .set =
++ MMAL_DISPLAY_SET_NUM |
++ MMAL_DISPLAY_SET_FULLSCREEN |
++ MMAL_DISPLAY_SET_TRANSFORM |
++ MMAL_DISPLAY_SET_DEST_RECT |
++ MMAL_DISPLAY_SET_SRC_RECT |
++ MMAL_DISPLAY_SET_NOASPECT |
++ MMAL_DISPLAY_SET_MODE |
++ MMAL_DISPLAY_SET_LAYER |
++ MMAL_DISPLAY_SET_ALPHA
++ };
++ MMAL_STATUS_T status = mmal_port_parameter_set(sys->input, &display_region.hdr);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to set display region (status=%"PRIx32" %s)",
++ status, mmal_status_to_string(status));
++ return -EINVAL;
++ }
++ return 0;
+ }
+
+ static int configure_display(vout_display_t *vd, const vout_display_cfg_t *cfg,
+ const video_format_t *fmt)
+ {
+- vout_display_sys_t *sys = vd->sys;
+- vout_display_place_t place;
+- MMAL_DISPLAYREGION_T display_region;
++ vout_display_sys_t * const sys = vd->sys;
+ MMAL_STATUS_T status;
+
+ if (!cfg && !fmt)
++ {
++ msg_Err(vd, "%s: Missing cfg & fmt", __func__);
+ return -EINVAL;
++ }
++
++ isp_check(vd, sys);
+
+ if (fmt) {
+ sys->input->format->es->video.par.num = fmt->i_sar_num;
+@@ -412,30 +733,14 @@ static int configure_display(vout_displa
+ if (!cfg)
+ cfg = vd->cfg;
+
+- vout_display_PlacePicture(&place, fmt, cfg, false);
++ sys->video_transform = combine_transform(
++ vlc_to_mmal_transform(fmt->orientation), sys->display_transform);
+
+- display_region.hdr.id = MMAL_PARAMETER_DISPLAYREGION;
+- display_region.hdr.size = sizeof(MMAL_DISPLAYREGION_T);
+- display_region.fullscreen = MMAL_FALSE;
+- display_region.src_rect.x = fmt->i_x_offset;
+- display_region.src_rect.y = fmt->i_y_offset;
+- display_region.src_rect.width = fmt->i_visible_width;
+- display_region.src_rect.height = fmt->i_visible_height;
+- display_region.dest_rect.x = place.x;
+- display_region.dest_rect.y = place.y;
+- display_region.dest_rect.width = place.width;
+- display_region.dest_rect.height = place.height;
+- display_region.layer = sys->layer;
+- display_region.set = MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_SRC_RECT |
+- MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_LAYER;
+- status = mmal_port_parameter_set(sys->input, &display_region.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to set display region (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
++ place_rects(vd, cfg, fmt);
++
++ if (set_input_region(vd, fmt) != 0)
+ return -EINVAL;
+- }
+
+- show_background(vd, var_InheritBool(vd, MMAL_BLANK_BACKGROUND_NAME));
+ sys->adjust_refresh_rate = var_InheritBool(vd, MMAL_ADJUST_REFRESHRATE_NAME);
+ sys->native_interlaced = var_InheritBool(vd, MMAL_NATIVE_INTERLACED);
+ if (sys->adjust_refresh_rate) {
+@@ -446,204 +751,217 @@ static int configure_display(vout_displa
+ return 0;
+ }
+
++static void kill_pool(vout_display_sys_t * const sys)
++{
++ picture_pool_t * const old_pool = sys->pic_pool;
++ if (old_pool == NULL) // no pool currently held: nothing to do
++ return;
++ picture_pool_Release(old_pool);
++ sys->pic_pool = NULL; // forget the pool we just released
++}
++
++// Actual picture pool for MMAL opaques is just a set of trivial containers
+ static picture_pool_t *vd_pool(vout_display_t *vd, unsigned count)
+ {
+- vout_display_sys_t *sys = vd->sys;
+- picture_resource_t picture_res;
+- picture_pool_configuration_t picture_pool_cfg;
+- video_format_t fmt = vd->fmt;
+- MMAL_STATUS_T status;
+- unsigned i;
++ vout_display_sys_t * const sys = vd->sys;
+
+- if (sys->picture_pool) {
+- if (sys->num_buffers < count)
+- msg_Warn(vd, "Picture pool with %u pictures requested, but we already have one with %u pictures",
+- count, sys->num_buffers);
++ msg_Dbg(vd, "%s: fmt:%dx%d,sar:%d/%d; source:%dx%d", __func__,
++ vd->fmt.i_width, vd->fmt.i_height, vd->fmt.i_sar_num, vd->fmt.i_sar_den, vd->source.i_width, vd->source.i_height);
+
+- goto out;
++ if (sys->pic_pool == NULL) {
++ sys->pic_pool = picture_pool_NewFromFormat(&vd->fmt, count);
+ }
++ return sys->pic_pool;
++}
+
+- if (sys->opaque) {
+- if (count <= NUM_ACTUAL_OPAQUE_BUFFERS)
+- count = NUM_ACTUAL_OPAQUE_BUFFERS;
++static inline bool
++check_shape(vout_display_t * const vd, const picture_t * const p_pic)
++{
++ // The picture must match the display format's full (not visible) dimensions
++ const bool same_width = (vd->fmt.i_width == p_pic->format.i_width);
++ const bool same_height = (vd->fmt.i_height == p_pic->format.i_height);
++ return same_width && same_height;
++}
+
+- MMAL_PARAMETER_BOOLEAN_T zero_copy = {
+- { MMAL_PARAMETER_ZERO_COPY, sizeof(MMAL_PARAMETER_BOOLEAN_T) },
+- 1
+- };
++static void vd_display(vout_display_t *vd, picture_t *p_pic,
++ subpicture_t *subpicture)
++{
++ vout_display_sys_t * const sys = vd->sys;
++ MMAL_STATUS_T err;
+
+- status = mmal_port_parameter_set(sys->input, &zero_copy.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- goto out;
+- }
++#if TRACE_ALL
++ {
++ char dbuf0[5];
++ msg_Dbg(vd, "<<< %s: %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d -> %dx%d@%d,%d", __func__,
++ str_fourcc(dbuf0, p_pic->format.i_chroma), p_pic->format.i_width, p_pic->format.i_height,
++ p_pic->format.i_x_offset, p_pic->format.i_y_offset,
++ p_pic->format.i_visible_width, p_pic->format.i_visible_height,
++ p_pic->format.i_sar_num, p_pic->format.i_sar_den,
++ sys->dest_rect.width, sys->dest_rect.height, sys->dest_rect.x, sys->dest_rect.y);
+ }
+-
+- if (count < sys->input->buffer_num_recommended)
+- count = sys->input->buffer_num_recommended;
+-
+-#ifndef NDEBUG
+- msg_Dbg(vd, "Creating picture pool with %u pictures", count);
+ #endif
+
+- sys->input->buffer_num = count;
+- status = mmal_port_enable(sys->input, input_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to enable input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- goto out;
++ // If we had subpics then we have attached them to the main pic in prepare
++ // so all we have to do here is delete the refs
++ if (subpicture != NULL) {
++ subpicture_Delete(subpicture);
+ }
+
+- status = mmal_component_enable(sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to enable component %s (status=%"PRIx32" %s)",
+- sys->component->name, status, mmal_status_to_string(status));
+- goto out;
++ if (!check_shape(vd, p_pic))
++ {
++ msg_Err(vd, "Pic/fmt shape mismatch");
++ goto fail;
++ }
++
++ if (!sys->input->is_enabled &&
++ (err = mmal_port_enable(sys->input, vd_input_port_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "Input port enable failed");
++ goto fail;
++ }
++ // Stuff into input
++ // We assume the BH is already set up with values reflecting pic date etc.
++ if (sys->copy_buf != NULL) {
++ MMAL_BUFFER_HEADER_T *const buf = sys->copy_buf;
++ sys->copy_buf = NULL;
++#if TRACE_ALL
++ msg_Dbg(vd, "--- %s: Copy stuff", __func__);
++#endif
++ if (mmal_port_send_buffer(sys->input, buf) != MMAL_SUCCESS)
++ {
++ mmal_buffer_header_release(buf);
++ msg_Err(vd, "Send copy buffer to render input failed");
++ goto fail;
++ }
+ }
+-
+- sys->num_buffers = count;
+- sys->pool = mmal_port_pool_create(sys->input, sys->num_buffers,
+- sys->input->buffer_size);
+- if (!sys->pool) {
+- msg_Err(vd, "Failed to create MMAL pool for %u buffers of size %"PRIu32,
+- count, sys->input->buffer_size);
+- goto out;
++ else if (sys->isp.pending) {
++ MMAL_BUFFER_HEADER_T *const buf = mmal_queue_wait(sys->isp.out_q);
++ sys->isp.pending = false;
++#if TRACE_ALL
++ msg_Dbg(vd, "--- %s: ISP stuff", __func__);
++#endif
++ if (mmal_port_send_buffer(sys->input, buf) != MMAL_SUCCESS)
++ {
++ mmal_buffer_header_release(buf);
++ msg_Err(vd, "Send ISP buffer to render input failed");
++ goto fail;
++ }
+ }
+-
+- memset(&picture_res, 0, sizeof(picture_resource_t));
+- sys->pictures = calloc(sys->num_buffers, sizeof(picture_t *));
+- for (i = 0; i < sys->num_buffers; ++i) {
+- picture_res.p_sys = calloc(1, sizeof(picture_sys_t));
+- picture_res.p_sys->owner = (vlc_object_t *)vd;
+- picture_res.p_sys->buffer = mmal_queue_get(sys->pool->queue);
+-
+- sys->pictures[i] = picture_NewFromResource(&fmt, &picture_res);
+- if (!sys->pictures[i]) {
+- msg_Err(vd, "Failed to create picture");
+- free(picture_res.p_sys);
+- goto out;
++ else
++ {
++ MMAL_BUFFER_HEADER_T *const pic_buf = hw_mmal_pic_buf_replicated(p_pic, sys->pool);
++ if (pic_buf == NULL)
++ {
++ msg_Err(vd, "Replicated buffer get fail");
++ goto fail;
+ }
+
+- sys->pictures[i]->i_planes = sys->i_planes;
+- memcpy(sys->pictures[i]->p, sys->planes, sys->i_planes * sizeof(plane_t));
+- }
+
+- memset(&picture_pool_cfg, 0, sizeof(picture_pool_configuration_t));
+- picture_pool_cfg.picture_count = sys->num_buffers;
+- picture_pool_cfg.picture = sys->pictures;
+- picture_pool_cfg.lock = mmal_picture_lock;
++ // If dimensions have chnaged then fix that
++ if (hw_mmal_vlc_pic_to_mmal_fmt_update(sys->input->format, p_pic))
++ {
++ msg_Dbg(vd, "Reset port format");
++
++ // HVS can deal with on-line dimension changes
++ if (mmal_port_format_commit(sys->input) != MMAL_SUCCESS)
++ msg_Warn(vd, "Input format commit failed");
++ }
+
+- sys->picture_pool = picture_pool_NewExtended(&picture_pool_cfg);
+- if (!sys->picture_pool) {
+- msg_Err(vd, "Failed to create picture pool");
+- goto out;
++ if ((err = mmal_port_send_buffer(sys->input, pic_buf)) != MMAL_SUCCESS)
++ {
++ mmal_buffer_header_release(pic_buf);
++ msg_Err(vd, "Send buffer to input failed");
++ goto fail;
++ }
+ }
+
+-out:
+- return sys->picture_pool;
+-}
+-
+-static void vd_prepare(vout_display_t *vd, picture_t *picture,
+- subpicture_t *subpicture)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- picture_sys_t *pic_sys = picture->p_sys;
+-
+- if (!sys->adjust_refresh_rate || pic_sys->displayed)
+- return;
+-
+- /* Apply the required phase_offset to the picture, so that vd_display()
+- * will be called at the corrected time from the core */
+- picture->date += sys->phase_offset;
+-}
+-
+-static void vd_display(vout_display_t *vd, picture_t *picture,
+- subpicture_t *subpicture)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- picture_sys_t *pic_sys = picture->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer = pic_sys->buffer;
+- MMAL_STATUS_T status;
+-
+- if (picture->format.i_frame_rate != sys->i_frame_rate ||
+- picture->format.i_frame_rate_base != sys->i_frame_rate_base ||
+- picture->b_progressive != sys->b_progressive ||
+- picture->b_top_field_first != sys->b_top_field_first) {
+- sys->b_top_field_first = picture->b_top_field_first;
+- sys->b_progressive = picture->b_progressive;
+- sys->i_frame_rate = picture->format.i_frame_rate;
+- sys->i_frame_rate_base = picture->format.i_frame_rate_base;
+- configure_display(vd, NULL, &picture->format);
+- }
+-
+- if (!pic_sys->displayed || !sys->opaque) {
+- buffer->cmd = 0;
+- buffer->length = sys->input->buffer_size;
+- buffer->user_data = picture;
+-
+- status = mmal_port_send_buffer(sys->input, buffer);
+- if (status == MMAL_SUCCESS)
+- atomic_fetch_add(&sys->buffers_in_transit, 1);
+-
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to send buffer to input port. Frame dropped");
+- picture_Release(picture);
++ {
++ unsigned int sub_no = 0;
++ MMAL_BUFFER_HEADER_T **psub_bufs2 = sys->subpic_bufs;
++ const bool is_mmal_pic = hw_mmal_pic_is_mmal(p_pic);
++
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ int rv;
++ MMAL_BUFFER_HEADER_T * const sub_buf = !is_mmal_pic ? NULL :
++ hw_mmal_pic_sub_buf_get(p_pic, sub_no);
++
++ if ((rv = hw_mmal_subpic_update(VLC_OBJECT(vd),
++ sub_buf != NULL ? sub_buf : *psub_bufs2++,
++ &sys->subs[sub_no].sub,
++ &p_pic->format,
++ &sys->dest_rect,
++ sys->display_transform,
++ p_pic->date)) == 0)
++ break;
++ else if (rv < 0)
++ goto fail;
+ }
+-
+- pic_sys->displayed = true;
+- } else {
+- picture_Release(picture);
+ }
+
+- display_subpicture(vd, subpicture);
++fail:
++ for (unsigned int i = 0; i != SUBS_MAX && sys->subpic_bufs[i] != NULL; ++i) {
++ mmal_buffer_header_release(sys->subpic_bufs[i]);
++ sys->subpic_bufs[i] = NULL;
++ }
+
+- if (subpicture)
+- subpicture_Delete(subpicture);
++ picture_Release(p_pic);
+
+ if (sys->next_phase_check == 0 && sys->adjust_refresh_rate)
+ maintain_phase_sync(vd);
+ sys->next_phase_check = (sys->next_phase_check + 1) % PHASE_CHECK_INTERVAL;
+-
+- if (sys->opaque) {
+- vlc_mutex_lock(&sys->buffer_mutex);
+- while (atomic_load(&sys->buffers_in_transit) >= MAX_BUFFERS_IN_TRANSIT)
+- vlc_cond_wait(&sys->buffer_cond, &sys->buffer_mutex);
+- vlc_mutex_unlock(&sys->buffer_mutex);
+- }
+ }
+
+ static int vd_control(vout_display_t *vd, int query, va_list args)
+ {
+- vout_display_sys_t *sys = vd->sys;
+- vout_display_cfg_t cfg;
+- const vout_display_cfg_t *tmp_cfg;
++ vout_display_sys_t * const sys = vd->sys;
+ int ret = VLC_EGENERIC;
++ VLC_UNUSED(args);
+
+ switch (query) {
+- case VOUT_DISPLAY_CHANGE_DISPLAY_SIZE:
+- tmp_cfg = va_arg(args, const vout_display_cfg_t *);
+- if (tmp_cfg->display.width == sys->display_width &&
+- tmp_cfg->display.height == sys->display_height) {
+- cfg = *vd->cfg;
+- cfg.display.width = sys->display_width;
+- cfg.display.height = sys->display_height;
+- if (configure_display(vd, &cfg, NULL) >= 0)
+- ret = VLC_SUCCESS;
+- }
+- break;
+-
+ case VOUT_DISPLAY_CHANGE_SOURCE_ASPECT:
+ case VOUT_DISPLAY_CHANGE_SOURCE_CROP:
+- if (configure_display(vd, NULL, &vd->source) >= 0)
++ if (configure_display(vd, vd->cfg, &vd->source) >= 0)
+ ret = VLC_SUCCESS;
+ break;
+
+- case VOUT_DISPLAY_RESET_PICTURES:
+- vlc_assert_unreachable();
+ case VOUT_DISPLAY_CHANGE_ZOOM:
+- msg_Warn(vd, "Unsupported control query %d", query);
++ case VOUT_DISPLAY_CHANGE_DISPLAY_SIZE:
++ case VOUT_DISPLAY_CHANGE_DISPLAY_FILLED:
++ {
++ const vout_display_cfg_t * const cfg = va_arg(args, const vout_display_cfg_t *);
++
++ if (configure_display(vd, cfg, &vd->source) >= 0)
++ ret = VLC_SUCCESS;
++ break;
++ }
++
++ case VOUT_DISPLAY_RESET_PICTURES:
++ msg_Warn(vd, "Reset Pictures");
++ kill_pool(sys);
++ vd->fmt = vd->source; // Take (nearly) whatever source wants to give us
++ vd->fmt.i_chroma = req_chroma(vd); // Adjust chroma to something we can actaully deal with
++ ret = VLC_SUCCESS;
++ break;
++
++ case VOUT_DISPLAY_CHANGE_MMAL_HIDE:
++ {
++ MMAL_STATUS_T err;
++ unsigned int i;
++
++ msg_Dbg(vd, "Hide display");
++
++ for (i = 0; i != SUBS_MAX; ++i)
++ hw_mmal_subpic_flush(VLC_OBJECT(vd), &sys->subs[i].sub);
++
++ if (sys->input->is_enabled &&
++ (err = mmal_port_disable(sys->input)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "Unable to disable port: err=%d", err);
++ break;
++ }
++ sys->force_config = true;
++ ret = VLC_SUCCESS;
+ break;
++ }
+
+ default:
+ msg_Warn(vd, "Unknown control query %d", query);
+@@ -653,79 +971,207 @@ static int vd_control(vout_display_t *vd
+ return ret;
+ }
+
++static void set_display_windows(vout_display_t *const vd, vout_display_sys_t *const sys)
++{
++ // Start from VLC's configured size; query_resolution() overwrites it only on success
++ unsigned int width = vd->cfg->display.width;
++ unsigned int height = vd->cfg->display.height;
++ (void)query_resolution(vd, sys->display_id, &width, &height);
++ sys->display_rect = (MMAL_RECT_T){0, 0, width, height};
++ if (sys->req_win.width != 0)
++ sys->win_rect = sys->req_win; // an explicitly requested window takes priority
++ else if (is_transform_transpose(sys->display_transform))
++ sys->win_rect = rect_transpose(sys->display_rect);
++ else
++ sys->win_rect = sys->display_rect;
++}
++
+ static void vd_manage(vout_display_t *vd)
+ {
+- vout_display_sys_t *sys = vd->sys;
+- unsigned width, height;
++ vout_display_sys_t *const sys = vd->sys;
+
+ vlc_mutex_lock(&sys->manage_mutex);
+
+ if (sys->need_configure_display) {
+- close_dmx(vd);
+- sys->dmx_handle = vc_dispmanx_display_open(0);
+-
+- if (query_resolution(vd, &width, &height) >= 0) {
+- sys->display_width = width;
+- sys->display_height = height;
+- vout_display_SendEventDisplaySize(vd, width, height);
+- }
+-
+ sys->need_configure_display = false;
++ set_display_windows(vd, sys);
+ }
+
+ vlc_mutex_unlock(&sys->manage_mutex);
+ }
+
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++
++static int attach_subpics(vout_display_t * const vd, vout_display_sys_t * const sys,
++ subpicture_t * const subpicture) /* Imports up to SUBS_MAX subpicture regions into sys->subpic_bufs[]; returns VLC_SUCCESS or VLC_ENOMEM */
+ {
+- vout_display_t *vd = (vout_display_t *)port->userdata;
+- MMAL_STATUS_T status;
++ unsigned int n = 0; /* next free slot in sys->subpic_bufs[] */
+
+- if (buffer->cmd == MMAL_EVENT_ERROR) {
+- status = *(uint32_t *)buffer->data;
+- msg_Err(vd, "MMAL error %"PRIx32" \"%s\"", status, mmal_status_to_string(status));
++ if (sys->vzc == NULL) { /* create the VZC pool lazily on first use */
++ if ((sys->vzc = hw_mmal_vzc_pool_new()) == NULL)
++ {
++ msg_Err(vd, "Failed to allocate VZC");
++ return VLC_ENOMEM;
++ }
+ }
+
+- mmal_buffer_header_release(buffer);
++ // Attempt to import the subpics
++ for (subpicture_t * spic = subpicture; spic != NULL; spic = spic->p_next)
++ {
++ for (subpicture_region_t *sreg = spic->p_region; sreg != NULL; sreg = sreg->p_next) {
++ picture_t *const src = sreg->p_picture;
++
++#if TRACE_ALL
++ char dbuf0[5];
++ msg_Dbg(vd, " [%p:%p] Pos=%d,%d max=%dx%d, src=%dx%d/%dx%d o:%d, spu=%d,%d:%dx%d, vd->fmt=%dx%d/%dx%d, vd->source=%dx%d/%dx%d, cfg=%dx%d, zoom=%d/%d, Alpha=%d, Fmt=%s", src, src->p[0].p_pixels,
++ sreg->i_x, sreg->i_y,
++ sreg->i_max_width, sreg->i_max_height,
++ src->format.i_visible_width, src->format.i_visible_height,
++ src->format.i_width, src->format.i_height,
++ src->format.orientation,
++ sys->spu_rect.x, sys->spu_rect.y, sys->spu_rect.width, sys->spu_rect.height,
++ vd->fmt.i_visible_width, vd->fmt.i_visible_height,
++ vd->fmt.i_width, vd->fmt.i_height,
++ vd->source.i_visible_width, vd->source.i_visible_height,
++ vd->source.i_width, vd->source.i_height,
++ vd->cfg->display.width, vd->cfg->display.height,
++ vd->cfg->zoom.num, vd->cfg->zoom.den,
++ sreg->i_alpha,
++ str_fourcc(dbuf0, src->format.i_chroma));
++#endif
++
++ // At this point I think the subtitles are being placed in the
++ // coord space of the placed rectangle in the cfg display space
++ if ((sys->subpic_bufs[n] = hw_mmal_vzc_buf_from_pic(sys->vzc,
++ src,
++ (MMAL_RECT_T){.width = sys->spu_rect.width, .height=sys->spu_rect.height},
++ sreg->i_x, sreg->i_y,
++ sreg->i_alpha,
++ n == 0)) == NULL) /* last argument is true only for the first region - NOTE(review): meaning is defined by hw_mmal_vzc_buf_from_pic, confirm there */
++ {
++ msg_Err(vd, "Failed to allocate vzc buffer for subpic");
++ return VLC_ENOMEM; /* buffers already stored in subpic_bufs[0..n-1] are left in place */
++ }
++
++ if (++n == SUBS_MAX) /* table full: any further regions are ignored */
++ return VLC_SUCCESS;
++ }
++ }
++ return VLC_SUCCESS;
+ }
+
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++
++static void vd_prepare(vout_display_t *vd, picture_t *p_pic, /* Per-picture preparation: reconfigure if needed, import subpics, stage copy / ISP buffers */
++#if VLC_VER_3
++ subpicture_t *subpicture
++#else
++ subpicture_t *subpicture, vlc_tick_t date
++#endif
++ )
+ {
+- vout_display_t *vd = (vout_display_t *)port->userdata;
++ MMAL_STATUS_T err;
++ vout_display_sys_t * const sys = vd->sys;
++
++ vd_manage(vd); /* apply any pending display reconfiguration first */
++
++ if (!check_shape(vd, p_pic))
++ return;
++
++ if (sys->force_config || /* reconfigure when forced or when frame rate / interlace properties differ from the cached ones */
++ p_pic->format.i_frame_rate != sys->i_frame_rate ||
++ p_pic->format.i_frame_rate_base != sys->i_frame_rate_base ||
++ p_pic->b_progressive != sys->b_progressive ||
++ p_pic->b_top_field_first != sys->b_top_field_first)
++ {
++ sys->force_config = false;
++ sys->b_top_field_first = p_pic->b_top_field_first;
++ sys->b_progressive = p_pic->b_progressive;
++ sys->i_frame_rate = p_pic->format.i_frame_rate;
++ sys->i_frame_rate_base = p_pic->format.i_frame_rate_base;
++ configure_display(vd, NULL, &vd->source);
++ }
++
++ // Subpics can either turn up attached to the main pic or in the
++ // subpic list here - if they turn up here then process into temp
++ // buffers
++ if (subpicture != NULL) {
++ attach_subpics(vd, sys, subpicture); /* NOTE(review): return value is ignored, so an import failure is not propagated */
++ }
++
++ // *****
++ if (want_copy(vd)) {
++ if (sys->copy_buf != NULL) { /* a buffer left over from an earlier prepare: log it and drop it */
++ msg_Err(vd, "Copy buf not NULL");
++ mmal_buffer_header_release(sys->copy_buf);
++ sys->copy_buf = NULL;
++ }
++
++ MMAL_BUFFER_HEADER_T * const buf = mmal_queue_wait(sys->copy_pool->queue); /* NOTE(review): used without a NULL check - confirm mmal_queue_wait cannot return NULL here */
++ // Copy 2d
++ hw_mmal_copy_pic_to_buf(buf->data, &buf->length, sys->input->format, p_pic);
++ buf->flags = MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++
++ sys->copy_buf = buf;
++ }
++
++ if (isp_check(vd, sys) != MMAL_SUCCESS) {
++ return;
++ }
++
++ if (want_isp(vd))
++ {
++ struct vout_isp_conf_s * const isp = &sys->isp;
++ MMAL_BUFFER_HEADER_T * buf;
++
++ // This should be empty - make it so if it isn't
++ isp_empty_out_q(isp);
++ isp->pending = false;
++
++ // Stuff output
++ if (isp_prepare(vd, isp) != MMAL_SUCCESS)
++ return;
++
++ if ((buf = hw_mmal_pic_buf_replicated(p_pic, isp->in_pool)) == NULL)
++ {
++ msg_Err(vd, "Pic has no attached buffer");
++ return;
++ }
++
++ if ((err = mmal_port_send_buffer(isp->input, buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "Send buffer to input failed");
++ mmal_buffer_header_release(buf);
++ return;
++ }
++
++ isp->pending = true; /* set only once the buffer has been handed to the ISP input port */
++ }
++
++#if 0 /* compiled out: the phase-offset adjustment below is disabled */
++ VLC_UNUSED(date);
+ vout_display_sys_t *sys = vd->sys;
+- picture_t *picture = (picture_t *)buffer->user_data;
++ picture_sys_t *pic_sys = picture->p_sys;
+
+- if (picture)
+- picture_Release(picture);
++ if (!sys->adjust_refresh_rate || pic_sys->displayed)
++ return;
+
+- vlc_mutex_lock(&sys->buffer_mutex);
+- atomic_fetch_sub(&sys->buffers_in_transit, 1);
+- vlc_cond_signal(&sys->buffer_cond);
+- vlc_mutex_unlock(&sys->buffer_mutex);
++ /* Apply the required phase_offset to the picture, so that vd_display()
++ * will be called at the corrected time from the core */
++ picture->date += sys->phase_offset;
++#endif
+ }
+
+-static int query_resolution(vout_display_t *vd, unsigned *width, unsigned *height)
++
++static void vd_control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) /* Control-port callback: logs MMAL error events, then releases the buffer */
+ {
+- TV_DISPLAY_STATE_T display_state;
+- int ret = 0;
++ vout_display_t *vd = (vout_display_t *)port->userdata; /* userdata is set to the vout_display_t when the port is enabled */
++ MMAL_STATUS_T status;
+
+- if (vc_tv_get_display_state(&display_state) == 0) {
+- if (display_state.state & 0xFF) {
+- *width = display_state.display.hdmi.width;
+- *height = display_state.display.hdmi.height;
+- } else if (display_state.state & 0xFF00) {
+- *width = display_state.display.sdtv.width;
+- *height = display_state.display.sdtv.height;
+- } else {
+- msg_Warn(vd, "Invalid display state %"PRIx32, display_state.state);
+- ret = -1;
+- }
+- } else {
+- msg_Warn(vd, "Failed to query display resolution");
+- ret = -1;
++ if (buffer->cmd == MMAL_EVENT_ERROR) {
++ status = *(uint32_t *)buffer->data; /* the error event payload is read as a 32-bit status code */
++ msg_Err(vd, "MMAL error %"PRIx32" \"%s\"", status, mmal_status_to_string(status));
+ }
+
+- return ret;
++ mmal_buffer_header_release(buffer); /* released for every event, not only errors */
+ }
+
+ static void tvservice_cb(void *callback_data, uint32_t reason, uint32_t param1, uint32_t param2)
+@@ -780,9 +1226,9 @@ static void adjust_refresh_rate(vout_dis
+ double best_score, score;
+ int i;
+
+- vc_tv_get_display_state(&display_state);
++ vc_tv_get_display_state_id(sys->display_id, &display_state);
+ if(display_state.display.hdmi.mode != HDMI_MODE_OFF) {
+- num_modes = vc_tv_hdmi_get_supported_modes_new(display_state.display.hdmi.group,
++ num_modes = vc_tv_hdmi_get_supported_modes_new_id(sys->display_id, display_state.display.hdmi.group,
+ supported_modes, VC_TV_MAX_MODE_IDS, NULL, NULL);
+
+ for (i = 0; i < num_modes; ++i) {
+@@ -810,7 +1256,7 @@ static void adjust_refresh_rate(vout_dis
+ if((best_id >= 0) && (display_state.display.hdmi.mode != supported_modes[best_id].code)) {
+ msg_Info(vd, "Setting HDMI refresh rate to %"PRIu32,
+ supported_modes[best_id].frame_rate);
+- vc_tv_hdmi_power_on_explicit_new(HDMI_MODE_HDMI,
++ vc_tv_hdmi_power_on_explicit_new_id(sys->display_id, HDMI_MODE_HDMI,
+ supported_modes[best_id].group,
+ supported_modes[best_id].code);
+ }
+@@ -828,148 +1274,12 @@ static void adjust_refresh_rate(vout_dis
+ }
+ }
+
+-static void display_subpicture(vout_display_t *vd, subpicture_t *subpicture)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- struct dmx_region_t **dmx_region = &sys->dmx_region;
+- struct dmx_region_t *unused_dmx_region;
+- DISPMANX_UPDATE_HANDLE_T update = 0;
+- picture_t *picture;
+- video_format_t *fmt;
+- struct dmx_region_t *dmx_region_next;
+-
+- if(subpicture) {
+- subpicture_region_t *region = subpicture->p_region;
+- while(region) {
+- picture = region->p_picture;
+- fmt = &region->fmt;
+-
+- if(!*dmx_region) {
+- if(!update)
+- update = vc_dispmanx_update_start(10);
+- *dmx_region = dmx_region_new(vd, update, region);
+- } else if(((*dmx_region)->bmp_rect.width != (int32_t)fmt->i_visible_width) ||
+- ((*dmx_region)->bmp_rect.height != (int32_t)fmt->i_visible_height) ||
+- ((*dmx_region)->pos_x != region->i_x) ||
+- ((*dmx_region)->pos_y != region->i_y) ||
+- ((*dmx_region)->alpha.opacity != (uint32_t)region->i_alpha)) {
+- dmx_region_next = (*dmx_region)->next;
+- if(!update)
+- update = vc_dispmanx_update_start(10);
+- dmx_region_delete(*dmx_region, update);
+- *dmx_region = dmx_region_new(vd, update, region);
+- (*dmx_region)->next = dmx_region_next;
+- } else if((*dmx_region)->picture != picture) {
+- if(!update)
+- update = vc_dispmanx_update_start(10);
+- dmx_region_update(*dmx_region, update, picture);
+- }
+-
+- dmx_region = &(*dmx_region)->next;
+- region = region->p_next;
+- }
+- }
+-
+- /* Remove remaining regions */
+- unused_dmx_region = *dmx_region;
+- while(unused_dmx_region) {
+- dmx_region_next = unused_dmx_region->next;
+- if(!update)
+- update = vc_dispmanx_update_start(10);
+- dmx_region_delete(unused_dmx_region, update);
+- unused_dmx_region = dmx_region_next;
+- }
+- *dmx_region = NULL;
+-
+- if(update)
+- vc_dispmanx_update_submit_sync(update);
+-}
+-
+-static void close_dmx(vout_display_t *vd)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- DISPMANX_UPDATE_HANDLE_T update = vc_dispmanx_update_start(10);
+- struct dmx_region_t *dmx_region = sys->dmx_region;
+- struct dmx_region_t *dmx_region_next;
+-
+- while(dmx_region) {
+- dmx_region_next = dmx_region->next;
+- dmx_region_delete(dmx_region, update);
+- dmx_region = dmx_region_next;
+- }
+-
+- vc_dispmanx_update_submit_sync(update);
+- sys->dmx_region = NULL;
+-
+- show_background(vd, false);
+-
+- vc_dispmanx_display_close(sys->dmx_handle);
+- sys->dmx_handle = DISPMANX_NO_HANDLE;
+-}
+-
+-static struct dmx_region_t *dmx_region_new(vout_display_t *vd,
+- DISPMANX_UPDATE_HANDLE_T update, subpicture_region_t *region)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- video_format_t *fmt = &region->fmt;
+- struct dmx_region_t *dmx_region = malloc(sizeof(struct dmx_region_t));
+- uint32_t image_handle;
+-
+- dmx_region->pos_x = region->i_x;
+- dmx_region->pos_y = region->i_y;
+-
+- vc_dispmanx_rect_set(&dmx_region->bmp_rect, 0, 0, fmt->i_visible_width,
+- fmt->i_visible_height);
+- vc_dispmanx_rect_set(&dmx_region->src_rect, 0, 0, fmt->i_visible_width << 16,
+- fmt->i_visible_height << 16);
+- vc_dispmanx_rect_set(&dmx_region->dst_rect, region->i_x, region->i_y,
+- fmt->i_visible_width, fmt->i_visible_height);
+-
+- dmx_region->resource = vc_dispmanx_resource_create(VC_IMAGE_RGBA32,
+- dmx_region->bmp_rect.width | (region->p_picture->p[0].i_pitch << 16),
+- dmx_region->bmp_rect.height | (dmx_region->bmp_rect.height << 16),
+- &image_handle);
+- vc_dispmanx_resource_write_data(dmx_region->resource, VC_IMAGE_RGBA32,
+- region->p_picture->p[0].i_pitch,
+- region->p_picture->p[0].p_pixels, &dmx_region->bmp_rect);
+-
+- dmx_region->alpha.flags = DISPMANX_FLAGS_ALPHA_FROM_SOURCE | DISPMANX_FLAGS_ALPHA_MIX;
+- dmx_region->alpha.opacity = region->i_alpha;
+- dmx_region->alpha.mask = DISPMANX_NO_HANDLE;
+- dmx_region->element = vc_dispmanx_element_add(update, sys->dmx_handle,
+- sys->layer + 1, &dmx_region->dst_rect, dmx_region->resource,
+- &dmx_region->src_rect, DISPMANX_PROTECTION_NONE,
+- &dmx_region->alpha, NULL, VC_IMAGE_ROT0);
+-
+- dmx_region->next = NULL;
+- dmx_region->picture = region->p_picture;
+-
+- return dmx_region;
+-}
+-
+-static void dmx_region_update(struct dmx_region_t *dmx_region,
+- DISPMANX_UPDATE_HANDLE_T update, picture_t *picture)
+-{
+- vc_dispmanx_resource_write_data(dmx_region->resource, VC_IMAGE_RGBA32,
+- picture->p[0].i_pitch, picture->p[0].p_pixels, &dmx_region->bmp_rect);
+- vc_dispmanx_element_change_source(update, dmx_region->element, dmx_region->resource);
+- dmx_region->picture = picture;
+-}
+-
+-static void dmx_region_delete(struct dmx_region_t *dmx_region,
+- DISPMANX_UPDATE_HANDLE_T update)
+-{
+- vc_dispmanx_element_remove(update, dmx_region->element);
+- vc_dispmanx_resource_delete(dmx_region->resource);
+- free(dmx_region);
+-}
+-
+ static void maintain_phase_sync(vout_display_t *vd)
+ {
+ MMAL_PARAMETER_VIDEO_RENDER_STATS_T render_stats = {
+ .hdr = { MMAL_PARAMETER_VIDEO_RENDER_STATS, sizeof(render_stats) },
+ };
+- int32_t frame_duration = 1000000 /
++ int32_t frame_duration = CLOCK_FREQ /
+ ((double)vd->sys->i_frame_rate /
+ vd->sys->i_frame_rate_base);
+ vout_display_sys_t *sys = vd->sys;
+@@ -1012,32 +1322,436 @@ static void maintain_phase_sync(vout_dis
+ }
+ }
+
+-static void show_background(vout_display_t *vd, bool enable)
++/* Tear down the MMAL vout: release subpicture buffers and components, the
++ * main renderer component and its pools, the ISP and VZC state, then free
++ * vd->sys.  OpenMmalVout() also calls this on its failure path, so most
++ * handles are tested for NULL / enabled state before being released.
++ */
++static void CloseMmalVout(vlc_object_t *object)
+ {
+- vout_display_sys_t *sys = vd->sys;
+- uint32_t image_ptr, color = 0xFF000000;
+- VC_RECT_T dst_rect, src_rect;
+- DISPMANX_UPDATE_HANDLE_T update;
+-
+- if (enable && !sys->bkg_element) {
+- sys->bkg_resource = vc_dispmanx_resource_create(VC_IMAGE_RGBA32, 1, 1,
+- &image_ptr);
+- vc_dispmanx_rect_set(&dst_rect, 0, 0, 1, 1);
+- vc_dispmanx_resource_write_data(sys->bkg_resource, VC_IMAGE_RGBA32,
+- sizeof(color), &color, &dst_rect);
+- vc_dispmanx_rect_set(&src_rect, 0, 0, 1 << 16, 1 << 16);
+- vc_dispmanx_rect_set(&dst_rect, 0, 0, 0, 0);
+- update = vc_dispmanx_update_start(0);
+- sys->bkg_element = vc_dispmanx_element_add(update, sys->dmx_handle,
+- sys->layer - 1, &dst_rect, sys->bkg_resource, &src_rect,
+- DISPMANX_PROTECTION_NONE, NULL, NULL, VC_IMAGE_ROT0);
+- vc_dispmanx_update_submit_sync(update);
+- } else if (!enable && sys->bkg_element) {
+- update = vc_dispmanx_update_start(0);
+- vc_dispmanx_element_remove(update, sys->bkg_element);
+- vc_dispmanx_resource_delete(sys->bkg_resource);
+- vc_dispmanx_update_submit_sync(update);
+- sys->bkg_element = DISPMANX_NO_HANDLE;
+- sys->bkg_resource = DISPMANX_NO_HANDLE;
++ vout_display_t * const vd = (vout_display_t *)object;
++ vout_display_sys_t * const sys = vd->sys;
++ char response[20]; /* answer is hvs_update_fields=%1d */
++
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s", __func__);
++#endif
++
++ kill_pool(sys);
++
++ vc_tv_unregister_callback_full(tvservice_cb, vd);
++
++ // Shouldn't be anything here - but just in case
++ for (unsigned int i = 0; i != SUBS_MAX; ++i)
++ if (sys->subpic_bufs[i] != NULL)
++ mmal_buffer_header_release(sys->subpic_bufs[i]);
++
++ for (unsigned int i = 0; i != SUBS_MAX; ++i) { /* per-subpicture renderer components */
++ vout_subpic_t * const sub = sys->subs + i;
++ if (sub->component != NULL) {
++ hw_mmal_subpic_close(VLC_OBJECT(vd), &sub->sub);
++ if (sub->component->control->is_enabled)
++ mmal_port_disable(sub->component->control);
++ if (sub->component->is_enabled)
++ mmal_component_disable(sub->component);
++ mmal_component_release(sub->component);
++ sub->component = NULL;
++ }
+ }
++
++ if (sys->input && sys->input->is_enabled) /* main renderer: ports are disabled before the component itself */
++ mmal_port_disable(sys->input);
++
++ if (sys->component && sys->component->control->is_enabled)
++ mmal_port_disable(sys->component->control);
++
++ if (sys->copy_buf != NULL)
++ mmal_buffer_header_release(sys->copy_buf);
++
++ if (sys->input != NULL && sys->copy_pool != NULL)
++ mmal_port_pool_destroy(sys->input, sys->copy_pool);
++
++ if (sys->component && sys->component->is_enabled)
++ mmal_component_disable(sys->component);
++
++ if (sys->pool)
++ mmal_pool_destroy(sys->pool);
++
++ if (sys->component)
++ mmal_component_release(sys->component);
++
++ isp_close(vd, sys);
++
++ hw_mmal_vzc_pool_release(sys->vzc);
++
++ vlc_mutex_destroy(&sys->manage_mutex);
++
++ if (sys->native_interlaced) {
++ if (vc_gencmd(response, sizeof(response), "hvs_update_fields 0") < 0 ||
++ response[18] != '0') /* index 18 is the digit following "hvs_update_fields=" */
++ msg_Warn(vd, "Could not reset hvs field mode");
++ }
++
++ cma_vcsm_exit(sys->init_type);
++
++ free(sys);
++
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s", __func__);
++#endif
++}
++
++
++static const struct {
++ const char * name; // option value, matched case-insensitively by find_display_num()
++ int num; // display id, or a negative marker (-1 auto, -2 unknown)
++} display_name_to_num[] = {
++ {"auto", -1},
++ {"hdmi-1", DISPMANX_ID_HDMI0},
++ {"hdmi-2", DISPMANX_ID_HDMI1},
++ {NULL, -2} // terminator; its num is what find_display_num() returns for an unmatched name
++};
++
++static const struct {
++ const char * name; // option value, matched case-insensitively by find_transform_num()
++ int transform_num; // MMAL_DISPLAY_* transform value, or a negative marker (-1 auto, -2 unknown)
++} transform_name_to_num[] = {
++ {"auto", -1},
++ {"0", MMAL_DISPLAY_ROT0},
++ {"hflip", MMAL_DISPLAY_MIRROR_ROT0},
++ {"vflip", MMAL_DISPLAY_MIRROR_ROT180},
++ {"180", MMAL_DISPLAY_ROT180},
++ {"transpose", MMAL_DISPLAY_MIRROR_ROT90},
++ {"270", MMAL_DISPLAY_ROT270},
++ {"90", MMAL_DISPLAY_ROT90},
++ {"antitranspose", MMAL_DISPLAY_MIRROR_ROT270},
++ {NULL, -2} // terminator; its transform_num is what find_transform_num() returns for an unmatched name
++};
++
++/* Look up a display option name in display_name_to_num[].
++ *
++ * The match is case-insensitive.  Returns the matching entry's number
++ * (-1 for "auto"), or the terminator entry's -2 when nothing matches.
++ * A NULL name (var_InheritString() can return NULL) matches nothing and
++ * therefore also yields -2, instead of being passed to strcasecmp().
++ */
++static int find_display_num(const char * const name)
++{
++    unsigned int i = 0;
++
++    // With a NULL name no entry can match, so this walks to the terminator
++    while (display_name_to_num[i].name != NULL &&
++           (name == NULL || strcasecmp(display_name_to_num[i].name, name) != 0))
++        ++i;
++    return display_name_to_num[i].num;
++}
++
++/* Look up a transform option name in transform_name_to_num[].
++ *
++ * The match is case-insensitive.  Returns the matching entry's
++ * transform_num (-1 for "auto"), or the terminator entry's -2 when nothing
++ * matches.  A NULL name (var_InheritString() can return NULL) matches
++ * nothing and therefore also yields -2, instead of being passed to
++ * strcasecmp().
++ */
++static int find_transform_num(const char * const name)
++{
++    unsigned int i = 0;
++
++    // With a NULL name no entry can match, so this walks to the terminator
++    while (transform_name_to_num[i].name != NULL &&
++           (name == NULL || strcasecmp(transform_name_to_num[i].name, name) != 0))
++        ++i;
++    return transform_name_to_num[i].transform_num;
++}
++
++#if HAVE_X11_XLIB_H
++#include <X11/Xlib.h>
++#include <X11/extensions/Xrandr.h>
++/* Ask the default X display for the current XRandR rotation of screen 0
++ * and translate it to an MMAL display transform.
++ *
++ * Returns MMAL_DISPLAY_ROT0 when the X display cannot be opened or the
++ * reported value is not one of the single rotate / reflect values handled
++ * below (the latter is also logged).
++ */
++static MMAL_DISPLAYTRANSFORM_T get_xrandr_rotation(vout_display_t * const vd)
++{
++    Rotation x_rot = 0;
++    Display * const dpy = XOpenDisplay(NULL);
++
++    if (dpy == NULL)
++        return MMAL_DISPLAY_ROT0;
++
++    XRRRotations(dpy, 0, &x_rot);
++    XCloseDisplay(dpy);
++
++    // xrandr seems to rotate the other way to mmal, hence 90 <-> 270
++    switch (x_rot)
++    {
++        case 0:
++        case RR_Rotate_0:
++            return MMAL_DISPLAY_ROT0;
++        case RR_Rotate_90:
++            return MMAL_DISPLAY_ROT270;
++        case RR_Rotate_180:
++            return MMAL_DISPLAY_ROT180;
++        case RR_Rotate_270:
++            return MMAL_DISPLAY_ROT90;
++        case RR_Reflect_X:
++            return MMAL_DISPLAY_MIRROR_ROT0;
++        case RR_Reflect_Y:
++            return MMAL_DISPLAY_MIRROR_ROT180;
++        default:
++            break;
++    }
++
++    msg_Info(vd, "Unexpected X rotation value: %#x", x_rot);
++    return MMAL_DISPLAY_ROT0;
++}
++#else
++/* Built without Xlib: there is no X rotation to query, report "unrotated". */
++static MMAL_DISPLAYTRANSFORM_T get_xrandr_rotation(vout_display_t * const vd)
++{
++    VLC_UNUSED(vd);
++    return MMAL_DISPLAY_ROT0;
++}
++#endif
++
++/* Parse a window geometry string of the form W[xH[+X[+Y]]].
++ *
++ * Each number is read with strtoul(..., 0), so the usual C prefixes for
++ * octal and hex are accepted.  Trailing fields that are omitted stay 0.
++ * Returns an all-zero rectangle if s is NULL or the string does not follow
++ * the syntax (e.g. the non-numeric default "fullscreen"); callers treat a
++ * zero width as "no window requested".
++ */
++static MMAL_RECT_T str_to_rect(const char * s)
++{
++    MMAL_RECT_T rect = {0};
++
++    // var_InheritString() may hand us NULL; strtoul(NULL, ...) is undefined
++    if (s == NULL)
++        return rect;
++
++    rect.width = strtoul(s, (char**)&s, 0);
++    if (*s == '\0')
++        return rect;
++    if (*s++ != 'x')
++        goto fail;
++    rect.height = strtoul(s, (char**)&s, 0);
++    if (*s == '\0')
++        return rect;
++    if (*s++ != '+')
++        goto fail;
++    rect.x = strtoul(s, (char**)&s, 0);
++    if (*s == '\0')
++        return rect;
++    if (*s++ != '+')
++        goto fail;
++    rect.y = strtoul(s, (char**)&s, 0);
++    if (*s != '\0')
++        goto fail;
++    return rect;
++
++fail:
++    // Discard anything parsed so far: a malformed string means "no window"
++    return (MMAL_RECT_T){0,0,0,0};
++}
++
++/* Open the MMAL vout display.
++ *
++ * Allocates vd->sys, reads the layer / display / window / transform
++ * options, creates and enables the main video renderer component and one
++ * renderer per subpicture slot (SUBS_MAX), then installs the pool /
++ * prepare / display / control callbacks on vd.
++ *
++ * Returns VLC_SUCCESS, VLC_ENOMEM if the sys structure cannot be
++ * allocated, or VLC_EGENERIC on any later failure (after tearing down
++ * through CloseMmalVout()).
++ */
++static int OpenMmalVout(vlc_object_t *object)
++{
++    vout_display_t *vd = (vout_display_t *)object;
++    vout_display_sys_t *sys;
++    MMAL_STATUS_T status;
++    int ret = VLC_EGENERIC;
++    // At the moment all copy is via I420
++    const bool needs_copy = !hw_mmal_chroma_is_mmal(vd->fmt.i_chroma);
++    const MMAL_FOURCC_T enc_in = needs_copy ? MMAL_ENCODING_I420 :
++        vout_vlc_to_mmal_pic_fourcc(vd->fmt.i_chroma);
++
++#if TRACE_ALL
++    msg_Dbg(vd, "<<< %s: o:%d", __func__, (int)vd->fmt.orientation);
++#endif
++
++    sys = calloc(1, sizeof(struct vout_display_sys_t));
++    if (!sys)
++        return VLC_ENOMEM;
++    vd->sys = sys;
++
++    vlc_mutex_init(&sys->manage_mutex);
++
++    if ((sys->init_type = cma_vcsm_init()) == VCSM_INIT_NONE)
++    {
++        msg_Err(vd, "VCSM init fail");
++        goto fail;
++    }
++
++    vc_tv_register_callback(tvservice_cb, vd);
++
++    sys->layer = var_InheritInteger(vd, MMAL_LAYER_NAME);
++    sys->transparent = var_InheritBool(vd, MMAL_VOUT_TRANSPARENT_NAME);
++
++    {
++        // var_InheritString() returns a heap copy (or NULL) that we own
++        char * const display_opt = var_InheritString(vd, MMAL_DISPLAY_NAME);
++        // A missing value is handled like the option's default, "auto"
++        const char * const display_name = display_opt != NULL ? display_opt : "auto";
++        int qt_num = var_InheritInteger(vd, "qt-fullscreen-screennumber" );
++        int display_id = find_display_num(display_name);
++//        sys->display_id = display_id < 0 ? vc_tv_get_default_display_id() : display_id;
++        sys->display_id = display_id >= 0 ? display_id :
++            qt_num == 1 ? DISPMANX_ID_HDMI1 : DISPMANX_ID_HDMI;
++        if (display_id < -1)
++            msg_Warn(vd, "Unknown display device: '%s'", display_name);
++        else
++            msg_Dbg(vd, "Display device: %s, qt=%d id=%d display=%d", display_name,
++                    qt_num, display_id, sys->display_id);
++        free(display_opt);
++    }
++
++    {
++        char * const window_str = var_InheritString(vd, MMAL_VOUT_WINDOW_NAME);
++        // sys is zero-initialised, so with no string req_win stays empty
++        if (window_str != NULL)
++            sys->req_win = str_to_rect(window_str);
++        free(window_str);
++        if (sys->req_win.width != 0)
++            msg_Dbg(vd, "Window: %dx%d @ %d,%d",
++                    sys->req_win.width, sys->req_win.height,
++                    sys->req_win.x, sys->req_win.y);
++    }
++
++    {
++        char * const transform_opt = var_InheritString(vd, MMAL_VOUT_TRANSFORM_NAME);
++        // A missing value is handled like the option's default, "auto"
++        const char * const transform_name = transform_opt != NULL ? transform_opt : "auto";
++        int transform_num = find_transform_num(transform_name);
++        // "auto" and unknown names both fall back to the X rotation
++        sys->display_transform = transform_num < 0 ?
++            get_xrandr_rotation(vd) :
++            (MMAL_DISPLAYTRANSFORM_T)transform_num;
++
++        if (transform_num < -1)
++            msg_Warn(vd, "Unknown vout transform: '%s'", transform_name);
++        else
++            msg_Dbg(vd, "Display transform: %s, mmal_display_transform=%d",
++                    transform_name, (int)sys->display_transform);
++        free(transform_opt);
++
++        sys->video_transform = combine_transform(
++            vlc_to_mmal_transform(vd->fmt.orientation), sys->display_transform);
++        sys->dest_transform = transform_inverse(sys->display_transform);
++    }
++
++    status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &sys->component);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(vd, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
++                MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, status, mmal_status_to_string(status));
++        goto fail;
++    }
++
++    sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
++    status = mmal_port_enable(sys->component->control, vd_control_port_cb);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(vd, "Failed to enable control port %s (status=%"PRIx32" %s)",
++                sys->component->control->name, status, mmal_status_to_string(status));
++        goto fail;
++    }
++
++    sys->input = sys->component->input[0];
++    sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
++
++    sys->input->format->encoding = enc_in;
++    sys->input->format->encoding_variant = 0;
++    sys->i_planes = 1;
++
++    display_set_format(vd, sys->input->format, want_isp(vd));
++
++    status = port_parameter_set_bool(sys->input, MMAL_PARAMETER_ZERO_COPY, true);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(vd, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
++                sys->input->name, status, mmal_status_to_string(status));
++        goto fail;
++    }
++
++    status = mmal_port_format_commit(sys->input);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(vd, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
++                sys->input->name, status, mmal_status_to_string(status));
++        goto fail;
++    }
++
++    sys->input->buffer_size = sys->input->buffer_size_recommended;
++
++    if (!needs_copy) {
++        sys->input->buffer_num = 30;
++    }
++    else {
++        // Copy path: pictures are copied into a small pool of port buffers
++        sys->input->buffer_num = 2;
++        if ((sys->copy_pool = mmal_port_pool_create(sys->input, 2, sys->input->buffer_size)) == NULL)
++        {
++            msg_Err(vd, "Cannot create copy pool");
++            goto fail;
++        }
++    }
++
++    set_display_windows(vd, sys);
++
++    configure_display(vd, vd->cfg, &vd->source);
++
++    status = mmal_port_enable(sys->input, vd_input_port_cb);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(vd, "Failed to enable input port %s (status=%"PRIx32" %s)",
++                sys->input->name, status, mmal_status_to_string(status));
++        goto fail;
++    }
++
++    status = mmal_component_enable(sys->component);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(vd, "Failed to enable component %s (status=%"PRIx32" %s)",
++                sys->component->name, status, mmal_status_to_string(status));
++        goto fail;
++    }
++
++    if ((sys->pool = mmal_pool_create(sys->input->buffer_num, 0)) == NULL)
++    {
++        msg_Err(vd, "Failed to create input pool");
++        goto fail;
++    }
++
++    // One renderer per subpicture slot, stacked on the layers above the video
++    for (unsigned int i = 0; i != SUBS_MAX; ++i) {
++        vout_subpic_t * const sub = sys->subs + i;
++        if ((status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &sub->component)) != MMAL_SUCCESS)
++        {
++            msg_Dbg(vd, "Failed to create subpic component %d", i);
++            goto fail;
++        }
++        sub->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
++        if ((status = mmal_port_enable(sub->component->control, vd_control_port_cb)) != MMAL_SUCCESS) {
++            // Report the port that actually failed (the sub's, not the main one)
++            msg_Err(vd, "Failed to enable control port %s on sub %d (status=%"PRIx32" %s)",
++                    sub->component->control->name, i, status, mmal_status_to_string(status));
++            goto fail;
++        }
++        if ((status = hw_mmal_subpic_open(VLC_OBJECT(vd), &sub->sub, sub->component->input[0],
++                                          sys->display_id, sys->layer + i + 1)) != MMAL_SUCCESS) {
++            msg_Dbg(vd, "Failed to open subpic %d", i);
++            goto fail;
++        }
++        if ((status = mmal_component_enable(sub->component)) != MMAL_SUCCESS)
++        {
++            msg_Dbg(vd, "Failed to enable subpic component %d", i);
++            goto fail;
++        }
++    }
++
++    // If we can't deal with it directly ask for I420
++    vd->fmt.i_chroma = req_chroma(vd);
++
++    vd->info = (vout_display_info_t){
++        .is_slow = false,
++        .has_double_click = false,
++        .needs_hide_mouse = false,
++        .has_pictures_invalid = true,
++        .subpicture_chromas = hw_mmal_vzc_subpicture_chromas
++    };
++
++    vd->pool = vd_pool;
++    vd->prepare = vd_prepare;
++    vd->display = vd_display;
++    vd->control = vd_control;
++
++
++    msg_Dbg(vd, ">>> %s: ok", __func__);
++    return VLC_SUCCESS;
++
++fail:
++    // CloseMmalVout() copes with a partially constructed sys and frees it
++    CloseMmalVout(object);
++
++    msg_Dbg(vd, ">>> %s: rv=%d", __func__, ret);
++
++    return ret == VLC_SUCCESS ? VLC_EGENERIC : ret;
+ }
++
++vlc_module_begin()
++
++ add_submodule() // NOTE(review): a submodule is opened before any property is set on the primary module - confirm this is intended
++
++ set_shortname(N_("MMAL vout"))
++ set_description(N_("MMAL-based vout plugin for Raspberry Pi"))
++ set_capability("vout display", 16) // 1 point better than ASCII art
++ add_shortcut("mmal_vout")
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VOUT )
++
++ add_integer(MMAL_LAYER_NAME, 1, MMAL_LAYER_TEXT, MMAL_LAYER_LONGTEXT, false)
++ add_bool(MMAL_ADJUST_REFRESHRATE_NAME, false, MMAL_ADJUST_REFRESHRATE_TEXT,
++ MMAL_ADJUST_REFRESHRATE_LONGTEXT, false)
++ add_bool(MMAL_NATIVE_INTERLACED, false, MMAL_NATIVE_INTERLACE_TEXT,
++ MMAL_NATIVE_INTERLACE_LONGTEXT, false)
++ add_string(MMAL_DISPLAY_NAME, "auto", MMAL_DISPLAY_TEXT, // looked up via find_display_num() in OpenMmalVout()
++ MMAL_DISPLAY_LONGTEXT, false)
++ add_string(MMAL_VOUT_TRANSFORM_NAME, "auto", MMAL_VOUT_TRANSFORM_TEXT, // looked up via find_transform_num() in OpenMmalVout()
++ MMAL_VOUT_TRANSFORM_LONGTEXT, false)
++ add_string(MMAL_VOUT_WINDOW_NAME, "fullscreen", MMAL_VOUT_WINDOW_TEXT, // non-numeric default: str_to_rect() yields a zero rect, i.e. no requested window
++ MMAL_VOUT_WINDOW_LONGTEXT, false)
++ add_bool(MMAL_VOUT_TRANSPARENT_NAME, false, MMAL_VOUT_TRANSPARENT_TEXT,
++ MMAL_VOUT_TRANSPARENT_LONGTEXT, false)
++ set_callbacks(OpenMmalVout, CloseMmalVout)
++
++vlc_module_end()
++
++
+--- /dev/null
++++ b/modules/hw/mmal/xsplitter.c
+@@ -0,0 +1,584 @@
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <stdatomic.h>
++
++#include <vlc_common.h>
++#include <vlc_plugin.h>
++#include <vlc_threads.h>
++#include <vlc_vout_display.h>
++#include <vlc_modules.h>
++
++#include <bcm_host.h>
++#include <interface/mmal/mmal.h>
++#include <interface/mmal/util/mmal_util.h>
++#include <interface/mmal/util/mmal_default_components.h>
++
++#include "mmal_picture.h"
++
++#define TRACE_ALL 0
++
++typedef struct display_desc_s
++{
++ vout_display_t * vout; // child display created by load_display_module(); NULL when none is loaded
++ unsigned int max_pels; // pixel budget: cpy_fmt_limit_size() scales down formats whose visible area exceeds it
++} display_desc_t;
++
++typedef struct mmal_x11_sys_s
++{
++ bool use_mmal; // NOTE(review): presumably selects mmal_desc over x_desc - confirm against the code that sets cur_desc
++ display_desc_t * cur_desc; // descriptor whose vout mmal_x11_pool() forwards to
++ display_desc_t mmal_desc;
++ display_desc_t x_desc;
++ uint32_t changed;
++ vlc_fourcc_t subpicture_chromas[16];
++} mmal_x11_sys_t;
++
++#define MAX_GL_PELS (1920*1080)
++#define MAX_MMAL_PELS (4096*4096) // Should never be hit
++
++#if 0
++// Generator program for sqrt_tab below: entry i is 65536 / sqrt((i + 5) / 4.0), rounded
++// Kept out of the build (not done inline) in case we end up pulling in FP libs we don't want
++#include <math.h>
++#include <stdio.h>
++
++int main(int argc, char *argv[])
++{
++ unsigned int i;
++ for (i = 0; i != 64; ++i)
++ {
++ printf(" [%2u]=%5u,", i, (unsigned int)(0.5 + (1/sqrt((i + 5)/4.0) * 65536.0)));
++ if (i % 4 == 3)
++ printf("\n");
++ }
++}
++#endif
++
++static const uint16_t sqrt_tab[64] = { // 1/sqrt((i + 5)/4) as a fraction of 65536; used as the linear scale factor in cpy_fmt_limit_size()
++ [ 0]=58617, [ 1]=53510, [ 2]=49541, [ 3]=46341,
++ [ 4]=43691, [ 5]=41449, [ 6]=39520, [ 7]=37837,
++ [ 8]=36353, [ 9]=35030, [10]=33843, [11]=32768,
++ [12]=31790, [13]=30894, [14]=30070, [15]=29309,
++ [16]=28602, [17]=27945, [18]=27330, [19]=26755,
++ [20]=26214, [21]=25705, [22]=25225, [23]=24770,
++ [24]=24339, [25]=23930, [26]=23541, [27]=23170,
++ [28]=22817, [29]=22479, [30]=22155, [31]=21845,
++ [32]=21548, [33]=21263, [34]=20988, [35]=20724,
++ [36]=20470, [37]=20225, [38]=19988, [39]=19760,
++ [40]=19539, [41]=19326, [42]=19119, [43]=18919,
++ [44]=18725, [45]=18536, [46]=18354, [47]=18176,
++ [48]=18004, [49]=17837, [50]=17674, [51]=17515,
++ [52]=17361, [53]=17211, [54]=17064, [55]=16921,
++ [56]=16782, [57]=16646, [58]=16514, [59]=16384,
++ [60]=16257, [61]=16134, [62]=16013, [63]=15895
++};
++#define SQRT_MAX (sizeof(sqrt_tab)/sizeof(sqrt_tab[0]) - 1) // highest valid index; cpy_fmt_limit_size() clamps larger indices to it
++
++/* Copy *src into *dst, shrinking the picture size if its visible area
++ * exceeds dd->max_pels.
++ *
++ * When shrinking, the linear scale factor approximates
++ * sqrt(max_pels / src_pels) using sqrt_tab (fixed-point, no floating
++ * point).  The new width is a multiple of 16 and the new height keeps the
++ * source aspect ratio; both the full and the visible dimensions of *dst
++ * are set to the new size.
++ *
++ * Returns true if the size was reduced, false if *dst is a plain copy.
++ */
++static bool cpy_fmt_limit_size(const display_desc_t * const dd,
++                               video_format_t * const dst,
++                               const video_format_t * const src)
++{
++    const unsigned int vis_w = src->i_visible_width;
++    const unsigned int vis_h = src->i_visible_height;
++    const unsigned int src_pel = vis_w * vis_h;
++
++    *dst = *src;
++
++    // Small enough already: nothing more to do
++    if (src_pel <= dd->max_pels)
++        return false;
++
++    // Table index derived from the area ratio in quarter steps.
++    // src_pel > max_pels here, so the subtraction cannot go negative.
++    // Rounding is arranged so that exact square roots work and everything
++    // else rounds the scale down.
++    unsigned int idx = ((src_pel * 4 - 1) / dd->max_pels) - 4;
++    if (idx >= SQRT_MAX)
++        idx = SQRT_MAX;
++    const unsigned int scale = sqrt_tab[idx];
++
++    // Scaled width, rounded up to a multiple of 16
++    const unsigned int new_w = ((vis_w * scale + (16 << 16) - 1) >> 16) & ~15;
++    // Height follows from the new width (rounded to nearest)
++    const unsigned int new_h = (vis_h * new_w + vis_w/2) / vis_w;
++
++    dst->i_visible_width = new_w;
++    dst->i_width = new_w;
++    dst->i_visible_height = new_h;
++    dst->i_height = new_h;
++    return true;
++}
++
++/* Unload the module of a child display (if one is loaded) and release the
++ * display object.  A NULL x_vout is ignored.
++ */
++static void unload_display_module(vout_display_t * const x_vout)
++{
++    if (x_vout == NULL)
++        return;
++
++    if (x_vout->module != NULL)
++        module_unneed(x_vout, x_vout->module);
++
++    vlc_object_release(x_vout);
++}
++
++/* Close the splitter: unload both child displays (X first, then MMAL) and
++ * free the private state.  Does nothing beyond the entry trace when
++ * vd->sys is NULL.
++ */
++static void CloseMmalX11(vlc_object_t *object)
++{
++    vout_display_t * const splitter = (vout_display_t *)object;
++    mmal_x11_sys_t * const state = (mmal_x11_sys_t *)splitter->sys;
++
++    msg_Dbg(splitter, "<<< %s", __func__);
++
++    if (state != NULL) {
++        unload_display_module(state->x_desc.vout);
++        unload_display_module(state->mmal_desc.vout);
++        free(state);
++
++        msg_Dbg(splitter, ">>> %s", __func__);
++    }
++}
++
++/* Event hook installed on a child display: forwards the event to the
++ * splitter's own owner, except that PICTURES_INVALID is dropped when the
++ * splitter does not advertise has_pictures_invalid.
++ */
++static void mmal_x11_event(vout_display_t * x_vd, int cmd, va_list args)
++{
++    vout_display_t * const parent = x_vd->owner.sys;
++#if TRACE_ALL
++    msg_Dbg(parent, "<<< %s (cmd=%d)", __func__, cmd);
++#endif
++
++    // Do not fall into the display assert if Invalid not supported
++    if (cmd != VOUT_DISPLAY_EVENT_PICTURES_INVALID ||
++        parent->info.has_pictures_invalid)
++    {
++        parent->owner.event(parent, cmd, args);
++    }
++}
++
++/* Window-creation hook installed on a child display: delegates to the
++ * splitter's owner so the child gets its window from the same source.
++ */
++static vout_window_t * mmal_x11_window_new(vout_display_t * x_vd, unsigned type)
++{
++    vout_display_t * const parent = x_vd->owner.sys;
++#if TRACE_ALL
++    msg_Dbg(parent, "<<< %s (type=%d)", __func__, type);
++#endif
++    vout_window_t * const win = parent->owner.window_new(parent, type);
++    return win;
++}
++
++/* Window-deletion hook installed on a child display: delegates to the
++ * splitter's owner.
++ */
++static void mmal_x11_window_del(vout_display_t * x_vd, vout_window_t * win)
++{
++    vout_display_t * const parent = x_vd->owner.sys;
++#if TRACE_ALL
++    msg_Dbg(parent, "<<< %s", __func__);
++#endif
++    parent->owner.window_del(parent, win);
++}
++
++
++/* Create a child vout display under vd and load a display module into it.
++ *
++ * The child shares vd's cfg and info, routes its owner callbacks back
++ * through vd, and gets source / fmt copies limited to dd->max_pels.
++ *
++ * On success dd->vout is the new child and 0 is returned.  On failure
++ * dd->vout is NULL and -1 is returned.
++ */
++static int load_display_module(vout_display_t * const vd,
++                               display_desc_t * const dd,
++                               const char * const cap,
++                               const char * const module_name)
++{
++    vout_display_t * const child = vlc_object_create(vd, sizeof(*child));
++
++    dd->vout = NULL;
++    if (child == NULL)
++        return -1;
++
++    // Route the child's owner callbacks back through the splitter
++    child->owner.sys = vd;
++    child->owner.event = mmal_x11_event;
++    child->owner.window_new = mmal_x11_window_new;
++    child->owner.window_del = mmal_x11_window_del;
++
++    child->cfg = vd->cfg;
++    child->info = vd->info;
++    cpy_fmt_limit_size(dd, &child->source, &vd->source);
++    cpy_fmt_limit_size(dd, &child->fmt, &vd->fmt);
++
++    child->module = module_need(child, cap, module_name, true);
++    if (child->module == NULL)
++    {
++        msg_Err(vd, "Failed to open Xsplitter:%s module", module_name);
++        vlc_object_release(child);
++        return -1;
++    }
++
++    msg_Dbg(vd, "R/G/B: %08x/%08x/%08x", child->fmt.i_rmask, child->fmt.i_gmask, child->fmt.i_bmask);
++
++    dd->vout = child;
++    return 0;
++}
++
++
++/* Return a pointer over the current picture_pool_t* (mandatory).
++ *
++ * For performance reasons, it is best to provide at least count
++ * pictures but it is not mandatory.
++ * You can return NULL when you cannot/do not want to allocate
++ * pictures.
++ * The vout display module keeps the ownership of the pool and can
++ * destroy it only when closing or on invalid pictures control.
++ */
++static picture_pool_t * mmal_x11_pool(vout_display_t * vd, unsigned count)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ vout_display_t * const x_vd = sys->cur_desc->vout; // child display currently selected by the splitter
++#if TRACE_ALL
++ char buf0[5], buf1[5]; // separate buffers: both str_fourcc() results are consumed by one msg_Dbg()
++ msg_Dbg(vd, "<<< %s (count=%d) %s:%dx%d->%s:%dx%d", __func__, count,
++ str_fourcc(buf0, vd->fmt.i_chroma),
++ vd->fmt.i_width, vd->fmt.i_height,
++ str_fourcc(buf1, x_vd->fmt.i_chroma),
++ x_vd->fmt.i_width, x_vd->fmt.i_height);
++#endif
++ picture_pool_t * pool = x_vd->pool(x_vd, count); // forward the request unchanged to the child display
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s: %p", __func__, pool);
++#endif
++ return pool;
++}
++
++/* Prepare a picture and an optional subpicture for display (optional).
++ *
++ * It is called before the next pf_display call to provide as much
++ * time as possible to prepare the given picture and the subpicture
++ * for display.
++ * You are guaranteed that pf_display will always be called and using
++ * the exact same picture_t and subpicture_t.
++ * You cannot change the pixel content of the picture_t or of the
++ * subpicture_t.
++ */
++static void mmal_x11_prepare(vout_display_t * vd, picture_t * pic, subpicture_t * sub)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ vout_display_t * const x_vd = sys->cur_desc->vout;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s", __func__);
++#endif
++ if (x_vd->prepare)
++ x_vd->prepare(x_vd, pic, sub);
++}
++
++/* Display a picture and an optional subpicture (mandatory).
++ *
++ * The picture and the optional subpicture must be displayed as soon as
++ * possible.
++ * You cannot change the pixel content of the picture_t or of the
++ * subpicture_t.
++ *
++ * This function gives away the ownership of the picture and of the
++ * subpicture, so you must release them as soon as possible.
++ */
++static void mmal_x11_display(vout_display_t * vd, picture_t * pic, subpicture_t * sub)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ vout_display_t * const x_vd = sys->cur_desc->vout;
++
++#if TRACE_ALL
++ const bool is_mmal_pic = hw_mmal_pic_is_mmal(pic);
++ msg_Dbg(vd, "<<< %s: fmt: %dx%d/%dx%d, pic:%dx%d, pts=%lld, mmal=%d/%d", __func__, vd->fmt.i_width, vd->fmt.i_height, x_vd->fmt.i_width, x_vd->fmt.i_height, pic->format.i_width, pic->format.i_height, (long long)pic->date,
++ is_mmal_pic, sys->use_mmal);
++#endif
++
++ if (x_vd->fmt.i_chroma != pic->format.i_chroma ||
++ x_vd->fmt.i_width != pic->format.i_width ||
++ x_vd->fmt.i_height != pic->format.i_height)
++ {
++ msg_Dbg(vd, "%s: Picture dropped", __func__);
++ picture_Release(pic);
++ if (sub != NULL)
++ subpicture_Delete(sub);
++ return;
++ }
++
++ x_vd->display(x_vd, pic, sub);
++}
++
++
++static int vout_display_Control(const display_desc_t * const dd, int query, ...)
++{
++ va_list args;
++ int result;
++
++ va_start(args, query);
++ result = dd->vout->control(dd->vout, query, args);
++ va_end(args);
++
++ return result;
++}
++
++static bool want_mmal_vout(vout_display_t * const vd, const mmal_x11_sys_t * const sys)
++{
++ return sys->mmal_desc.vout != NULL &&
++ (sys->x_desc.vout == NULL || var_InheritBool(vd, "fullscreen"));
++}
++
++static inline int
++up_rv(const int a, const int b)
++{
++ return a != 0 ? a : b;
++}
++
++static int
++reset_pictures(vout_display_t * const vd, const display_desc_t * const desc)
++{
++ int rv = 0;
++ VLC_UNUSED(vd);
++ if (desc->vout)
++ {
++ // If the display doesn't have has_pictures_invalid then it doesn't
++ // expect RESET_PICTURES
++ if (desc->vout->info.has_pictures_invalid)
++ vout_display_Control(desc, VOUT_DISPLAY_RESET_PICTURES);
++ }
++ return rv;
++}
++
++static int
++replay_controls(vout_display_t * const vd, const display_desc_t * const desc, const int32_t changed)
++{
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_DISPLAY_FILLED)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_DISPLAY_FILLED, vd->cfg);
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_ZOOM)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_ZOOM, vd->cfg);
++ if ((changed & ((1 << VOUT_DISPLAY_CHANGE_SOURCE_CROP) |
++ (1 << VOUT_DISPLAY_CHANGE_SOURCE_ASPECT))) != 0)
++ cpy_fmt_limit_size(desc, &desc->vout->source, &vd->source);
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_SOURCE_ASPECT)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_SOURCE_ASPECT);
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_SOURCE_CROP)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_SOURCE_CROP);
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_VIEWPOINT)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_VIEWPOINT, vd->cfg);
++ return 0;
++}
++
++/* Control on the module (mandatory) */
++static int mmal_x11_control(vout_display_t * vd, int ctl, va_list va)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ display_desc_t *x_desc = sys->cur_desc;
++ int rv;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s[%d] (ctl=%d)", __func__, sys->use_mmal, ctl);
++#endif
++ // Remember what we've told this vd - unwanted ctls ignored on replay
++ if (ctl >= 0 && ctl <= 31)
++ sys->changed |= (1 << ctl);
++
++ switch (ctl) {
++ case VOUT_DISPLAY_CHANGE_DISPLAY_SIZE:
++ {
++ const vout_display_cfg_t * const cfg = va_arg(va, const vout_display_cfg_t *);
++ const bool want_mmal = want_mmal_vout(vd, sys);
++ const bool swap_vout = (sys->use_mmal != want_mmal);
++ display_desc_t * const new_desc = want_mmal ? &sys->mmal_desc : &sys->x_desc;
++
++ msg_Dbg(vd, "Change size: %d, %d: mmal_vout=%p, want_mmal=%d, fs=%d",
++ cfg->display.width, cfg->display.height, sys->mmal_desc.vout, want_mmal,
++ var_InheritBool(vd, "fullscreen"));
++
++ // Repeat any control calls that we sent to the previous vd
++ if (swap_vout && sys->changed != 0) {
++ const uint32_t changed = sys->changed;
++ sys->changed = 0;
++ replay_controls(vd, new_desc, changed);
++ }
++
++ if (swap_vout) {
++ if (sys->use_mmal) {
++ vout_display_Control(x_desc, VOUT_DISPLAY_CHANGE_MMAL_HIDE);
++ }
++ vout_display_SendEventPicturesInvalid(vd);
++ }
++
++ rv = vout_display_Control(new_desc, ctl, cfg);
++ if (rv == VLC_SUCCESS) {
++ vd->fmt = new_desc->vout->fmt;
++ sys->cur_desc = new_desc;
++ sys->use_mmal = want_mmal;
++ }
++
++
++ break;
++ }
++
++ case VOUT_DISPLAY_RESET_PICTURES:
++ {
++ char dbuf0[5], dbuf1[5], dbuf2[5];
++ msg_Dbg(vd, "<<< %s: Pic reset: fmt: %s,%dx%d<-%s,%dx%d, source: %s,%dx%d/%dx%d", __func__,
++ str_fourcc(dbuf0, vd->fmt.i_chroma), vd->fmt.i_width, vd->fmt.i_height,
++ str_fourcc(dbuf1, x_desc->vout->fmt.i_chroma), x_desc->vout->fmt.i_width, x_desc->vout->fmt.i_height,
++ str_fourcc(dbuf2, vd->source.i_chroma), vd->source.i_width, vd->source.i_height, x_desc->vout->source.i_width,
++ x_desc->vout->source.i_height);
++ }
++ rv = reset_pictures(vd, &sys->x_desc);
++ rv = up_rv(rv, reset_pictures(vd, &sys->mmal_desc));
++
++ vd->fmt = x_desc->vout->fmt;
++ break;
++
++ case VOUT_DISPLAY_CHANGE_SOURCE_ASPECT:
++ case VOUT_DISPLAY_CHANGE_SOURCE_CROP:
++ cpy_fmt_limit_size(x_desc, &x_desc->vout->source, &vd->source);
++
++ /* FALLTHRU */
++ default:
++ rv = x_desc->vout->control(x_desc->vout, ctl, va);
++// vd->fmt = x_vd->fmt;
++ break;
++ }
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s (rv=%d)", __func__, rv);
++#endif
++ return rv;
++}
++
++#define DO_MANAGE 0
++
++#if DO_MANAGE
++/* Manage pending event (optional) */
++static void mmal_x11_manage(vout_display_t * vd)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ vout_display_t * const x_vd = sys->cur_desc->vout;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s", __func__);
++#endif
++ x_vd->manage(x_vd);
++}
++#endif
++
++static int OpenMmalX11(vlc_object_t *object)
++{
++ vout_display_t * const vd = (vout_display_t *)object;
++ mmal_x11_sys_t * const sys = calloc(1, sizeof(*sys));
++ int ret = VLC_SUCCESS;
++
++ if (sys == NULL) {
++ return VLC_EGENERIC;
++ }
++ vd->sys = (vout_display_sys_t *)sys;
++
++ vd->info = (vout_display_info_t){
++ .is_slow = false,
++ .has_double_click = false,
++ .needs_hide_mouse = false,
++ .has_pictures_invalid = true,
++ .subpicture_chromas = NULL
++ };
++
++ {
++ char dbuf0[5];
++ msg_Dbg(vd, ">>> %s: %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d", __func__,
++ str_fourcc(dbuf0, vd->fmt.i_chroma),
++ vd->fmt.i_width, vd->fmt.i_height,
++ vd->fmt.i_x_offset, vd->fmt.i_y_offset,
++ vd->fmt.i_visible_width, vd->fmt.i_visible_height,
++ vd->fmt.i_sar_num, vd->fmt.i_sar_den);
++ }
++
++ sys->x_desc.max_pels = MAX_GL_PELS;
++ sys->mmal_desc.max_pels = MAX_MMAL_PELS;
++
++ if (load_display_module(vd, &sys->x_desc, "vout display", "opengles2") == 0)
++ {
++ msg_Dbg(vd, "Opengles2 output found");
++ }
++ else
++ {
++ sys->x_desc.max_pels = MAX_MMAL_PELS;
++ if (load_display_module(vd, &sys->x_desc, "vout display", "xcb_x11") == 0)
++ msg_Dbg(vd, "X11 XCB output found");
++ }
++
++ if ((load_display_module(vd, &sys->mmal_desc, "vout display", "mmal_vout")) == 0)
++ msg_Dbg(vd, "MMAL output found");
++
++ if (sys->mmal_desc.vout == NULL && sys->x_desc.vout == NULL) {
++ char dbuf0[5], dbuf1[5];
++ msg_Info(vd, "No valid output found for vout (%s/%s)", str_fourcc(dbuf0, vd->fmt.i_chroma), str_fourcc(dbuf1, vd->source.i_chroma));
++ goto fail;
++ }
++
++ vd->pool = mmal_x11_pool;
++ vd->prepare = mmal_x11_prepare;
++ vd->display = mmal_x11_display;
++ vd->control = mmal_x11_control;
++#if DO_MANAGE
++ vd->manage = mmal_x11_manage;
++#endif
++
++ if (want_mmal_vout(vd, sys)) {
++ sys->cur_desc = &sys->mmal_desc;
++ sys->use_mmal = true;
++ }
++ else {
++ sys->cur_desc = &sys->x_desc;
++ sys->use_mmal = false;
++ }
++
++ if (sys->mmal_desc.vout == NULL || sys->x_desc.vout == NULL) {
++ vd->info = sys->cur_desc->vout->info;
++ vd->info.has_pictures_invalid = true; // Should make this unwanted
++ }
++ else {
++ // We have both - construct a combination
++ vd->info = (vout_display_info_t){
++ .is_slow = false,
++ .has_double_click = sys->mmal_desc.vout->info.has_double_click || sys->x_desc.vout->info.has_double_click,
++ .needs_hide_mouse = sys->mmal_desc.vout->info.needs_hide_mouse || sys->x_desc.vout->info.needs_hide_mouse,
++ .has_pictures_invalid = true,
++ };
++ // Construct intersection of subpicture chromas
++ // sys calloced so no need to add the terminating zero
++ if (sys->mmal_desc.vout->info.subpicture_chromas != NULL && sys->x_desc.vout->info.subpicture_chromas != NULL) {
++ unsigned int n = 0;
++ // N^2 - fix if we ever care
++ for (const vlc_fourcc_t * p1 = sys->mmal_desc.vout->info.subpicture_chromas; *p1 != 0 && n != 15; ++p1) {
++ for (const vlc_fourcc_t * p2 = sys->x_desc.vout->info.subpicture_chromas; *p2 != 0; ++p2) {
++ if (*p1 == *p2) {
++ sys->subpicture_chromas[n++] = *p1;
++ break;
++ }
++ }
++ }
++ if (n != 0)
++ vd->info.subpicture_chromas = sys->subpicture_chromas;
++ }
++ }
++ vd->fmt = sys->cur_desc->vout->fmt;
++
++#if TRACE_ALL
++ {
++ char dbuf0[5];
++ msg_Dbg(vd, ">>> %s: (%s) %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d", __func__,
++ module_get_name(sys->cur_desc->vout->module, false),
++ str_fourcc(dbuf0, vd->fmt.i_chroma),
++ vd->fmt.i_width, vd->fmt.i_height,
++ vd->fmt.i_x_offset, vd->fmt.i_y_offset,
++ vd->fmt.i_visible_width, vd->fmt.i_visible_height,
++ vd->fmt.i_sar_num, vd->fmt.i_sar_den);
++ }
++#endif
++ return VLC_SUCCESS;
++
++fail:
++ CloseMmalX11(VLC_OBJECT(vd));
++ return ret == VLC_SUCCESS ? VLC_EGENERIC : ret;
++}
++
++
++
++
++vlc_module_begin()
++ set_shortname(N_("MMAL x11 splitter"))
++ set_description(N_("MMAL x11 splitter for Raspberry Pi"))
++ set_capability("vout display", 300) // Between GLES & GL
++ add_shortcut("mmal_x11")
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VOUT )
++ set_callbacks(OpenMmalX11, CloseMmalX11)
++vlc_module_end()
++
+--- a/modules/video_output/opengl/egl.c
++++ b/modules/video_output/opengl/egl.c
+@@ -43,6 +43,8 @@
+ # include "../android/utils.h"
+ #endif
+
++#define REQUIRE_DMA_BUF_IMPORT 1
++
+ typedef struct vlc_gl_sys_t
+ {
+ EGLDisplay display;
+@@ -355,6 +357,14 @@ static int Open (vlc_object_t *obj, cons
+ goto error;
+ }
+
++#if REQUIRE_DMA_BUF_IMPORT
++ if (!CheckToken(ext, "EGL_EXT_image_dma_buf_import"))
++ {
++ msg_Dbg(obj, "No dma_buf_import - fall back to X");
++ goto error;
++ }
++#endif
++
+ const EGLint conf_attr[] = {
+ EGL_RED_SIZE, 5,
+ EGL_GREEN_SIZE, 5,
+--- a/src/input/decoder.c
++++ b/src/input/decoder.c
+@@ -1995,6 +1995,7 @@ void input_DecoderDelete( decoder_t *p_d
+ vlc_mutex_lock( &p_owner->lock );
+ p_owner->b_waiting = false;
+ vlc_cond_signal( &p_owner->wait_request );
++ vlc_mutex_unlock( &p_owner->lock );
+
+ /* If the video output is paused or slow, or if the picture pool size was
+ * under-estimated (e.g. greedy video filter, buggy decoder...), the
+@@ -2005,7 +2006,6 @@ void input_DecoderDelete( decoder_t *p_d
+ * worker threads (if any) and the decoder thread to terminate. */
+ if( p_owner->p_vout != NULL )
+ vout_Cancel( p_owner->p_vout, true );
+- vlc_mutex_unlock( &p_owner->lock );
+
+ vlc_join( p_owner->thread, NULL );
+
+--- a/src/misc/fourcc.c
++++ b/src/misc/fourcc.c
+@@ -755,8 +755,13 @@ static const struct
+ { { VLC_CODEC_VDPAU_VIDEO_420, VLC_CODEC_VDPAU_VIDEO_422,
+ VLC_CODEC_VDPAU_VIDEO_444, VLC_CODEC_VDPAU_OUTPUT },
+ FAKE_FMT() },
+- { { VLC_CODEC_ANDROID_OPAQUE, VLC_CODEC_MMAL_OPAQUE,
+- VLC_CODEC_D3D9_OPAQUE, VLC_CODEC_D3D11_OPAQUE },
++ { { VLC_CODEC_ANDROID_OPAQUE }, FAKE_FMT() },
++ { { VLC_CODEC_MMAL_OPAQUE, VLC_CODEC_MMAL_ZC_SAND30 },
++ FAKE_FMT() },
++ { { VLC_CODEC_MMAL_ZC_I420, VLC_CODEC_MMAL_ZC_SAND8,
++ VLC_CODEC_MMAL_ZC_SAND10, VLC_CODEC_MMAL_ZC_RGB32 },
++ FAKE_FMT() },
++ { { VLC_CODEC_D3D9_OPAQUE, VLC_CODEC_D3D11_OPAQUE },
+ FAKE_FMT() },
+ { { VLC_CODEC_D3D11_OPAQUE_10B, VLC_CODEC_D3D9_OPAQUE_10B },
+ FAKE_FMT() },
+--- a/src/misc/picture.c
++++ b/src/misc/picture.c
+@@ -365,10 +365,30 @@ void picture_CopyProperties( picture_t *
+ p_dst->b_top_field_first = p_src->b_top_field_first;
+ }
+
++static inline bool is_zc_chroma(const vlc_fourcc_t i_chroma)
++{
++ return i_chroma == VLC_CODEC_MMAL_OPAQUE ||
++ i_chroma == VLC_CODEC_MMAL_ZC_I420 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_RGB32 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND10 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND30 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND8;
++}
++
+ void picture_CopyPixels( picture_t *p_dst, const picture_t *p_src )
+ {
+- for( int i = 0; i < p_src->i_planes ; i++ )
+- plane_CopyPixels( p_dst->p+i, p_src->p+i );
++ if( is_zc_chroma(p_src->format.i_chroma) )
++ {
++ assert(p_dst->i_planes == 0);
++ p_dst->i_planes = p_src->i_planes;
++ for( int i = 0; i < p_src->i_planes; i++ )
++ p_dst->p[i] = p_src->p[i];
++ }
++ else
++ {
++ for( int i = 0; i < p_src->i_planes; i++ )
++ plane_CopyPixels( p_dst->p+i, p_src->p+i );
++ }
+
+ assert( p_dst->context == NULL );
+
+--- a/src/video_output/video_output.c
++++ b/src/video_output/video_output.c
+@@ -964,6 +964,17 @@ static picture_t *ConvertRGB32AndBlend(v
+ return NULL;
+ }
+
++
++static inline bool is_zc_chroma(const vlc_fourcc_t i_chroma)
++{
++ return i_chroma == VLC_CODEC_MMAL_OPAQUE ||
++ i_chroma == VLC_CODEC_MMAL_ZC_I420 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_RGB32 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND10 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND30 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND8;
++}
++
+ static int ThreadDisplayRenderPicture(vout_thread_t *vout, bool is_forced)
+ {
+ vout_thread_sys_t *sys = vout->p;
+@@ -1098,7 +1109,7 @@ static int ThreadDisplayRenderPicture(vo
+ }
+
+ assert(vout_IsDisplayFiltered(vd) == !sys->display.use_dr);
+- if (sys->display.use_dr && !is_direct) {
++ if (sys->display.use_dr && !is_direct && !is_zc_chroma(todisplay->format.i_chroma)) {
+ picture_t *direct = NULL;
+ if (likely(vout->p->display_pool != NULL))
+ direct = picture_pool_Get(vout->p->display_pool);
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0005-mmal_exit_fix.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0005-mmal_exit_fix.patch
new file mode 100644
index 0000000..d8fc7fb
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0005-mmal_exit_fix.patch
@@ -0,0 +1,19 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+--- a/bin/vlc.c
++++ b/bin/vlc.c
+@@ -106,7 +106,10 @@ static void vlc_kill (void *data)
+ static void exit_timeout (int signum)
+ {
+ (void) signum;
+- signal (SIGINT, SIG_DFL);
++// This doesn't seem to be strong enough to reliably kill us if we fail to exit
++// in a timely fashion - so upgrade to _exit().
++// signal (SIGINT, SIG_DFL);
++ _exit(0);
+ }
+
+ /*****************************************************************************
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0006-mmal_chain.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0006-mmal_chain.patch
new file mode 100644
index 0000000..99fd03e
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0006-mmal_chain.patch
@@ -0,0 +1,19 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+--- a/modules/video_chroma/chain.c
++++ b/modules/video_chroma/chain.c
+@@ -280,8 +280,9 @@ static int BuildTransformChain( filter_t
+ return VLC_SUCCESS;
+
+ /* Lets try resize+chroma first, then transform */
+- msg_Dbg( p_filter, "Trying to build chroma+resize" );
+- EsFormatMergeSize( &fmt_mid, &p_filter->fmt_out, &p_filter->fmt_in );
++ msg_Dbg( p_filter, "Trying to build chroma+resize, then transform" );
++ es_format_Copy( &fmt_mid, &p_filter->fmt_out );
++ video_format_TransformTo(&fmt_mid.video, p_filter->fmt_in.video.orientation);
+ i_ret = CreateChain( p_filter, &fmt_mid );
+ es_format_Clean( &fmt_mid );
+ if( i_ret == VLC_SUCCESS )
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0007-armv6.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0007-armv6.patch
new file mode 100644
index 0000000..64a2426
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0007-armv6.patch
@@ -0,0 +1,53 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+--- a/modules/hw/mmal/blend_rgba_neon.S
++++ b/modules/hw/mmal/blend_rgba_neon.S
+@@ -1,10 +1,10 @@
+- .syntax unified
+- .arm
+-// .thumb
+- .text
++#include "../../arm_neon/asm.S"
+ .align 16
+ .arch armv7-a
+- .fpu neon-vfpv4
++ .syntax unified
++#if HAVE_AS_FPU_DIRECTIVE
++ .fpu neon-vfpv4
++#endif
+
+ @ blend_rgbx_rgba_neon
+
+--- a/modules/hw/mmal/codec.c
++++ b/modules/hw/mmal/codec.c
+@@ -29,6 +29,7 @@
+ #include <stdatomic.h>
+
+ #include <vlc_common.h>
++#include <vlc_cpu.h>
+ #include <vlc_plugin.h>
+ #include <vlc_codec.h>
+ #include <vlc_filter.h>
+@@ -2311,6 +2312,9 @@ static int OpenBlendMmal(vlc_object_t *o
+ filter_t * const p_filter = (filter_t *)object;
+ const vlc_fourcc_t vfcc_dst = p_filter->fmt_out.video.i_chroma;
+
++ if (!vlc_CPU_ARM_NEON())
++ return VLC_EGENERIC;
++
+ if (!hw_mmal_chroma_is_mmal(vfcc_dst) ||
+ !hw_mmal_vzc_subpic_fmt_valid(&p_filter->fmt_in.video))
+ {
+@@ -2421,6 +2425,9 @@ static int OpenBlendNeon(vlc_object_t *o
+ MMAL_FOURCC_T mfcc_dst = vlc_to_mmal_video_fourcc(&p_filter->fmt_out.video);
+ blend_neon_fn * blend_fn = (blend_neon_fn *)0;
+
++ if (!vlc_CPU_ARM_NEON())
++ return VLC_EGENERIC;
++
+ // Non-alpha RGB only for dest
+ if (vfcc_dst != VLC_CODEC_RGB32)
+ return VLC_EGENERIC;
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0008-configure-Disable-incompatible-function-pointer-type.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0008-configure-Disable-incompatible-function-pointer-type.patch
new file mode 100644
index 0000000..3dbd08d
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0008-configure-Disable-incompatible-function-pointer-type.patch
@@ -0,0 +1,26 @@
+From 048e4fdd08ac588feb27b03e3ec1824e24f77d62 Mon Sep 17 00:00:00 2001
+From: Khem Raj <raj.khem@gmail.com>
+Date: Sun, 5 Mar 2023 14:13:25 -0800
+Subject: [PATCH 3/3] configure: Disable incompatible-function-pointer-types
+ warning
+
+Upstream-Status: Pending
+Signed-off-by: Khem Raj <raj.khem@gmail.com>
+---
+ configure.ac | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/configure.ac
++++ b/configure.ac
+@@ -105,6 +105,11 @@ AC_SUBST([AM_CFLAGS], [-fcommon])
+ dnl Prevent clang from accepting unknown flags with a mere warning
+ AX_APPEND_COMPILE_FLAGS([-Werror=unknown-warning-option -Werror=invalid-command-line-argument], [CFLAGS])
+ AX_APPEND_COMPILE_FLAGS([-Werror=unknown-warning-option -Werror=invalid-command-line-argument], [CXXFLAGS])
++dnl Stop clang from erroring on function pointer prototype mismatches; vlc seems to rely on them,
++dnl especially in modules/video_filter/deinterlace/algo_yadif.c in how it switches the 'filter' variable
++dnl between the different functions yadif_filter_line_c_16bit() and yadif_filter_line_c()
++AX_APPEND_COMPILE_FLAGS([-Wno-error=incompatible-function-pointer-types], [CFLAGS])
++AX_APPEND_COMPILE_FLAGS([-Wno-error=incompatible-function-pointer-types], [CXXFLAGS])
+
+ dnl
+ dnl Check the operating system
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0009-demux-dash-include-cstdint-needed-for-uint64_t.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0009-demux-dash-include-cstdint-needed-for-uint64_t.patch
new file mode 100644
index 0000000..c526535
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0009-demux-dash-include-cstdint-needed-for-uint64_t.patch
@@ -0,0 +1,30 @@
+From 6fca76ebd76bf8fce9b111e31bda64015cdc770f Mon Sep 17 00:00:00 2001
+From: Johannes Kauffmann <johanneskauffmann@hotmail.com>
+Date: Mon, 11 Jul 2022 19:35:57 +0000
+Subject: [PATCH] demux: dash: include cstdint, needed for uint64_t
+
+Fixes #27077.
+
+Upstream-Status: Backport
+
+https://github.com/videolan/vlc/commit/6fca76ebd76bf8fce9b111e31bda64015cdc770f
+
+---
+ modules/demux/dash/mpd/TemplatedUri.hpp | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/modules/demux/dash/mpd/TemplatedUri.hpp b/modules/demux/dash/mpd/TemplatedUri.hpp
+index 1eeb70cbb6..7f7264a9c8 100644
+--- a/modules/demux/dash/mpd/TemplatedUri.hpp
++++ b/modules/demux/dash/mpd/TemplatedUri.hpp
+@@ -21,6 +21,7 @@
+ #ifndef TEMPLATEDURI_HPP
+ #define TEMPLATEDURI_HPP
+
++#include <cstdint>
+ #include <string>
+
+ namespace dash
+--
+2.34.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2001-fix-luaL-checkint.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2001-fix-luaL-checkint.patch
new file mode 100644
index 0000000..e8990fc
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2001-fix-luaL-checkint.patch
@@ -0,0 +1,236 @@
+* luaL_checkint and luaL_optint were deprecated in lua 5.3
+* replacement functions are luaL_checkinteger and luaL_optinteger
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+Signed-off-by: Tim Orling <TicoTimo@gmail.com>
+
+--- a/modules/lua/demux.c
++++ b/modules/lua/demux.c
+@@ -52,7 +52,7 @@ struct vlclua_playlist
+ static int vlclua_demux_peek( lua_State *L )
+ {
+ stream_t *s = (stream_t *)vlclua_get_this(L);
+- int n = luaL_checkint( L, 1 );
++ int n = luaL_checkinteger( L, 1 );
+ const uint8_t *p_peek;
+
+ ssize_t val = vlc_stream_Peek(s->p_source, &p_peek, n);
+@@ -66,7 +66,7 @@ static int vlclua_demux_peek( lua_State
+ static int vlclua_demux_read( lua_State *L )
+ {
+ stream_t *s = (stream_t *)vlclua_get_this(L);
+- int n = luaL_checkint( L, 1 );
++ int n = luaL_checkinteger( L, 1 );
+ char *buf = malloc(n);
+
+ if (buf != NULL)
+--- a/modules/lua/libs/net.c
++++ b/modules/lua/libs/net.c
+@@ -179,7 +179,7 @@ static int vlclua_net_listen_tcp( lua_St
+ {
+ vlc_object_t *p_this = vlclua_get_this( L );
+ const char *psz_host = luaL_checkstring( L, 1 );
+- int i_port = luaL_checkint( L, 2 );
++ int i_port = luaL_checkinteger( L, 2 );
+ int *pi_fd = net_ListenTCP( p_this, psz_host, i_port );
+ if( pi_fd == NULL )
+ return luaL_error( L, "Cannot listen on %s:%d", psz_host, i_port );
+@@ -251,7 +251,7 @@ static int vlclua_net_connect_tcp( lua_S
+ {
+ vlc_object_t *p_this = vlclua_get_this( L );
+ const char *psz_host = luaL_checkstring( L, 1 );
+- int i_port = luaL_checkint( L, 2 );
++ int i_port = luaL_checkinteger( L, 2 );
+ int i_fd = net_ConnectTCP( p_this, psz_host, i_port );
+ lua_pushinteger( L, vlclua_fd_map_safe( L, i_fd ) );
+ return 1;
+@@ -259,14 +259,14 @@ static int vlclua_net_connect_tcp( lua_S
+
+ static int vlclua_net_close( lua_State *L )
+ {
+- int i_fd = luaL_checkint( L, 1 );
++ int i_fd = luaL_checkinteger( L, 1 );
+ vlclua_fd_unmap_safe( L, i_fd );
+ return 0;
+ }
+
+ static int vlclua_net_send( lua_State *L )
+ {
+- int fd = vlclua_fd_get( L, luaL_checkint( L, 1 ) );
++ int fd = vlclua_fd_get( L, luaL_checkinteger( L, 1 ) );
+ size_t i_len;
+ const char *psz_buffer = luaL_checklstring( L, 2, &i_len );
+
+@@ -278,7 +278,7 @@ static int vlclua_net_send( lua_State *L
+
+ static int vlclua_net_recv( lua_State *L )
+ {
+- int fd = vlclua_fd_get( L, luaL_checkint( L, 1 ) );
++ int fd = vlclua_fd_get( L, luaL_checkinteger( L, 1 ) );
+ size_t i_len = (size_t)luaL_optinteger( L, 2, 1 );
+ char psz_buffer[i_len];
+
+@@ -312,7 +312,7 @@ static int vlclua_net_poll( lua_State *L
+ lua_pushnil( L );
+ for( int i = 0; lua_next( L, 1 ); i++ )
+ {
+- luafds[i] = luaL_checkint( L, -2 );
++ luafds[i] = luaL_checkinteger( L, -2 );
+ p_fds[i].fd = vlclua_fd_get( L, luafds[i] );
+ p_fds[i].events = luaL_checkinteger( L, -1 );
+ p_fds[i].events &= POLLIN | POLLOUT | POLLPRI;
+@@ -360,7 +360,7 @@ static int vlclua_fd_open( lua_State *L
+ #ifndef _WIN32
+ static int vlclua_fd_write( lua_State *L )
+ {
+- int fd = vlclua_fd_get( L, luaL_checkint( L, 1 ) );
++ int fd = vlclua_fd_get( L, luaL_checkinteger( L, 1 ) );
+ size_t i_len;
+ const char *psz_buffer = luaL_checklstring( L, 2, &i_len );
+
+@@ -371,7 +371,7 @@ static int vlclua_fd_write( lua_State *L
+
+ static int vlclua_fd_read( lua_State *L )
+ {
+- int fd = vlclua_fd_get( L, luaL_checkint( L, 1 ) );
++ int fd = vlclua_fd_get( L, luaL_checkinteger( L, 1 ) );
+ size_t i_len = (size_t)luaL_optinteger( L, 2, 1 );
+ char psz_buffer[i_len];
+
+--- a/modules/lua/libs/osd.c
++++ b/modules/lua/libs/osd.c
+@@ -154,7 +154,7 @@ static int vlc_osd_slider_type_from_stri
+
+ static int vlclua_osd_slider( lua_State *L )
+ {
+- int i_position = luaL_checkint( L, 1 );
++ int i_position = luaL_checkinteger( L, 1 );
+ const char *psz_type = luaL_checkstring( L, 2 );
+ int i_type = vlc_osd_slider_type_from_string( psz_type );
+ int i_chan = (int)luaL_optinteger( L, 3, VOUT_SPU_CHANNEL_OSD );
+@@ -198,7 +198,7 @@ static int vlclua_spu_channel_register(
+
+ static int vlclua_spu_channel_clear( lua_State *L )
+ {
+- int i_chan = luaL_checkint( L, 1 );
++ int i_chan = luaL_checkinteger( L, 1 );
+ input_thread_t *p_input = vlclua_get_input_internal( L );
+ if( !p_input )
+ return luaL_error( L, "Unable to find input." );
+--- a/modules/lua/libs/playlist.c
++++ b/modules/lua/libs/playlist.c
+@@ -69,7 +69,7 @@ static int vlclua_playlist_next( lua_Sta
+
+ static int vlclua_playlist_skip( lua_State * L )
+ {
+- int i_skip = luaL_checkint( L, 1 );
++ int i_skip = luaL_checkinteger( L, 1 );
+ playlist_t *p_playlist = vlclua_get_playlist_internal( L );
+ playlist_Skip( p_playlist, i_skip );
+ return 0;
+@@ -127,7 +127,7 @@ static int vlclua_playlist_random( lua_S
+
+ static int vlclua_playlist_gotoitem( lua_State * L )
+ {
+- int i_id = luaL_checkint( L, 1 );
++ int i_id = luaL_checkinteger( L, 1 );
+ playlist_t *p_playlist = vlclua_get_playlist_internal( L );
+ PL_LOCK;
+ playlist_ViewPlay( p_playlist, NULL,
+@@ -138,7 +138,7 @@ static int vlclua_playlist_gotoitem( lua
+
+ static int vlclua_playlist_delete( lua_State * L )
+ {
+- int i_id = luaL_checkint( L, 1 );
++ int i_id = luaL_checkinteger( L, 1 );
+ playlist_t *p_playlist = vlclua_get_playlist_internal( L );
+
+ PL_LOCK;
+@@ -152,8 +152,8 @@ static int vlclua_playlist_delete( lua_S
+
+ static int vlclua_playlist_move( lua_State * L )
+ {
+- int i_item = luaL_checkint( L, 1 );
+- int i_target = luaL_checkint( L, 2 );
++ int i_item = luaL_checkinteger( L, 1 );
++ int i_target = luaL_checkinteger( L, 2 );
+ playlist_t *p_playlist = vlclua_get_playlist_internal( L );
+ PL_LOCK;
+ playlist_item_t *p_item = playlist_ItemGetById( p_playlist, i_item );
+--- a/modules/lua/libs/stream.c
++++ b/modules/lua/libs/stream.c
+@@ -123,7 +123,7 @@ static int vlclua_stream_read( lua_State
+ {
+ int i_read;
+ stream_t **pp_stream = (stream_t **)luaL_checkudata( L, 1, "stream" );
+- int n = luaL_checkint( L, 2 );
++ int n = luaL_checkinteger( L, 2 );
+ uint8_t *p_read = malloc( n );
+ if( !p_read ) return vlclua_error( L );
+
+--- a/modules/lua/libs/volume.c
++++ b/modules/lua/libs/volume.c
+@@ -48,7 +48,7 @@
+ static int vlclua_volume_set( lua_State *L )
+ {
+ playlist_t *p_this = vlclua_get_playlist_internal( L );
+- int i_volume = luaL_checkint( L, 1 );
++ int i_volume = luaL_checkinteger( L, 1 );
+ if( i_volume < 0 )
+ i_volume = 0;
+ int i_ret = playlist_VolumeSet( p_this, i_volume/(float)AOUT_VOLUME_DEFAULT );
+--- a/modules/lua/libs/dialog.c
++++ b/modules/lua/libs/dialog.c
+@@ -382,7 +382,7 @@ static int lua_GetDialogUpdate( lua_Stat
+ /* Read entry in the Lua registry */
+ lua_pushlightuserdata( L, (void*) &key_update );
+ lua_gettable( L, LUA_REGISTRYINDEX );
+- return luaL_checkint( L, -1 );
++ return luaL_checkinteger( L, -1 );
+ }
+
+ /** Manually update a dialog
+@@ -573,22 +573,22 @@ static int vlclua_create_widget_inner( l
+
+ /* Set common arguments: col, row, hspan, vspan, width, height */
+ if( lua_isnumber( L, arg ) )
+- p_widget->i_column = luaL_checkint( L, arg );
++ p_widget->i_column = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_row = luaL_checkint( L, arg );
++ p_widget->i_row = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_horiz_span = luaL_checkint( L, arg );
++ p_widget->i_horiz_span = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_vert_span = luaL_checkint( L, arg );
++ p_widget->i_vert_span = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_width = luaL_checkint( L, arg );
++ p_widget->i_width = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_height = luaL_checkint( L, arg );
++ p_widget->i_height = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+
+ end_of_args:
+--- a/modules/lua/libs/io.c
++++ b/modules/lua/libs/io.c
+@@ -139,7 +139,7 @@ static int vlclua_io_file_seek( lua_Stat
+ const char* psz_mode = luaL_optstring( L, 2, NULL );
+ if ( psz_mode != NULL )
+ {
+- long i_offset = luaL_optlong( L, 3, 0 );
++ long i_offset = (long)luaL_optinteger( L, 3, 0 );
+ int i_mode;
+ if ( !strcmp( psz_mode, "set" ) )
+ i_mode = SEEK_SET;
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2002-use-vorbisidec.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2002-use-vorbisidec.patch
new file mode 100644
index 0000000..bfabf21
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2002-use-vorbisidec.patch
@@ -0,0 +1,33 @@
+From d0a7ba506fd302ad195f79f287b5a5a154ac02a3 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Sun, 4 Dec 2022 16:09:51 -0600
+Subject: [PATCH] tremor provides libvorbisidec, use it instead of libvorbisdec
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+THIS PATCH HAS BEEN REIMPLEMENTED IN ORDER TO APPLY PROPERLY.
+
+Signed-off-by: Tim Orling <TicoTimo@gmail.com>
+---
+ modules/codec/Makefile.am | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/modules/codec/Makefile.am b/modules/codec/Makefile.am
+index 3dadf1119..8b6189e92 100644
+--- a/modules/codec/Makefile.am
++++ b/modules/codec/Makefile.am
+@@ -324,7 +324,7 @@ codec_LTLIBRARIES += $(LTLIBdaala)
+ libtremor_plugin_la_SOURCES = codec/vorbis.c
+ libtremor_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -DMODULE_NAME_IS_tremor
+ libtremor_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(codecdir)'
+-libtremor_plugin_la_LIBADD = -lvorbisdec -logg
++libtremor_plugin_la_LIBADD = -lvorbisidec -logg
+ EXTRA_LTLIBRARIES += libtremor_plugin.la
+ codec_LTLIBRARIES += $(LTLIBtremor)
+
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3001-configure.ac-setup-for-OE-usage.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3001-configure.ac-setup-for-OE-usage.patch
new file mode 100644
index 0000000..d676be3
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3001-configure.ac-setup-for-OE-usage.patch
@@ -0,0 +1,124 @@
+From ddc2ea76058466b45a1acf37bed0d794cd3112a3 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 19:04:42 -0600
+Subject: [PATCH] configure.ac: setup for OE usage
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+Need to use the userland graphics libraries' pkg-config files as it's best
+to not assume /opt/vc is where all libs and headers are installed per
+distro. Also, needed to include $BCMHOST_MMAL_LIBS variable as
+AC_CHECK_LIB(bcm_host) fails to find `vc_tv_unregister_callback_full`.
+Adding $BCMHOST_MMAL_LIBS uses all libs inside
+bcm_host.pc, mmal.pc, vcsm.pc, openmaxil.pc files when checking
+for `vc_tv_unregister_callback_full` function.
+
+Changes the OpenGL library that is linked against from GL to GLESv2.
+
+Ensure correct package config file is used for:
+* opencv
+* freerdp
+
+Adds Workaround for modules/codec/omxil/omxil_core.h
+ multiple definition of `pf_enable_graphic_buffers'
+ multiple definition of `pf_get_graphic_buffer_usage'
+ multiple definition of `pf_get_hal_format'
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ configure.ac | 34 ++++++++++++++++++++++------------
+ 1 file changed, 22 insertions(+), 12 deletions(-)
+
+diff --git a/configure.ac b/configure.ac
+index a72dca0b6..5b8585a26 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -95,6 +95,13 @@ AS_IF([test -n "${with_binary_version}"],[
+ [Binary specific version])
+ ])
+
++# Workaround for modules/codec/omxil/omxil_core.h
++# multiple definition of `pf_enable_graphic_buffers'
++# multiple definition of `pf_get_graphic_buffer_usage'
++# multiple definition of `pf_get_hal_format'
++AC_SUBST([AM_CXXFLAGS], [-fcommon])
++AC_SUBST([AM_CFLAGS], [-fcommon])
++
+ dnl Prevent clang from accepting unknown flags with a mere warning
+ AX_APPEND_COMPILE_FLAGS([-Werror=unknown-warning-option -Werror=invalid-command-line-argument], [CFLAGS])
+ AX_APPEND_COMPILE_FLAGS([-Werror=unknown-warning-option -Werror=invalid-command-line-argument], [CXXFLAGS])
+@@ -1900,7 +1907,7 @@ PKG_ENABLE_MODULES_VLC([BLURAY], [libbluray], [libbluray >= 0.6.2], (libbluray f
+ dnl
+ dnl OpenCV wrapper and example filters
+ dnl
+-PKG_ENABLE_MODULES_VLC([OPENCV], [opencv_example opencv_wrapper], [opencv > 2.0], (OpenCV (computer vision) filter), [auto])
++PKG_ENABLE_MODULES_VLC([OPENCV], [opencv_example opencv_wrapper], [opencv4 >= 2.0], (OpenCV (computer vision) filter), [auto])
+
+
+ dnl
+@@ -2077,7 +2084,7 @@ PKG_ENABLE_MODULES_VLC([VNC], [vnc], [libvncclient >= 0.9.9], (VNC/rfb client su
+
+ dnl RDP/Remote Desktop access module
+ dnl
+-PKG_ENABLE_MODULES_VLC([FREERDP], [rdp], [freerdp >= 1.0.1], (RDP/Remote Desktop client support) )
++PKG_ENABLE_MODULES_VLC([FREERDP], [rdp], [freerdp2 >= 1.0.1], (RDP/Remote Desktop client support) )
+
+ dnl
+ dnl Real RTSP plugin
+@@ -3089,14 +3096,14 @@ PKG_CHECK_MODULES([GL], [gl], [
+ #ifdef _WIN32
+ # include <GL/glew.h>
+ #endif
+-#include <GL/gl.h>
++#include <GLES2/gl2.h>
+ ]], [
+ [int t0 = GL_TEXTURE0;]])
+ ], [
+ GL_CFLAGS=""
+ have_gl="yes"
+ AS_IF([test "${SYS}" != "mingw32"], [
+- GL_LIBS="-lGL"
++ GL_LIBS="-lGLESv2"
+ ], [
+ GL_LIBS="-lopengl32"
+ ])
+@@ -3483,15 +3490,14 @@ AC_ARG_ENABLE(mmal_avcodec,
+ [Use MMAL enabled avcodec libs (default disable)]))
+ if test "${enable_mmal}" != "no"; then
+ VLC_SAVE_FLAGS
+- LDFLAGS="${LDFLAGS} -L/opt/vc/lib -lvchostif"
+- CPPFLAGS="${CPPFLAGS} -isystem /opt/vc/include -isystem /opt/vc/include/interface/vcos/pthreads -isystem /opt/vc/include/interface/vmcs_host/linux"
+- AC_CHECK_HEADERS(interface/mmal/mmal.h,
+- [ AC_CHECK_LIB(bcm_host, vc_tv_unregister_callback_full, [
++ PKG_CHECK_MODULES(BCMHOST_MMAL, [bcm_host mmal vcsm openmaxil egl], [
++ HAVE_MMAL=yes
++ AC_CHECK_HEADERS(interface/mmal/mmal.h,
++ [ AC_CHECK_LIB(bcm_host $BCMHOST_MMAL_LIBS, vc_tv_unregister_callback_full, [
+ have_mmal="yes"
+- VLC_ADD_PLUGIN([mmal])
+- VLC_ADD_LDFLAGS([mmal],[ -L/opt/vc/lib ])
+- VLC_ADD_CFLAGS([mmal],[ -isystem /opt/vc/include -isystem /opt/vc/include/interface/vcos/pthreads -isystem /opt/vc/include/interface/vmcs_host/linux ])
+- VLC_ADD_LIBS([mmal],[ -lbcm_host -lmmal -lmmal_core -lmmal_components -lmmal_util -lvchostif -lvchiq_arm -lvcsm ]) ], [
++ VLC_ADD_PLUGIN([bcm_host mmal vcsm openmaxil egl])
++ VLC_ADD_CFLAGS([bcm_host mmal vcsm openmaxil egl],[$BCMHOST_MMAL_CFLAGS])
++ VLC_ADD_LIBS([bcm_host mmal vcsm openmaxil egl],[$BCMHOST_MMAL_LIBS -lmmal_components]) ], [
+ AS_IF([test "${enable_mmal}" = "yes"],
+ [ AC_MSG_ERROR([Cannot find bcm library...]) ],
+ [ AC_MSG_WARN([Cannot find bcm library...]) ])
+@@ -3500,6 +3506,10 @@ if test "${enable_mmal}" != "no"; then
+ ] , [ AS_IF([test "${enable_mmal}" = "yes"],
+ [ AC_MSG_ERROR([Cannot find development headers for mmal...]) ],
+ [ AC_MSG_WARN([Cannot find development headers for mmal...]) ]) ])
++ ],:[
++ AC_MSG_WARN([${BCMHOST_PKG_ERRORS}: userland graphics not available.])
++ HAVE_MMAL=NO
++ ])
+ VLC_RESTORE_FLAGS
+ fi
+ AM_CONDITIONAL([HAVE_MMAL], [test "${have_mmal}" = "yes"])
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch
new file mode 100644
index 0000000..ab72b4f
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch
@@ -0,0 +1,61 @@
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 07 Jan 2022 07:01:47 PM CST
+Subject: [PATCH] Fix EGL macro undeclared and EGLImageKHR
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+* Fixes compiler issues related to EGL macro constant/enum value type not being defined
+* Updates EGLImage to EGLImageKHR
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+diff --git a/modules/hw/mmal/converter_mmal.c b/modules/hw/mmal/converter_mmal.c
+index f31cb81d8..426af668b 100644
+--- a/modules/hw/mmal/converter_mmal.c
++++ b/modules/hw/mmal/converter_mmal.c
+@@ -28,6 +28,34 @@
+
+ #define TRACE_ALL 0
+
++// Pass Yocto related build errors
++#define EGL_LINUX_DMA_BUF_EXT 0x3270
++#define EGL_LINUX_DRM_FOURCC_EXT 0x3271
++#define EGL_DMA_BUF_PLANE0_FD_EXT 0x3272
++#define EGL_DMA_BUF_PLANE0_OFFSET_EXT 0x3273
++#define EGL_DMA_BUF_PLANE0_PITCH_EXT 0x3274
++#define EGL_DMA_BUF_PLANE1_FD_EXT 0x3275
++#define EGL_DMA_BUF_PLANE1_OFFSET_EXT 0x3276
++#define EGL_DMA_BUF_PLANE1_PITCH_EXT 0x3277
++#define EGL_DMA_BUF_PLANE2_FD_EXT 0x3278
++#define EGL_DMA_BUF_PLANE2_OFFSET_EXT 0x3279
++#define EGL_DMA_BUF_PLANE2_PITCH_EXT 0x327A
++#define EGL_YUV_COLOR_SPACE_HINT_EXT 0x327B
++#define EGL_SAMPLE_RANGE_HINT_EXT 0x327C
++#define EGL_YUV_CHROMA_HORIZONTAL_SITING_HINT_EXT 0x327D
++#define EGL_YUV_CHROMA_VERTICAL_SITING_HINT_EXT 0x327E
++#define EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT 0x3443
++#define EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT 0x3444
++#define EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT 0x3445
++#define EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT 0x3446
++#define EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT 0x3447
++#define EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT 0x3448
++#define EGL_DMA_BUF_PLANE3_FD_EXT 0x3440
++#define EGL_DMA_BUF_PLANE3_OFFSET_EXT 0x3441
++#define EGL_DMA_BUF_PLANE3_PITCH_EXT 0x3442
++#define EGL_DMA_BUF_PLANE3_MODIFIER_LO_EXT 0x3449
++#define EGL_DMA_BUF_PLANE3_MODIFIER_HI_EXT 0x344A
++
+ typedef struct mmal_gl_converter_s
+ {
+ EGLint drm_fourcc;
+@@ -199,7 +227,7 @@ static tex_context_t * get_tex_context(const opengl_tex_converter_t * const tc,
+
+ *a = EGL_NONE;
+
+- const EGLImage image = tc->gl->egl.createImageKHR(tc->gl, EGL_LINUX_DMA_BUF_EXT, NULL, attribs);
++ const EGLImageKHR image = tc->gl->egl.createImageKHR(tc->gl, EGL_LINUX_DMA_BUF_EXT, NULL, attribs);
+ if (!image) {
+ msg_Err(tc, "Failed to import fd %d: Err=%#x", fd, tc->vt->GetError());
+ goto fail;
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch
new file mode 100644
index 0000000..a2dba50
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch
@@ -0,0 +1,43 @@
+From 85f6603aca1d174848b42e696a4cff8af57613d6 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Thu, 8 Dec 2022 23:38:36 -0600
+Subject: [PATCH] codec: omxil_core replace /opt/vc path with /usr/lib
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original VLC and applies patches to enable
+Raspberry Pi support.
+
+Configures omxil_core.c for OE usage, as libbcm_host.so
+and libopenmaxil.so are located in a different location.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/codec/omxil/omxil_core.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/modules/codec/omxil/omxil_core.c b/modules/codec/omxil/omxil_core.c
+index 5098f517a..5922d9034 100644
+--- a/modules/codec/omxil/omxil_core.c
++++ b/modules/codec/omxil/omxil_core.c
+@@ -56,7 +56,7 @@ static const char *ppsz_dll_list[] =
+ #if defined(USE_IOMX)
+ "libiomx.so", /* Not used when using IOMX, the lib should already be loaded */
+ #elif defined(RPI_OMX)
+- "/opt/vc/lib/libopenmaxil.so", /* Broadcom IL core */
++ "/usr/lib/libopenmaxil.so", /* Broadcom IL core */
+ #elif 1
+ "libOMX_Core.so", /* TI OMAP IL core */
+ "libOmxCore.so", /* Qualcomm IL core */
+@@ -70,7 +70,7 @@ static const char *ppsz_dll_list[] =
+ #ifdef RPI_OMX
+ static const char *ppsz_extra_dll_list[] =
+ {
+- "/opt/vc/lib/libbcm_host.so", /* Broadcom host library */
++ "/usr/lib/libbcm_host.so", /* Broadcom host library */
+ 0
+ };
+ #endif
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3004-use-GLESv2-headers-over-GL-headers.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3004-use-GLESv2-headers-over-GL-headers.patch
new file mode 100644
index 0000000..8016ab3
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3004-use-GLESv2-headers-over-GL-headers.patch
@@ -0,0 +1,60 @@
+From 377a67af6c3f7c38f6f7ba24f042ba1a6cfd3f24 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 00:21:43 -0600
+Subject: [PATCH] use GLESv2 headers over GL headers
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+We utilize GLESv2 during compilation. This patch ensures
+we use the GLESv2 headers.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/video_output/opengl/converter.h | 12 +++---------
+ modules/visualization/glspectrum.c | 4 +++-
+ 2 files changed, 6 insertions(+), 10 deletions(-)
+
+diff --git a/modules/video_output/opengl/converter.h b/modules/video_output/opengl/converter.h
+index 7000e1f38..a3fe32671 100644
+--- a/modules/video_output/opengl/converter.h
++++ b/modules/video_output/opengl/converter.h
+@@ -41,15 +41,9 @@
+ # include <OpenGLES/ES2/glext.h>
+ # endif
+ #else /* !defined (__APPLE__) */
+-# if defined (USE_OPENGL_ES2)
+-# include <GLES2/gl2.h>
+-# include <GLES2/gl2ext.h>
+-# else
+-# ifdef _WIN32
+-# include <GL/glew.h>
+-# endif
+-# include <GL/gl.h>
+-# endif
++#define USE_OPENGL_ES2
++#include <GLES2/gl2.h>
++#include <GLES2/gl2ext.h>
+ #endif
+
+ #define VLCGL_PICTURE_MAX 128
+diff --git a/modules/visualization/glspectrum.c b/modules/visualization/glspectrum.c
+index 06f8d1bdf..470080b1a 100644
+--- a/modules/visualization/glspectrum.c
++++ b/modules/visualization/glspectrum.c
+@@ -37,7 +37,9 @@
+ #ifdef __APPLE__
+ # include <OpenGL/gl.h>
+ #else
+-# include <GL/gl.h>
++#define USE_OPENGL_ES2
++#include <GLES2/gl2.h>
++#include <GLES2/gl2ext.h>
+ #endif
+
+ #include <math.h>
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3005-modules-remove-glspectrum-usage.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3005-modules-remove-glspectrum-usage.patch
new file mode 100644
index 0000000..7cf210b
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3005-modules-remove-glspectrum-usage.patch
@@ -0,0 +1,149 @@
+From 5f1bb5889d838719e381350b25c00ef3a75d0e02 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 01:07:55 -0600
+Subject: [PATCH] modules: remove glspectrum usage
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+The glspectrum module requires OpenGL
+while we only want to utilize GLESv2.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/Makefile.in | 24 ------------------------
+ modules/visualization/Makefile.am | 10 ----------
+ 2 files changed, 34 deletions(-)
+
+diff --git a/modules/Makefile.in b/modules/Makefile.in
+index bde45db53..c9c4342ad 100644
+--- a/modules/Makefile.in
++++ b/modules/Makefile.in
+@@ -481,7 +481,6 @@ TESTS = hpack_test$(EXEEXT) hpackenc_test$(EXEEXT) \
+ @HAVE_WIN32_FALSE@am__append_247 = $(X_LIBS) $(X_PRE_LIBS) -lX11
+ @HAVE_DARWIN_FALSE@@HAVE_WIN32_FALSE@am__append_248 = $(X_LIBS) $(X_PRE_LIBS) -lX11
+ @HAVE_EVAS_TRUE@am__append_249 = libevas_plugin.la
+-@HAVE_GL_TRUE@am__append_250 = libglspectrum_plugin.la
+ @ENABLE_SOUT_TRUE@@HAVE_GCRYPT_TRUE@am__append_251 = libaccess_output_livehttp_plugin.la
+ @ENABLE_SOUT_TRUE@am__append_252 = libaccess_output_shout_plugin.la \
+ @ENABLE_SOUT_TRUE@ libaccess_output_srt_plugin.la \
+@@ -2028,13 +2027,7 @@ libgles2_plugin_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+ $(libgles2_plugin_la_CFLAGS) $(CFLAGS) \
+ $(libgles2_plugin_la_LDFLAGS) $(LDFLAGS) -o $@
+-libglspectrum_plugin_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \
+ $(am__DEPENDENCIES_1)
+-am_libglspectrum_plugin_la_OBJECTS = visualization/glspectrum.lo \
+- visualization/visual/fft.lo visualization/visual/window.lo
+-libglspectrum_plugin_la_OBJECTS = \
+- $(am_libglspectrum_plugin_la_OBJECTS)
+-@HAVE_GL_TRUE@am_libglspectrum_plugin_la_rpath = -rpath $(visudir)
+ libglwin32_plugin_la_DEPENDENCIES = libchroma_copy.la \
+ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_5)
+ am__objects_23 = \
+@@ -6507,7 +6500,6 @@ am__depfiles_remade = \
+ video_splitter/$(DEPDIR)/clone.Plo \
+ video_splitter/$(DEPDIR)/libpanoramix_plugin_la-panoramix.Plo \
+ video_splitter/$(DEPDIR)/wall.Plo \
+- visualization/$(DEPDIR)/glspectrum.Plo \
+ visualization/$(DEPDIR)/libgoom_plugin_la-goom.Plo \
+ visualization/$(DEPDIR)/libprojectm_plugin_la-projectm.Plo \
+ visualization/$(DEPDIR)/libvsxu_plugin_la-vsxu.Plo \
+@@ -6731,7 +6723,6 @@ SOURCES = $(liba52_plugin_la_SOURCES) $(libaa_plugin_la_SOURCES) \
+ $(libglconv_vaapi_x11_plugin_la_SOURCES) \
+ $(libglconv_vdpau_plugin_la_SOURCES) \
+ $(libgles2_plugin_la_SOURCES) \
+- $(libglspectrum_plugin_la_SOURCES) \
+ $(libglwin32_plugin_la_SOURCES) $(libglx_plugin_la_SOURCES) \
+ $(libgme_plugin_la_SOURCES) $(libgnutls_plugin_la_SOURCES) \
+ $(libgoom_plugin_la_SOURCES) $(libgradfun_plugin_la_SOURCES) \
+@@ -7130,7 +7121,6 @@ DIST_SOURCES = $(liba52_plugin_la_SOURCES) $(libaa_plugin_la_SOURCES) \
+ $(libglconv_vaapi_x11_plugin_la_SOURCES) \
+ $(libglconv_vdpau_plugin_la_SOURCES) \
+ $(libgles2_plugin_la_SOURCES) \
+- $(libglspectrum_plugin_la_SOURCES) \
+ $(libglwin32_plugin_la_SOURCES) $(libglx_plugin_la_SOURCES) \
+ $(libgme_plugin_la_SOURCES) $(libgnutls_plugin_la_SOURCES) \
+ $(libgoom_plugin_la_SOURCES) $(libgradfun_plugin_la_SOURCES) \
+@@ -12696,13 +12686,6 @@ libevent_thread_la_LDFLAGS = -static
+ visudir = $(pluginsdir)/visualization
+ visu_LTLIBRARIES = $(am__append_250) $(LTLIBgoom) $(LTLIBprojectm) \
+ libvisual_plugin.la $(LTLIBvsxu)
+-libglspectrum_plugin_la_SOURCES = \
+- visualization/glspectrum.c \
+- visualization/visual/fft.c visualization/visual/fft.h \
+- visualization/visual/window.c visualization/visual/window.h \
+- visualization/visual/window_presets.h
+-
+-libglspectrum_plugin_la_LIBADD = $(GL_LIBS) $(LIBM)
+ libgoom_plugin_la_SOURCES = visualization/goom.c
+ libgoom_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) $(GOOM_CFLAGS)
+ libgoom_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(visudir)'
+@@ -15715,8 +15698,6 @@ visualization/$(am__dirstamp):
+ visualization/$(DEPDIR)/$(am__dirstamp):
+ @$(MKDIR_P) visualization/$(DEPDIR)
+ @: > visualization/$(DEPDIR)/$(am__dirstamp)
+-visualization/glspectrum.lo: visualization/$(am__dirstamp) \
+- visualization/$(DEPDIR)/$(am__dirstamp)
+ visualization/visual/$(am__dirstamp):
+ @$(MKDIR_P) visualization/visual
+ @: > visualization/visual/$(am__dirstamp)
+@@ -15728,8 +15709,6 @@ visualization/visual/fft.lo: visualization/visual/$(am__dirstamp) \
+ visualization/visual/window.lo: visualization/visual/$(am__dirstamp) \
+ visualization/visual/$(DEPDIR)/$(am__dirstamp)
+
+-libglspectrum_plugin.la: $(libglspectrum_plugin_la_OBJECTS) $(libglspectrum_plugin_la_DEPENDENCIES) $(EXTRA_libglspectrum_plugin_la_DEPENDENCIES)
+- $(AM_V_CCLD)$(LINK) $(am_libglspectrum_plugin_la_rpath) $(libglspectrum_plugin_la_OBJECTS) $(libglspectrum_plugin_la_LIBADD) $(LIBS)
+ video_output/opengl/libglwin32_plugin_la-vout_helper.lo: \
+ video_output/opengl/$(am__dirstamp) \
+ video_output/opengl/$(DEPDIR)/$(am__dirstamp)
+@@ -21420,7 +21399,6 @@ distclean-compile:
+ @AMDEP_TRUE@@am__include@ @am__quote@video_splitter/$(DEPDIR)/clone.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@video_splitter/$(DEPDIR)/libpanoramix_plugin_la-panoramix.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@video_splitter/$(DEPDIR)/wall.Plo@am__quote@ # am--include-marker
+-@AMDEP_TRUE@@am__include@ @am__quote@visualization/$(DEPDIR)/glspectrum.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@visualization/$(DEPDIR)/libgoom_plugin_la-goom.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@visualization/$(DEPDIR)/libprojectm_plugin_la-projectm.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@visualization/$(DEPDIR)/libvsxu_plugin_la-vsxu.Plo@am__quote@ # am--include-marker
+@@ -30324,7 +30302,6 @@ distclean: distclean-recursive
+ -rm -f video_splitter/$(DEPDIR)/clone.Plo
+ -rm -f video_splitter/$(DEPDIR)/libpanoramix_plugin_la-panoramix.Plo
+ -rm -f video_splitter/$(DEPDIR)/wall.Plo
+- -rm -f visualization/$(DEPDIR)/glspectrum.Plo
+ -rm -f visualization/$(DEPDIR)/libgoom_plugin_la-goom.Plo
+ -rm -f visualization/$(DEPDIR)/libprojectm_plugin_la-projectm.Plo
+ -rm -f visualization/$(DEPDIR)/libvsxu_plugin_la-vsxu.Plo
+@@ -31722,7 +31699,6 @@ maintainer-clean: maintainer-clean-recursive
+ -rm -f video_splitter/$(DEPDIR)/clone.Plo
+ -rm -f video_splitter/$(DEPDIR)/libpanoramix_plugin_la-panoramix.Plo
+ -rm -f video_splitter/$(DEPDIR)/wall.Plo
+- -rm -f visualization/$(DEPDIR)/glspectrum.Plo
+ -rm -f visualization/$(DEPDIR)/libgoom_plugin_la-goom.Plo
+ -rm -f visualization/$(DEPDIR)/libprojectm_plugin_la-projectm.Plo
+ -rm -f visualization/$(DEPDIR)/libvsxu_plugin_la-vsxu.Plo
+diff --git a/modules/visualization/Makefile.am b/modules/visualization/Makefile.am
+index 10619e030..aafc97f87 100644
+--- a/modules/visualization/Makefile.am
++++ b/modules/visualization/Makefile.am
+@@ -1,16 +1,6 @@
+ visudir = $(pluginsdir)/visualization
+ visu_LTLIBRARIES =
+
+-libglspectrum_plugin_la_SOURCES = \
+- visualization/glspectrum.c \
+- visualization/visual/fft.c visualization/visual/fft.h \
+- visualization/visual/window.c visualization/visual/window.h \
+- visualization/visual/window_presets.h
+-libglspectrum_plugin_la_LIBADD = $(GL_LIBS) $(LIBM)
+-if HAVE_GL
+-visu_LTLIBRARIES += libglspectrum_plugin.la
+-endif
+-
+ libgoom_plugin_la_SOURCES = visualization/goom.c
+ libgoom_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) $(GOOM_CFLAGS)
+ libgoom_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(visudir)'
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3006-codec-omxil_core.h-fix-multiple-definition-of.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3006-codec-omxil_core.h-fix-multiple-definition-of.patch
new file mode 100644
index 0000000..e680c88
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3006-codec-omxil_core.h-fix-multiple-definition-of.patch
@@ -0,0 +1,43 @@
+From fd4d233757cc46cd89f68b45ec4b059940dd84ae Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 19:58:11 -0600
+Subject: [PATCH] codec: omxil_core.h fix multiple definition of
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+Issue occurs during compilation as
+* pf_enable_graphic_buffers
+* pf_get_graphic_buffer_usage
+* pf_get_hal_format
+
+appear to be defined multiple times as omxil_core.h
+is included in multiple files.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/codec/omxil/omxil_core.h | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/modules/codec/omxil/omxil_core.h b/modules/codec/omxil/omxil_core.h
+index ac3db510b..f6e42f5ed 100644
+--- a/modules/codec/omxil/omxil_core.h
++++ b/modules/codec/omxil/omxil_core.h
+@@ -34,9 +34,9 @@ extern OMX_ERRORTYPE (*pf_component_enum)(OMX_STRING, OMX_U32, OMX_U32);
+ extern OMX_ERRORTYPE (*pf_get_roles_of_component)(OMX_STRING, OMX_U32 *, OMX_U8 **);
+
+ /* Extra IOMX android functions. Can be NULL if we don't link with libiomx */
+-OMX_ERRORTYPE (*pf_enable_graphic_buffers)(OMX_HANDLETYPE, OMX_U32, OMX_BOOL);
+-OMX_ERRORTYPE (*pf_get_graphic_buffer_usage)(OMX_HANDLETYPE, OMX_U32, OMX_U32*);
+-OMX_ERRORTYPE (*pf_get_hal_format) (const char *, int *);
++extern OMX_ERRORTYPE (*pf_enable_graphic_buffers)(OMX_HANDLETYPE, OMX_U32, OMX_BOOL);
++extern OMX_ERRORTYPE (*pf_get_graphic_buffer_usage)(OMX_HANDLETYPE, OMX_U32, OMX_U32*);
++extern OMX_ERRORTYPE (*pf_get_hal_format) (const char *, int *);
+
+ int InitOmxCore(vlc_object_t *p_this);
+ void DeinitOmxCore(void);
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3007-remove-xorg-related-link-libs.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3007-remove-xorg-related-link-libs.patch
new file mode 100644
index 0000000..a0487fa
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3007-remove-xorg-related-link-libs.patch
@@ -0,0 +1,36 @@
+From 34e4f4dad923095989ccb0ab8efb883c592bdbfd Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 20:04:27 -0600
+Subject: [PATCH] remove xorg related link libs
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+If x11 isn't defined in DISTRO_FEATURES
+required xorg related libs are not included
+in recipe-sysroot resulting in compilation
+failure.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/hw/mmal/Makefile.am | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/modules/hw/mmal/Makefile.am b/modules/hw/mmal/Makefile.am
+index 4abe68e2e..86dad2c2d 100644
+--- a/modules/hw/mmal/Makefile.am
++++ b/modules/hw/mmal/Makefile.am
+@@ -8,7 +8,7 @@ libmmal_vout_plugin_la_SOURCES = vout.c mmal_cma.c mmal_picture.c subpic.c\
+ mmal_cma.h mmal_picture.h subpic.h transform_ops.h\
+ mmal_piccpy_neon.S
+ libmmal_vout_plugin_la_CFLAGS = $(AM_CFLAGS)
+-libmmal_vout_plugin_la_LDFLAGS = $(AM_LDFLAGS) -lm -lX11 -lXrandr
++libmmal_vout_plugin_la_LDFLAGS = $(AM_LDFLAGS) -lm
+ libmmal_vout_plugin_la_LIBADD = $(LIBS_mmal)
+ mmal_LTLIBRARIES = libmmal_vout_plugin.la
+
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3008-vo-Makefile.am-exclude-libgl_plugin.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3008-vo-Makefile.am-exclude-libgl_plugin.patch
new file mode 100644
index 0000000..8806c80
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3008-vo-Makefile.am-exclude-libgl_plugin.patch
@@ -0,0 +1,97 @@
+From 28917a258a4173af0abda0eef7faef5cbf95f123 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 21:28:48 -0600
+Subject: [PATCH] vo: Makefile.am exclude libgl_plugin
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+In the situation where opengl isn't included in
+DISTRO_FEATURES, we need to exclude the opengl
+vout plugin from being built.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/video_output/Makefile.am | 64 --------------------------------
+ 1 file changed, 64 deletions(-)
+
+diff --git a/modules/video_output/Makefile.am b/modules/video_output/Makefile.am
+index 78c06cfc4..14a330e68 100644
+--- a/modules/video_output/Makefile.am
++++ b/modules/video_output/Makefile.am
+@@ -57,70 +57,6 @@ if HAVE_TVOS
+ vout_LTLIBRARIES += libvout_ios_plugin.la libglconv_cvpx_plugin.la
+ endif
+
+-### OpenGL ###
+-libgles2_plugin_la_SOURCES = $(OPENGL_COMMONSOURCES) video_output/opengl/display.c
+-libgles2_plugin_la_CFLAGS = $(AM_CFLAGS) $(GLES2_CFLAGS) -DUSE_OPENGL_ES2 $(OPENGL_COMMONCLFAGS)
+-libgles2_plugin_la_LIBADD = $(GLES2_LIBS) $(LIBM) $(OPENGL_COMMONLIBS)
+-libgles2_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(voutdir)'
+-
+-EXTRA_LTLIBRARIES += libgles2_plugin.la
+-vout_LTLIBRARIES += $(LTLIBgles2)
+-
+-libgl_plugin_la_SOURCES = $(OPENGL_COMMONSOURCES) video_output/opengl/display.c
+-libgl_plugin_la_CFLAGS = $(AM_CFLAGS) $(GL_CFLAGS) $(OPENGL_COMMONCLFAGS)
+-libgl_plugin_la_LIBADD = $(LIBM) $(OPENGL_COMMONLIBS)
+-if HAVE_WIN32
+-libgl_plugin_la_CFLAGS += -DHAVE_GL_CORE_SYMBOLS
+-libgl_plugin_la_LIBADD += $(GL_LIBS)
+-endif
+-
+-libglconv_vaapi_wl_plugin_la_SOURCES = video_output/opengl/converter_vaapi.c \
+- video_output/opengl/converter.h \
+- hw/vaapi/vlc_vaapi.c hw/vaapi/vlc_vaapi.h
+-libglconv_vaapi_wl_plugin_la_CFLAGS = $(AM_CFLAGS) $(GL_CFLAGS) -DHAVE_VA_WL $(LIBVA_WL_CFLAGS)
+-libglconv_vaapi_wl_plugin_la_LIBADD = $(LIBVA_LIBS) $(LIBVA_EGL_LIBS) \
+- $(LIBVA_WL_LIBS)
+-
+-libglconv_vaapi_x11_plugin_la_SOURCES = $(libglconv_vaapi_wl_plugin_la_SOURCES)
+-libglconv_vaapi_x11_plugin_la_CFLAGS = $(AM_CFLAGS) -DHAVE_VA_X11
+-libglconv_vaapi_x11_plugin_la_LIBADD = $(LIBVA_LIBS) $(LIBVA_EGL_LIBS) \
+- $(LIBVA_X11_LIBS) $(X_LIBS) $(X_PRE_LIBS) -lX11
+-
+-libglconv_vaapi_drm_plugin_la_SOURCES = $(libglconv_vaapi_wl_plugin_la_SOURCES)
+-libglconv_vaapi_drm_plugin_la_CFLAGS = $(AM_CFLAGS) -DHAVE_VA_DRM
+-libglconv_vaapi_drm_plugin_la_LIBADD = $(LIBVA_LIBS) $(LIBVA_EGL_LIBS) \
+- $(LIBVA_DRM_LIBS)
+-
+-libglconv_vdpau_plugin_la_SOURCES = video_output/opengl/converter_vdpau.c \
+- video_output/opengl/converter.h hw/vdpau/vlc_vdpau.h
+-libglconv_vdpau_plugin_la_CFLAGS = $(AM_CFLAGS) $(VDPAU_CFLAGS)
+-libglconv_vdpau_plugin_la_LIBADD = $(LIBDL) libvlc_vdpau.la $(X_LIBS) $(X_PRE_LIBS) -lX11
+-
+-if HAVE_GL
+-vout_LTLIBRARIES += libgl_plugin.la
+-if HAVE_EGL
+-if HAVE_VAAPI
+-if HAVE_WAYLAND_EGL
+-if HAVE_VAAPI_WL
+-vout_LTLIBRARIES += libglconv_vaapi_wl_plugin.la
+-endif
+-endif
+-if HAVE_XCB
+-if HAVE_VAAPI_X11
+-vout_LTLIBRARIES += libglconv_vaapi_x11_plugin.la
+-endif
+-endif
+-if HAVE_VAAPI_DRM
+-vout_LTLIBRARIES += libglconv_vaapi_drm_plugin.la
+-endif
+-endif
+-endif # HAVE_EGL
+-
+-if HAVE_VDPAU
+-vout_LTLIBRARIES += libglconv_vdpau_plugin.la
+-endif
+-endif # HAVE_GL
+-
+ ### XCB ###
+ libvlc_xcb_events_la_SOURCES = \
+ video_output/xcb/events.c video_output/xcb/events.h
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch
new file mode 100644
index 0000000..0f28199
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch
@@ -0,0 +1,59 @@
+From 35276c4b02b9114436108e74727d192f1e21f239 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 23:31:33 -0600
+Subject: [PATCH] vo: converter_vaapi Fix EGL macro undeclared
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+Fixes compiler issues related to EGL macro constant/enum value type
+not being defined
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/video_output/opengl/converter_vaapi.c | 27 +++++++++++++++++++
+ 1 file changed, 27 insertions(+)
+
+diff --git a/modules/video_output/opengl/converter_vaapi.c b/modules/video_output/opengl/converter_vaapi.c
+index cd842f711..59245fe4c 100644
+--- a/modules/video_output/opengl/converter_vaapi.c
++++ b/modules/video_output/opengl/converter_vaapi.c
+@@ -55,6 +55,33 @@
+
+ #define DRM_FORMAT_MOD_INVALID fourcc_mod_code(NONE, DRM_FORMAT_RESERVED)
+
++#define EGL_LINUX_DMA_BUF_EXT 0x3270
++#define EGL_LINUX_DRM_FOURCC_EXT 0x3271
++#define EGL_DMA_BUF_PLANE0_FD_EXT 0x3272
++#define EGL_DMA_BUF_PLANE0_OFFSET_EXT 0x3273
++#define EGL_DMA_BUF_PLANE0_PITCH_EXT 0x3274
++#define EGL_DMA_BUF_PLANE1_FD_EXT 0x3275
++#define EGL_DMA_BUF_PLANE1_OFFSET_EXT 0x3276
++#define EGL_DMA_BUF_PLANE1_PITCH_EXT 0x3277
++#define EGL_DMA_BUF_PLANE2_FD_EXT 0x3278
++#define EGL_DMA_BUF_PLANE2_OFFSET_EXT 0x3279
++#define EGL_DMA_BUF_PLANE2_PITCH_EXT 0x327A
++#define EGL_YUV_COLOR_SPACE_HINT_EXT 0x327B
++#define EGL_SAMPLE_RANGE_HINT_EXT 0x327C
++#define EGL_YUV_CHROMA_HORIZONTAL_SITING_HINT_EXT 0x327D
++#define EGL_YUV_CHROMA_VERTICAL_SITING_HINT_EXT 0x327E
++#define EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT 0x3443
++#define EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT 0x3444
++#define EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT 0x3445
++#define EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT 0x3446
++#define EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT 0x3447
++#define EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT 0x3448
++#define EGL_DMA_BUF_PLANE3_FD_EXT 0x3440
++#define EGL_DMA_BUF_PLANE3_OFFSET_EXT 0x3441
++#define EGL_DMA_BUF_PLANE3_PITCH_EXT 0x3442
++#define EGL_DMA_BUF_PLANE3_MODIFIER_LO_EXT 0x3449
++#define EGL_DMA_BUF_PLANE3_MODIFIER_HI_EXT 0x344A
++
+ struct priv
+ {
+ struct vlc_vaapi_instance *vainst;
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch
new file mode 100644
index 0000000..acfb39a
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch
@@ -0,0 +1,59 @@
+From 4caba7560aec54f6d944accd1a8d216e8d9b1d92 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Tue, 14 Nov 2023 20:17:11 -0500
+Subject: [PATCH] po: Fix typos in oc.po for gettext compatibility
+
+Upstream-Status: Inappropriate
+
+Was moved upstream, but upstream patch couldn't be applied.
+
+https://code.videolan.org/videolan/vlc/-/commit/9d67e20c2edd25251b46d1780a7973b44ac5e5ba
+
+gettext-0.22 became stricter and started to validate format strings. Fix
+the typos.
+
+Bug: https://bugs.gentoo.org/909015
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ po/oc.po | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/po/oc.po b/po/oc.po
+index 86f2ed8a1..ce68c581f 100644
+--- a/po/oc.po
++++ b/po/oc.po
+@@ -5298,18 +5298,18 @@ msgstr "Comanda+"
+ #: src/misc/update.c:482
+ #, c-format
+ msgid "%.1f GiB"
+-msgstr "%.lf Gio"
++msgstr "%.1f Gio"
+
+ #: src/misc/update.c:484
+ #, c-format
+ msgid "%.1f MiB"
+-msgstr "%.lf Mio"
++msgstr "%.1f Mio"
+
+ #: src/misc/update.c:486 modules/gui/macosx/VLCPlaylistInfo.m:138
+ #: modules/gui/macosx/VLCPlaylistInfo.m:140
+ #, c-format
+ msgid "%.1f KiB"
+-msgstr "%.lf Kio"
++msgstr "%.1f Kio"
+
+ #: src/misc/update.c:488
+ #, c-format
+@@ -33071,7 +33071,7 @@ msgstr "Lista del gestionari de mèdias"
+
+ #, fuzzy
+ #~ msgid "%.1f kB"
+-#~ msgstr "%.lf Gio"
++#~ msgstr "%.1f Gio"
+
+ #, fuzzy
+ #~ msgid "Speed"
+--
+2.34.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/rpidistro-vlc_3.0.17.bb b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/rpidistro-vlc_3.0.17.bb
new file mode 100644
index 0000000..2007201
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/rpidistro-vlc_3.0.17.bb
@@ -0,0 +1,165 @@
+DESCRIPTION = "Video player and streamer - davinci edition"
+HOMEPAGE = "http://www.videolan.org"
+SECTION = "multimedia"
+
+LICENSE = "GPL-2.0-only"
+LIC_FILES_CHKSUM = "file://COPYING;md5=b234ee4d69f5fce4486a80fdaf4a4263"
+
+SRC_URI = "\
+ git://git@github.com/RPi-Distro/vlc;protocol=https;branch=buster-rpt \
+ file://0001-configure-fix-linking-on-RISC-V-ISA.patch \
+ file://0002-Revert-configure-Require-libmodplug-0.8.9.patch \
+ file://0003-CVE-2022-41325.patch \
+ file://0004-mmal_20.patch \
+ file://0005-mmal_exit_fix.patch \
+ file://0006-mmal_chain.patch \
+ file://0007-armv6.patch \
+ file://0008-configure-Disable-incompatible-function-pointer-type.patch \
+ file://0009-demux-dash-include-cstdint-needed-for-uint64_t.patch \
+ file://2001-fix-luaL-checkint.patch \
+ file://2002-use-vorbisidec.patch \
+ file://3001-configure.ac-setup-for-OE-usage.patch \
+ file://3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch \
+ file://3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch \
+ file://3004-use-GLESv2-headers-over-GL-headers.patch \
+ file://3005-modules-remove-glspectrum-usage.patch \
+ file://3006-codec-omxil_core.h-fix-multiple-definition-of.patch \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'x11', '', 'file://3007-remove-xorg-related-link-libs.patch', d)} \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', '', 'file://3008-vo-Makefile.am-exclude-libgl_plugin.patch', d)} \
+ file://3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch \
+ file://3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch \
+ "
+
+SRCREV = "b276eb0d7bc3213363e97dbb681ef7c927be6c73"
+
+S = "${WORKDIR}/git"
+
+PROVIDES = "vlc"
+RPROVIDES:${PN} = "${PROVIDES}"
+DEPENDS = "coreutils-native fribidi libtool libgcrypt libgcrypt-native \
+ dbus libxml2 gnutls tremor faad2 ffmpeg flac alsa-lib libidn \
+ jpeg xz libmodplug mpeg2dec libmtp libopus orc libsamplerate0 \
+ avahi libusb1 schroedinger taglib tiff"
+
+inherit autotools gettext pkgconfig mime-xdg
+
+export BUILDCC = "${BUILD_CC} -std=c11"
+EXTRA_OECONF = "\
+ --enable-run-as-root \
+ --enable-xvideo \
+ --disable-lua \
+ --disable-screen \
+ --disable-caca \
+ --enable-vlm \
+ --enable-tremor \
+ --disable-aa \
+ --disable-faad \
+ --enable-dbus \
+ --without-contrib \
+ --without-kde-solid \
+ --enable-realrtsp \
+ --disable-libtar \
+ --enable-avcodec \
+ --disable-css \
+ "
+
+PACKAGECONFIG ?= "\
+ ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'x11', '', d)} \
+ ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mmal', d)} \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', 'gles2', '', d)} \
+ ${@bb.utils.contains_any('DISTRO_FEATURES', 'x11', 'notify', '', d)} \
+ live555 dv1394 fontconfig fluidsynth freetype png udev \
+ x264 alsa harfbuzz jack neon fribidi dvbpsi a52 v4l2 \
+ "
+
+PACKAGECONFIG[mmal] = "--enable-omxil --enable-omxil-vout --enable-rpi-omxil --enable-mmal --enable-mmal-avcodec,,userland"
+PACKAGECONFIG[x264] = "--enable-x264,--disable-x264,x264"
+PACKAGECONFIG[mad] = "--enable-mad,--disable-mad,libmad"
+PACKAGECONFIG[a52] = "--enable-a52,--disable-a52,liba52"
+PACKAGECONFIG[jack] = "--enable-jack,--disable-jack,jack"
+PACKAGECONFIG[live555] = "--enable-live555 LIVE555_PREFIX=${STAGING_DIR_HOST}${prefix},--disable-live555,live555"
+PACKAGECONFIG[libass] = "--enable-libass,--disable-libass,libass"
+PACKAGECONFIG[postproc] = "--enable-postproc,--disable-postproc,libpostproc"
+PACKAGECONFIG[libva] = "--enable-libva,--disable-libva,libva"
+#PACKAGECONFIG[opencv] = "--enable-opencv,--disable-opencv,opencv"
+PACKAGECONFIG[speex] = "--enable-speex,--disable-speex,speex"
+PACKAGECONFIG[gstreamer] = "--enable-gst-decode,--disable-gst-decode,gstreamer1.0 gstreamer1.0-plugins-base gstreamer1.0-plugins-bad"
+PACKAGECONFIG[vpx] = "--enable-vpx,--disable-vpx, libvpx"
+#PACKAGECONFIG[freerdp] = "--enable-freerdp,--disable-freerdp, freerdp"
+PACKAGECONFIG[dvbpsi] = "--enable-dvbpsi,--disable-dvbpsi, libdvbpsi"
+#PACKAGECONFIG[samba] = "--enable-smbclient,--disable-smbclient, samba"
+PACKAGECONFIG[upnp] = "--enable-upnp,--disable-upnp,libupnp"
+PACKAGECONFIG[dvdnav] = "--enable-dvdnav,--disable-dvdnav,libdvdnav libdvdcss"
+PACKAGECONFIG[sftp] = "--enable-sftp,--disable-sftp,libssh2"
+PACKAGECONFIG[vorbis] = "--enable-vorbis,--disable-vorbis,libvorbis libogg"
+PACKAGECONFIG[ogg] = "--enable-ogg,--disable-ogg,libvorbis libogg"
+PACKAGECONFIG[dc1394] = "--enable-dc1394,--disable-dc1394,libdc1394"
+PACKAGECONFIG[dv1394] = "--enable-dv1394,--disable-dv1394,libraw1394 libavc1394"
+PACKAGECONFIG[svg] = "--enable-svg,--disable-svg,librsvg"
+PACKAGECONFIG[svgdec] = "--enable-svgdec,--disable-svgdec,librsvg cairo"
+PACKAGECONFIG[notify] = "--enable-notify,--disable-notify, libnotify gtk+3"
+PACKAGECONFIG[fontconfig] = "--enable-fontconfig,--disable-fontconfig, fontconfig"
+PACKAGECONFIG[freetype] = "--enable-freetype,--disable-freetype, freetype"
+#PACKAGECONFIG[dvdread] = "--enable-dvdread,--disable-dvdread, libdvdread libdvdcss"
+PACKAGECONFIG[vnc] = "--enable-vnc,--disable-vnc, libvncserver"
+PACKAGECONFIG[x11] = "--with-x --enable-xcb,--without-x --disable-xcb, xcb-util-keysyms libxpm libxinerama"
+PACKAGECONFIG[png] = "--enable-png,--disable-png,libpng"
+#PACKAGECONFIG[vdpau] = "--enable-vdpau,--disable-vdpau,libvdpau"
+#PACKAGECONFIG[wayland] = "--enable-wayland,--disable-wayland,wayland wayland-native"
+PACKAGECONFIG[gles2] = "--enable-gles2,--disable-gles2,virtual/libgles2"
+#PACKAGECONFIG[dca] = "--enable-dca,--disable-dca,libdca"
+PACKAGECONFIG[fribidi] = "--enable-fribidi,,fribidi"
+PACKAGECONFIG[gnutls] = "--enable-gnutls,,gnutls"
+PACKAGECONFIG[fluidsynth] = "--enable-fluidsynth,,fluidsynth"
+PACKAGECONFIG[harfbuzz] = "--enable-harfbuzz,--disable-harfbuzz,harfbuzz"
+PACKAGECONFIG[udev] = "--enable-udev,--disable-udev,udev"
+PACKAGECONFIG[neon] = "--enable-neon,--disable-neon,"
+PACKAGECONFIG[opus] = "--enable-opus,--disable-opus,libopus libogg"
+PACKAGECONFIG[ncurses] = "--enable-ncurses,--disable-ncurses,ncurses"
+PACKAGECONFIG[alsa] = "--enable-alsa,--disable-alsa,alsa-lib"
+PACKAGECONFIG[pulseaudio] = "--enable-pulse,--disable-pulse,pulseaudio"
+PACKAGECONFIG[sdl-image] = "--enable-sdl-image,,libsdl-image"
+PACKAGECONFIG[v4l2] = "--enable-v4l2,,v4l-utils"
+
+TARGET_CFLAGS:append = " -I${STAGING_INCDIR}/drm"
+TARGET_LDFLAGS:append = " ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', '-lGLESv2', '', d)}"
+
+# Ensures the --enable-mmal-avcodec flag is available for usage
+do_configure:prepend() {
+ olddir=`pwd`
+ cd ${S}
+ ./bootstrap
+ cd $olddir
+}
+
+# This recipe packages vlc as a library as well, so qt4 dependencies
+# can be avoided when only the library is installed.
+PACKAGES =+ "libvlc"
+
+LEAD_SONAME_libvlc = "libvlc.so.5"
+FILES:libvlc = "${libdir}/lib*.so.*"
+
+FILES:${PN} += "\
+ ${bindir}/vlc \
+ ${libdir}/vlc \
+ ${datadir}/applications \
+ ${datadir}/vlc \
+ ${datadir}/icons \
+ ${datadir}/metainfo/vlc.appdata.xml \
+ "
+
+FILES:${PN}-dbg += "\
+ ${libdir}/vlc/*/.debug \
+ ${libdir}/vlc/plugins/*/.debug \
+ "
+
+FILES:${PN}-staticdev += "\
+ ${libdir}/vlc/plugins/*/*.a \
+ ${libdir}/vlc/libcompat.a \
+ "
+
+# Only enable it for rpi class of machines
+COMPATIBLE_HOST = "null"
+COMPATIBLE_HOST:rpi = "(.*)"
+
+INSANE_SKIP:${PN} = "dev-so"
diff --git a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-blinka_6.2.2.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-blinka_6.2.2.bb
index 9e1e357..9e1e357 100644
--- a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-blinka_6.2.2.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-blinka_6.2.2.bb
diff --git a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-busdevice_5.0.5.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-busdevice_5.0.5.bb
index 93491d4..93491d4 100644
--- a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-busdevice_5.0.5.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-busdevice_5.0.5.bb
diff --git a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-motor_3.2.6.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motor_3.2.6.bb
index 3233c8f..3233c8f 100644
--- a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-motor_3.2.6.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motor_3.2.6.bb
diff --git a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-motorkit_1.6.1.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motorkit_1.6.1.bb
index 39fe76a..39fe76a 100644
--- a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-motorkit_1.6.1.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motorkit_1.6.1.bb
diff --git a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-pca9685_3.3.4.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-pca9685_3.3.4.bb
index f7f0ff1..f7f0ff1 100644
--- a/dynamic-layers/openembedded-layer/recipes-devtools/python3-adafruit-circuitpython-pca9685_3.3.4.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-pca9685_3.3.4.bb
diff --git a/img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.png b/img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.png
new file mode 100644
index 0000000..7d009bb
--- /dev/null
+++ b/img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.png
Binary files differ
diff --git a/img/balena.png b/img/balena.png
index a872ce9..324c35a 100644
--- a/img/balena.png
+++ b/img/balena.png
Binary files differ
diff --git a/kas-poky-rpi.yml b/kas-poky-rpi.yml
index 2ab4770..ce59eca 100644
--- a/kas-poky-rpi.yml
+++ b/kas-poky-rpi.yml
@@ -55,7 +55,7 @@ local_conf_header:
STOPTASKS,${DL_DIR},1G,100K \
STOPTASKS,${SSTATE_DIR},1G,100K \
STOPTASKS,/tmp,100M,100K \
- ABORT,${TMPDIR},100M,1K \
- ABORT,${DL_DIR},100M,1K \
- ABORT,${SSTATE_DIR},100M,1K \
- ABORT,/tmp,10M,1K"
+ HALT,${TMPDIR},100M,1K \
+ HALT,${DL_DIR},100M,1K \
+ HALT,${SSTATE_DIR},100M,1K \
+ HALT,/tmp,10M,1K"
diff --git a/recipes-bsp/bootfiles/rpi-bootfiles.bb b/recipes-bsp/bootfiles/rpi-bootfiles.bb
index f1248ee..b04f24b 100644
--- a/recipes-bsp/bootfiles/rpi-bootfiles.bb
+++ b/recipes-bsp/bootfiles/rpi-bootfiles.bb
@@ -5,7 +5,16 @@ LIC_FILES_CHKSUM = "file://LICENCE.broadcom;md5=c403841ff2837657b2ed8e5bb474ac8d
inherit deploy nopackages
-include recipes-bsp/common/raspberrypi-firmware.inc
+RPIFW_DATE ?= "20240319"
+SRCREV = "9f24f4bc2bdd07ffd158cfbb4bce88a2efc4c1f5"
+SHORTREV = "${@d.getVar("SRCREV", False).__str__()[:7]}"
+RPIFW_SRC_URI ?= "https://api.github.com/repos/raspberrypi/firmware/tarball/9f24f4bc2bdd07ffd158cfbb4bce88a2efc4c1f5;downloadfilename=raspberrypi-firmware-${SHORTREV}.tar.gz"
+RPIFW_S ?= "${WORKDIR}/raspberrypi-firmware-${SHORTREV}"
+
+SRC_URI = "${RPIFW_SRC_URI}"
+SRC_URI[sha256sum] = "4b436f8946b139c6a1202375ef55d4848e3bcd8c1a9cb47000e06d7ecec828f7"
+
+PV = "${RPIFW_DATE}"
INHIBIT_DEFAULT_DEPS = "1"
diff --git a/recipes-bsp/bootfiles/rpi-cmdline.bb b/recipes-bsp/bootfiles/rpi-cmdline.bb
index 413ca4d..a22f50d 100644
--- a/recipes-bsp/bootfiles/rpi-cmdline.bb
+++ b/recipes-bsp/bootfiles/rpi-cmdline.bb
@@ -9,13 +9,11 @@ inherit deploy nopackages
CMDLINE_DWC_OTG ?= "dwc_otg.lpm_enable=0"
CMDLINE_ROOT_FSTYPE ?= "rootfstype=ext4"
-CMDLINE_ROOTFS ?= "root=/dev/mmcblk0p2 ${CMDLINE_ROOT_FSTYPE} rootwait"
+CMDLINE_ROOT_PARTITION ?= "/dev/mmcblk0p2"
-CMDLINE_SERIAL ?= "${@oe.utils.conditional("ENABLE_UART", "1", "console=serial0,115200", "", d)}"
-
-CMDLINE_CMA ?= "${@oe.utils.conditional("RASPBERRYPI_CAMERA_V2", "1", "cma=64M", "", d)}"
+CMDLINE_ROOTFS ?= "root=${CMDLINE_ROOT_PARTITION} ${CMDLINE_ROOT_FSTYPE} rootwait"
-CMDLINE_CMA ?= "${@oe.utils.conditional("RASPBERRYPI_HD_CAMERA", "1", "cma=64M", "", d)}"
+CMDLINE_SERIAL ?= "${@oe.utils.conditional("ENABLE_UART", "1", "console=serial0,115200", "", d)}"
CMDLINE_PITFT ?= "${@bb.utils.contains("MACHINE_FEATURES", "pitft", "fbcon=map:10 fbcon=font:VGA8x8", "", d)}"
@@ -62,7 +60,7 @@ CMDLINE = " \
"
do_compile() {
- echo "${@' '.join('${CMDLINE}'.split())}" > "${WORKDIR}/cmdline.txt"
+ echo "${@' '.join(d.getVar('CMDLINE').split())}" > "${WORKDIR}/cmdline.txt"
}
do_deploy() {
diff --git a/recipes-bsp/bootfiles/rpi-config/0001-config.txt-reintroduce-start_x.patch b/recipes-bsp/bootfiles/rpi-config/0001-config.txt-reintroduce-start_x.patch
new file mode 100644
index 0000000..c6c51c9
--- /dev/null
+++ b/recipes-bsp/bootfiles/rpi-config/0001-config.txt-reintroduce-start_x.patch
@@ -0,0 +1,55 @@
+From ce27f7e22b2cd7453a425e08780a338a71301961 Mon Sep 17 00:00:00 2001
+From: Leon Anavi <leon.anavi@konsulko.com>
+Date: Mon, 20 Nov 2023 15:19:15 +0200
+Subject: [PATCH] config.txt: reintroduce start_x
+
+Reintroduce configuration "start_x". Based on the experience with
+Yocto/OpenEmbedded layer meta-raspberrypi, it has been observed
+that Raspberry Pi 4B 4GB may fail to enable the camera if
+"start_x=1" is at the end of the file. Therefore, "start_x=1"
+is expected in config.txt template and it has been set to replace
+the original occurrence, which is at the middle of the file.
+Also update revision and date stamp.
+
+GitHub pull request: https://github.com/Evilpaul/RPi-config/pull/8
+
+Upstream-Status: Submitted
+
+Signed-off-by: Leon Anavi <leon.anavi@konsulko.com>
+---
+ config.txt | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/config.txt b/config.txt
+index 1cf7b29..e28ed02 100644
+--- a/config.txt
++++ b/config.txt
+@@ -1,7 +1,7 @@
+ ################################################################################
+ ## Raspberry Pi Configuration Settings
+ ##
+-## Revision 17, 2021/08/15
++## Revision 18, 2023/11/20
+ ##
+ ## Details taken from the eLinux wiki and official Raspberry Pi documentation.
+ ## For up-to-date information please refer to links below.
+@@ -760,6 +760,16 @@
+ ## Camera Settings
+ ################################################################################
+
++## start_x
++## Set to "1" to enable the camera module.
++##
++## Enabling the camera requires gpu_mem option to be specified with a value
++## of at least 128.
++##
++## Default 0
++##
++#start_x=0
++
+ ## disable_camera_led
+ ## Turn off the red camera led when recording video or taking a still
+ ## picture.
+--
+2.39.2
+
diff --git a/recipes-bsp/bootfiles/rpi-config_git.bb b/recipes-bsp/bootfiles/rpi-config_git.bb
index 9d007e0..b91668f 100644
--- a/recipes-bsp/bootfiles/rpi-config_git.bb
+++ b/recipes-bsp/bootfiles/rpi-config_git.bb
@@ -7,8 +7,9 @@ LIC_FILES_CHKSUM = "file://${COMMON_LICENSE_DIR}/MIT;md5=0835ade698e0bcf8506ecda
COMPATIBLE_MACHINE = "^rpi$"
-SRCREV = "648ffc470824c43eb0d16c485f4c24816b32cd6f"
+SRCREV = "6ac2d832c6c3b208e2669f50ec1abf2c20cb7ff4"
SRC_URI = "git://github.com/Evilpaul/RPi-config.git;protocol=https;branch=master \
+ file://0001-config.txt-reintroduce-start_x.patch \
"
S = "${WORKDIR}/git"
@@ -29,6 +30,8 @@ GPIO_IR ?= "18"
GPIO_IR_TX ?= "17"
CAN_OSCILLATOR ?= "16000000"
+CAN0_INTERRUPT_PIN ?= "25"
+CAN1_INTERRUPT_PIN ?= "24"
ENABLE_UART ??= ""
@@ -178,13 +181,27 @@ do_deploy() {
fi
# UART support
- if [ "${ENABLE_UART}" = "1" ] || [ "${ENABLE_UART}" = "0" ] ; then
+ if [ "${ENABLE_UART}" = "1" ] || [ "${ENABLE_UART}" = "0" ]; then
echo "# Enable UART" >>$CONFIG
echo "enable_uart=${ENABLE_UART}" >>$CONFIG
elif [ -n "${ENABLE_UART}" ]; then
bbfatal "Invalid value for ENABLE_UART [${ENABLE_UART}]. The value for ENABLE_UART can be 0 or 1."
fi
+ # U-Boot requires "enable_uart=1" for various boards to operate correctly
+ # cf https://source.denx.de/u-boot/u-boot/-/blob/v2023.04/arch/arm/mach-bcm283x/Kconfig?ref_type=tags#L65
+ if [ "${RPI_USE_U_BOOT}" = "1" ] && [ "${ENABLE_UART}" != "1" ]; then
+ case "${UBOOT_MACHINE}" in
+ rpi_0_w_defconfig|rpi_3_32b_config|rpi_4_32b_config|rpi_arm64_config)
+ if [ "${ENABLE_UART}" = "0" ]; then
+ bbfatal "Invalid configuration: RPI_USE_U_BOOT requires to enable the UART in config.txt for ${MACHINE}"
+ fi
+ echo "# U-Boot requires UART" >>$CONFIG
+ echo "enable_uart=1" >>$CONFIG
+ ;;
+ esac
+ fi
+
# Infrared support
if [ "${ENABLE_IR}" = "1" ]; then
echo "# Enable infrared" >>$CONFIG
@@ -210,6 +227,12 @@ do_deploy() {
# echo "dtoverlay=imx477" >> $CONFIG
#fi
+ # Choose Camera Sensor to be used, default imx708 sensor
+ if [ "${RASPBERRYPI_CAMERA_V3}" = "1" ]; then
+ echo "# Enable Sony RaspberryPi Camera(imx708)" >> $CONFIG
+ echo "dtoverlay=imx708" >> $CONFIG
+ fi
+
# Waveshare "C" 1024x600 7" Rev2.1 IPS capacitive touch (http://www.waveshare.com/7inch-HDMI-LCD-C.htm)
if [ "${WAVESHARE_1024X600_C_2_1}" = "1" ]; then
echo "# Waveshare \"C\" 1024x600 7\" Rev2.1 IPS capacitive touch screen" >> $CONFIG
@@ -247,12 +270,12 @@ do_deploy() {
# ENABLE DUAL CAN
if [ "${ENABLE_DUAL_CAN}" = "1" ]; then
echo "# Enable DUAL CAN" >>$CONFIG
- echo "dtoverlay=mcp2515-can0,oscillator=${CAN_OSCILLATOR},interrupt=25" >>$CONFIG
- echo "dtoverlay=mcp2515-can1,oscillator=${CAN_OSCILLATOR},interrupt=24" >>$CONFIG
+ echo "dtoverlay=mcp2515-can0,oscillator=${CAN_OSCILLATOR},interrupt=${CAN0_INTERRUPT_PIN}" >>$CONFIG
+ echo "dtoverlay=mcp2515-can1,oscillator=${CAN_OSCILLATOR},interrupt=${CAN1_INTERRUPT_PIN}" >>$CONFIG
# ENABLE CAN
elif [ "${ENABLE_CAN}" = "1" ]; then
echo "# Enable CAN" >>$CONFIG
- echo "dtoverlay=mcp2515-can0,oscillator=${CAN_OSCILLATOR},interrupt=25" >>$CONFIG
+ echo "dtoverlay=mcp2515-can0,oscillator=${CAN_OSCILLATOR},interrupt=${CAN0_INTERRUPT_PIN}" >>$CONFIG
fi
@@ -296,6 +319,13 @@ do_deploy() {
echo "# Enable One-Wire Interface" >> $CONFIG
echo "dtoverlay=w1-gpio" >> $CONFIG
fi
+
+ # Reduce config.txt file size to avoid corruption and
+ # to boot Raspberry Pi 5 successfully. The issue has
+ # been reported to related projects:
+ # https://github.com/raspberrypi/firmware/issues/1848
+ # https://github.com/Evilpaul/RPi-config/issues/9
+ sed -i '/^##/d' $CONFIG
}
do_deploy:append:raspberrypi3-64() {
diff --git a/recipes-bsp/common/raspberrypi-firmware.inc b/recipes-bsp/common/raspberrypi-firmware.inc
index e5974e4..311da21 100644
--- a/recipes-bsp/common/raspberrypi-firmware.inc
+++ b/recipes-bsp/common/raspberrypi-firmware.inc
@@ -1,9 +1,9 @@
-RPIFW_DATE ?= "20220331"
+RPIFW_DATE ?= "20230509~buster"
RPIFW_SRC_URI ?= "https://archive.raspberrypi.com/debian/pool/main/r/raspberrypi-firmware/raspberrypi-firmware_1.${RPIFW_DATE}.orig.tar.xz"
RPIFW_S ?= "${WORKDIR}/raspberrypi-firmware-1.${RPIFW_DATE}"
SRC_URI = "${RPIFW_SRC_URI}"
-SRC_URI[sha256sum] = "8758f10797bd52a7373cc5b39bd46d0d9f882d501ccb9535a72a3fe8a8d329c3"
+SRC_URI[sha256sum] = "1d9eb83111826b708f461101766fd2000d45f1c171ad573936d000f623ca8098"
PV = "${RPIFW_DATE}"
diff --git a/recipes-bsp/common/raspberrypi-tools.inc b/recipes-bsp/common/raspberrypi-tools.inc
index dc372ab..c88e7e4 100644
--- a/recipes-bsp/common/raspberrypi-tools.inc
+++ b/recipes-bsp/common/raspberrypi-tools.inc
@@ -1,5 +1,5 @@
-RPITOOLS_DATE ?= "20211101"
-SRCREV ?= "13474ee775d0c5ec8a7da4fb0a9fa84187abfc87"
+RPITOOLS_DATE ?= "20220711"
+SRCREV ?= "439b6198a9b340de5998dd14a26a0d9d38a6bcac"
RPITOOLS_SRC_URI ?= "git://github.com/raspberrypi/tools;protocol=https;branch=master"
RPITOOLS_S ?= "${WORKDIR}/git"
diff --git a/recipes-bsp/rpi-eeprom/rpi-eeprom_git.bb b/recipes-bsp/rpi-eeprom/rpi-eeprom_git.bb
new file mode 100644
index 0000000..2e10253
--- /dev/null
+++ b/recipes-bsp/rpi-eeprom/rpi-eeprom_git.bb
@@ -0,0 +1,69 @@
+SUMMARY = "Installation scripts and binaries for the Raspberry Pi 4 EEPROM"
+DESCRIPTION = "This repository contains the rpi4 bootloader and scripts \
+for updating it in the spi eeprom"
+LICENSE = "BSD-3-Clause & Broadcom-RPi"
+LIC_FILES_CHKSUM = "file://LICENSE;md5=f546ed4f47e9d4c1fe954ecc9d3ef4f3"
+
+SRC_URI = " \
+ git://github.com/raspberrypi/rpi-eeprom.git;protocol=https;branch=master \
+"
+
+SRCREV = "36e58db5c2a2656e553441f4f48f32227809105d"
+PV = "v.2024.02.16-2712"
+
+S = "${WORKDIR}/git"
+
+RDEPENDS:${PN} += " \
+ coreutils \
+ python3 \
+ python3-pycryptodomex \
+ openssl \
+ xxd \
+ pciutils \
+"
+
+inherit python3native
+
+do_install() {
+ install -d ${D}${bindir}
+
+ # install executables
+ install -m 0755 ${S}/tools/vl805 ${D}${bindir}
+ install -m 0755 ${S}/rpi-eeprom-update ${D}${bindir}
+ install -m 0755 ${S}/rpi-eeprom-config ${D}${bindir}
+ install -m 0755 ${S}/rpi-eeprom-digest ${D}${bindir}
+
+ # copy firmware files
+ install -d ${D}${base_libdir}/firmware/raspberrypi/bootloader-2711/default
+ install -d ${D}${base_libdir}/firmware/raspberrypi/bootloader-2711/latest
+ install -d ${D}${base_libdir}/firmware/raspberrypi/bootloader-2712/default
+ install -d ${D}${base_libdir}/firmware/raspberrypi/bootloader-2712/latest
+
+ install -m 644 ${S}/firmware-2711/default/* ${D}${base_libdir}/firmware/raspberrypi/bootloader-2711/default
+ install -m 644 ${S}/firmware-2711/latest/* ${D}${base_libdir}/firmware/raspberrypi/bootloader-2711/latest
+ install -m 644 ${S}/firmware-2712/default/* ${D}${base_libdir}/firmware/raspberrypi/bootloader-2712/default
+ install -m 644 ${S}/firmware-2712/latest/* ${D}${base_libdir}/firmware/raspberrypi/bootloader-2712/latest
+
+ ln -s default ${D}${base_libdir}/firmware/raspberrypi/bootloader-2711/critical
+ ln -s latest ${D}${base_libdir}/firmware/raspberrypi/bootloader-2711/stable
+ ln -s latest ${D}${base_libdir}/firmware/raspberrypi/bootloader-2711/beta
+ ln -s default ${D}${base_libdir}/firmware/raspberrypi/bootloader-2712/critical
+ ln -s latest ${D}${base_libdir}/firmware/raspberrypi/bootloader-2712/stable
+ ln -s latest ${D}${base_libdir}/firmware/raspberrypi/bootloader-2712/beta
+
+ # copy default config
+ install -d ${D}${sysconfdir}/default
+ install -D ${S}/rpi-eeprom-update-default ${D}${sysconfdir}/default/rpi-eeprom-update
+}
+
+FILES:${PN} += "${base_libdir}/firmware/raspberrypi/bootloader-*"
+
+INHIBIT_PACKAGE_STRIP = "1"
+INHIBIT_PACKAGE_DEBUG_SPLIT = "1"
+
+# vl805 tool sources are not available (yet), as it comes as a precompiled
+# binary only. It has ARM architecture whereas target machine is Aarch64. We
+# need to disable arch check for it otherwise it cannot be packaged.
+QAPATHTEST[arch] = ""
+
+COMPATIBLE_MACHINE = "raspberrypi4|raspberrypi4-64|raspberrypi5"
diff --git a/recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in b/recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in
index 627d181..58fd86a 100644
--- a/recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in
+++ b/recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in
@@ -1,4 +1,4 @@
fdt addr ${fdt_addr} && fdt get value bootargs /chosen bootargs
-fatload mmc 0:1 ${kernel_addr_r} @@KERNEL_IMAGETYPE@@
-if test ! -e mmc 0:1 uboot.env; then saveenv; fi;
+fatload @@BOOT_MEDIA@@ 0:1 ${kernel_addr_r} @@KERNEL_IMAGETYPE@@
+if test ! -e @@BOOT_MEDIA@@ 0:1 uboot.env; then saveenv; fi;
@@KERNEL_BOOTCMD@@ ${kernel_addr_r} - ${fdt_addr}
diff --git a/recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb b/recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb
index 9108f71..1dff808 100644
--- a/recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb
+++ b/recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb
@@ -9,9 +9,12 @@ INHIBIT_DEFAULT_DEPS = "1"
SRC_URI = "file://boot.cmd.in"
+BOOT_MEDIA ?= "mmc"
+
do_compile() {
sed -e 's/@@KERNEL_IMAGETYPE@@/${KERNEL_IMAGETYPE}/' \
-e 's/@@KERNEL_BOOTCMD@@/${KERNEL_BOOTCMD}/' \
+ -e 's/@@BOOT_MEDIA@@/${BOOT_MEDIA}/' \
"${WORKDIR}/boot.cmd.in" > "${WORKDIR}/boot.cmd"
mkimage -A ${UBOOT_ARCH} -T script -C none -n "Boot script" -d "${WORKDIR}/boot.cmd" boot.scr
}
diff --git a/recipes-bsp/u-boot/files/0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch b/recipes-bsp/u-boot/files/0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch
index c375c40..9ea8f85 100644
--- a/recipes-bsp/u-boot/files/0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch
+++ b/recipes-bsp/u-boot/files/0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch
@@ -29,6 +29,8 @@ Signed-off-by: Mauro Salvini <m.salvini@koansoftware.com>
Cc: C?dric Schieli <cschieli@gmail.com>
Cc: Matthias Brugger <mbrugger@suse.com>
---
+Upstream-Status: Pending
+
board/raspberrypi/rpi/rpi.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/recipes-bsp/u-boot/u-boot_%.bbappend b/recipes-bsp/u-boot/u-boot_%.bbappend
index e50acf5..78b3e48 100644
--- a/recipes-bsp/u-boot/u-boot_%.bbappend
+++ b/recipes-bsp/u-boot/u-boot_%.bbappend
@@ -12,3 +12,6 @@ do_install:append:rpi () {
install -d ${D}${sysconfdir}
install -m 0644 ${WORKDIR}/fw_env.config ${D}${sysconfdir}/fw_env.config
}
+
+# Temporarily avoid Raspberry Pi 5 because U-Boot has not been ported yet
+COMPATIBLE_MACHINE:raspberrypi5 = "(-)"
diff --git a/recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch b/recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch
index 3bc02c4..b019743 100644
--- a/recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch
+++ b/recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch
@@ -1,17 +1,19 @@
-From b4f2b77472aeb967d3a7595e8a965785c7a37c87 Mon Sep 17 00:00:00 2001
+From 8e8321cd597d3d9d342a8a3533ad10751dde5885 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Tue, 16 Feb 2016 16:40:46 +0000
-Subject: [PATCH 1/4] bcm43xx: Add bcm43xx-3wire variant
+Subject: [PATCH] bcm43xx: Add bcm43xx-3wire variant
---
+Upstream-Status: Pending
+
tools/hciattach.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tools/hciattach.c b/tools/hciattach.c
-index 59a76a7..5861d33 100644
+index 276a4e56e..7d01d8b74 100644
--- a/tools/hciattach.c
+++ b/tools/hciattach.c
-@@ -1144,6 +1144,9 @@ struct uart_t uart[] = {
+@@ -1078,6 +1078,9 @@ struct uart_t uart[] = {
{ "bcm43xx", 0x0000, 0x0000, HCI_UART_H4, 115200, 3000000,
FLOW_CTL, DISABLE_PM, NULL, bcm43xx, NULL },
@@ -21,6 +23,3 @@ index 59a76a7..5861d33 100644
{ "ath3k", 0x0000, 0x0000, HCI_UART_ATH3K, 115200, 115200,
FLOW_CTL, DISABLE_PM, NULL, ath3k_ps, ath3k_pm },
---
-1.9.1
-
diff --git a/recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch b/recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch
index 5a0a434..6c13490 100644
--- a/recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch
+++ b/recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch
@@ -1,17 +1,20 @@
-From e145c9621f976063e5c573db1f2053d906f63427 Mon Sep 17 00:00:00 2001
+From 96e5e5eef04c6c4ae83d4d822a536cfa87605ae2 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Tue, 16 Feb 2016 16:39:09 +0000
-Subject: [PATCH 2/4] bcm43xx: The UART speed must be reset after the firmware download
+Subject: [PATCH] bcm43xx: The UART speed must be reset after the firmware
+ download
---
+Upstream-Status: Pending
+
tools/hciattach_bcm43xx.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/tools/hciattach_bcm43xx.c b/tools/hciattach_bcm43xx.c
-index 81f38cb..0b792e0 100644
+index b89fc1b50..de01a6aea 100644
--- a/tools/hciattach_bcm43xx.c
+++ b/tools/hciattach_bcm43xx.c
-@@ -366,11 +366,8 @@ int bcm43xx_init(int fd, int def_speed, int speed, struct termios *ti,
+@@ -350,11 +350,8 @@ int bcm43xx_init(int fd, int def_speed, int speed, struct termios *ti,
return -1;
if (bcm43xx_locate_patch(FIRMWARE_DIR, chip_name, fw_path)) {
@@ -24,7 +27,7 @@ index 81f38cb..0b792e0 100644
if (bcm43xx_load_firmware(fd, fw_path))
return -1;
-@@ -380,6 +377,7 @@ int bcm43xx_init(int fd, int def_speed, int speed, struct termios *ti,
+@@ -364,6 +361,7 @@ int bcm43xx_init(int fd, int def_speed, int speed, struct termios *ti,
return -1;
}
@@ -32,6 +35,3 @@ index 81f38cb..0b792e0 100644
if (bcm43xx_reset(fd))
return -1;
}
---
-1.9.1
-
diff --git a/recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch b/recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch
index f9f09eb..1529023 100644
--- a/recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch
+++ b/recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch
@@ -1,17 +1,19 @@
-From d41dc2046dd08d8c95197f677e224506f5b39bdd Mon Sep 17 00:00:00 2001
+From 05c3e145b5aa62e7e759932ea99f94d495b651c3 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Wed, 20 Jan 2016 16:00:37 +0000
-Subject: [PATCH 3/4] Increase firmware load timeout to 30s
+Subject: [PATCH] Increase firmware load timeout to 30s
---
+Upstream-Status: Pending
+
tools/hciattach.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/hciattach.c b/tools/hciattach.c
-index 5861d33..4141796 100644
+index 7d01d8b74..465bb17dd 100644
--- a/tools/hciattach.c
+++ b/tools/hciattach.c
-@@ -1293,7 +1293,7 @@ int main(int argc, char *argv[])
+@@ -1227,7 +1227,7 @@ int main(int argc, char *argv[])
{
struct uart_t *u = NULL;
int detach, printpid, raw, opt, i, n, ld, err;
@@ -20,6 +22,3 @@ index 5861d33..4141796 100644
int init_speed = 0;
int send_break = 0;
pid_t pid;
---
-1.9.1
-
diff --git a/recipes-connectivity/bluez5/bluez5/0004-Move-the-43xx-firmware-into-lib-firmware.patch b/recipes-connectivity/bluez5/bluez5/0004-Move-the-43xx-firmware-into-lib-firmware.patch
deleted file mode 100644
index dadce35..0000000
--- a/recipes-connectivity/bluez5/bluez5/0004-Move-the-43xx-firmware-into-lib-firmware.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From 76681284b0ea49852041fdb97a35175089a08781 Mon Sep 17 00:00:00 2001
-From: Phil Elwell <phil@raspberrypi.org>
-Date: Tue, 23 Feb 2016 17:52:29 +0000
-Subject: [PATCH 4/4] Move the 43xx firmware into /lib/firmware
-
----
- tools/hciattach_bcm43xx.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/tools/hciattach_bcm43xx.c b/tools/hciattach_bcm43xx.c
-index 0b792e0..207f668 100644
---- a/tools/hciattach_bcm43xx.c
-+++ b/tools/hciattach_bcm43xx.c
-@@ -43,7 +43,7 @@
- #include "hciattach.h"
-
- #ifndef FIRMWARE_DIR
--#define FIRMWARE_DIR "/etc/firmware"
-+#define FIRMWARE_DIR "/lib/firmware"
- #endif
-
- #define FW_EXT ".hcd"
---
-1.9.1
-
diff --git a/recipes-connectivity/bluez5/bluez5/0004-Move-the-hciattach-firmware-into-lib-firmware.patch b/recipes-connectivity/bluez5/bluez5/0004-Move-the-hciattach-firmware-into-lib-firmware.patch
new file mode 100644
index 0000000..9cf03ed
--- /dev/null
+++ b/recipes-connectivity/bluez5/bluez5/0004-Move-the-hciattach-firmware-into-lib-firmware.patch
@@ -0,0 +1,31 @@
+From 744f894e42d05b1dee917cc221ed3c1815990459 Mon Sep 17 00:00:00 2001
+From: Phil Elwell <phil@raspberrypi.org>
+Date: Tue, 23 Feb 2016 17:52:29 +0000
+Subject: [PATCH] Move the hciattach firmware into /lib/firmware
+
+* FIRMWARE_DIR is now used by all hciattach firmware (not just bcm43xx) since 5.66 with:
+ commit d9253248363b995e44c1f5e393ed1c7aa4ec81ce
+ Author: Marek Vasut <marex@denx.de>
+ Date: Tue Nov 1 12:53:33 2022 +0100
+ Subject: tools: Make hciattach_* firmware path build-time configurable
+
+Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
+---
+Upstream-Status: Pending
+
+ tools/hciattach.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/hciattach.h b/tools/hciattach.h
+index dfa4c1e7a..e88484766 100644
+--- a/tools/hciattach.h
++++ b/tools/hciattach.h
+@@ -41,7 +41,7 @@
+ #define HCI_UART_VND_DETECT 5
+
+ #ifndef FIRMWARE_DIR
+-#define FIRMWARE_DIR "/etc/firmware"
++#define FIRMWARE_DIR "/lib/firmware"
+ #endif
+
+ int read_hci_event(int fd, unsigned char *buf, int size);
diff --git a/recipes-connectivity/bluez5/bluez5_%.bbappend b/recipes-connectivity/bluez5/bluez5_%.bbappend
index 09d4103..c5d905b 100644
--- a/recipes-connectivity/bluez5/bluez5_%.bbappend
+++ b/recipes-connectivity/bluez5/bluez5_%.bbappend
@@ -4,7 +4,7 @@ SRC_URI:append:rpi = "\
file://0001-bcm43xx-Add-bcm43xx-3wire-variant.patch \
file://0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch \
file://0003-Increase-firmware-load-timeout-to-30s.patch \
- file://0004-Move-the-43xx-firmware-into-lib-firmware.patch \
+ file://0004-Move-the-hciattach-firmware-into-lib-firmware.patch \
"
RDEPENDS:${PN}:append:rpi = " pi-bluetooth"
diff --git a/recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch b/recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch
index 079377e..8766a77 100644
--- a/recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch
+++ b/recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch
@@ -3,7 +3,7 @@ From: "Peter A. Bigot" <pab@pabigot.com>
Date: Wed, 14 Nov 2018 09:19:51 -0600
Subject: [PATCH] bthelper: correct path for hciconfig under Yocto
-Upstream-Status: Inapproprate [OE-specific]
+Upstream-Status: Inappropriate [OE-specific]
Signed-off-by: Peter A. Bigot <pab@pabigot.com>
Signed-off-by: Andrei Gherzan <andrei@gherzan.ro>
diff --git a/recipes-core/psplash/files/framebuf.conf b/recipes-core/psplash/files/framebuf.conf
new file mode 100644
index 0000000..44e1ded
--- /dev/null
+++ b/recipes-core/psplash/files/framebuf.conf
@@ -0,0 +1,4 @@
+[Unit]
+Requires=sys-devices-platform-gpu-graphics-fb0.device
+After=sys-devices-platform-gpu-graphics-fb0.device
+
diff --git a/recipes-core/psplash/psplash_%.bbappend b/recipes-core/psplash/psplash_%.bbappend
index bf99b2b..57cade8 100644
--- a/recipes-core/psplash/psplash_%.bbappend
+++ b/recipes-core/psplash/psplash_%.bbappend
@@ -1,2 +1,12 @@
FILESEXTRAPATHS:prepend := "${THISDIR}/files:"
SPLASH_IMAGES:rpi = "file://psplash-raspberrypi-img.h;outsuffix=raspberrypi"
+
+SRC_URI:append:rpi = " file://framebuf.conf"
+
+do_install:append:rpi() {
+ if [ "${@bb.utils.filter('DISTRO_FEATURES', 'systemd', d)}" ]; then
+ install -Dm 0644 ${WORKDIR}/framebuf.conf ${D}${systemd_system_unitdir}/psplash-start.service.d/framebuf.conf
+ fi
+}
+
+FILES:${PN}:append:rpi = " ${systemd_system_unitdir}/psplash-start.service.d"
diff --git a/recipes-core/udev/udev-rules-rpi.bb b/recipes-core/udev/udev-rules-rpi.bb
index 42cfcdd..3ae4385 100644
--- a/recipes-core/udev/udev-rules-rpi.bb
+++ b/recipes-core/udev/udev-rules-rpi.bb
@@ -3,16 +3,17 @@ LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://${COMMON_LICENSE_DIR}/MIT;md5=0835ade698e0bcf8506ecda2f7b4f302"
SRC_URI = " \
- file://99-com.rules \
+ git://github.com/RPi-Distro/raspberrypi-sys-mods;protocol=https;branch=master \
file://can.rules \
"
+SRCREV = "5ce3ef2b7f377c23fea440ca9df0e30f3f8447cf"
-S = "${WORKDIR}"
+S = "${WORKDIR}/git"
INHIBIT_DEFAULT_DEPS = "1"
do_install () {
install -d ${D}${sysconfdir}/udev/rules.d
- install -m 0644 ${WORKDIR}/99-com.rules ${D}${sysconfdir}/udev/rules.d/
+ install -m 0644 ${S}/etc.armhf/udev/rules.d/99-com.rules ${D}${sysconfdir}/udev/rules.d/
install -m 0644 ${WORKDIR}/can.rules ${D}${sysconfdir}/udev/rules.d/
}
diff --git a/recipes-core/udev/udev-rules-rpi/99-com.rules b/recipes-core/udev/udev-rules-rpi/99-com.rules
deleted file mode 100644
index ddd1e17..0000000
--- a/recipes-core/udev/udev-rules-rpi/99-com.rules
+++ /dev/null
@@ -1,21 +0,0 @@
-KERNEL=="ttyAMA[01]", PROGRAM="/bin/sh -c '\
- ALIASES=/proc/device-tree/aliases; \
- if cmp -s $$ALIASES/uart0 $$ALIASES/serial0; then \
- echo 0;\
- elif cmp -s $$ALIASES/uart0 $$ALIASES/serial1; then \
- echo 1; \
- else \
- exit 1; \
- fi\
-'", SYMLINK+="serial%c"
-
-KERNEL=="ttyS0", PROGRAM="/bin/sh -c '\
- ALIASES=/proc/device-tree/aliases; \
- if cmp -s $$ALIASES/uart1 $$ALIASES/serial0; then \
- echo 0; \
- elif cmp -s $$ALIASES/uart1 $$ALIASES/serial1; then \
- echo 1; \
- else \
- exit 1; \
- fi \
-'", SYMLINK+="serial%c"
diff --git a/recipes-devtools/bcm2835/bcm2835_1.71.bb b/recipes-devtools/bcm2835/bcm2835_1.73.bb
index 5171205..cdf2332 100644
--- a/recipes-devtools/bcm2835/bcm2835_1.71.bb
+++ b/recipes-devtools/bcm2835/bcm2835_1.73.bb
@@ -12,8 +12,7 @@ COMPATIBLE_MACHINE = "^rpi$"
SRC_URI = "http://www.airspayce.com/mikem/bcm2835/bcm2835-${PV}.tar.gz"
-SRC_URI[md5sum] = "9bd2d39bf4b3a9e81dce799ca51c826a"
-SRC_URI[sha256sum] = "564920d205977d7e2846e434947708455d468d3a952feca9faef643abd03a227"
+SRC_URI[sha256sum] = "e67a986462618988a5a86752e36e3ebdd7c5cae66940ff7330aea243b2762525"
inherit autotools
diff --git a/recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.8.bb b/recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.10.bb
index 1a609f5..8ff3073 100644
--- a/recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.8.bb
+++ b/recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.10.bb
@@ -4,9 +4,8 @@ LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=6ec69d6e9e6c85adfb7799d7f8cf044e"
SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_Register.git;branch=main;protocol=https"
-
+SRCREV = "d1e8ac7ad9dcd65ab83749db3e5c96ffee80ebb7"
S = "${WORKDIR}/git"
-SRCREV = "49ab415d6b601c99979262f9e91c21dcb3a927a7"
DEPENDS += "python3-setuptools-scm-native"
diff --git a/recipes-devtools/python/python3-adafruit-platformdetect_3.22.1.bb b/recipes-devtools/python/python3-adafruit-platformdetect_3.27.0.bb
index e19b58e..45dc49d 100644
--- a/recipes-devtools/python/python3-adafruit-platformdetect_3.22.1.bb
+++ b/recipes-devtools/python/python3-adafruit-platformdetect_3.27.0.bb
@@ -4,8 +4,7 @@ LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=fccd531dce4b989c05173925f0bbb76c"
SRC_URI = "git://github.com/adafruit/Adafruit_Python_PlatformDetect.git;branch=main;protocol=https"
-SRCREV = "7af3af87037cf1e6697471a3a83c56a0f852b959"
-
+SRCREV = "e1460098eeca5ea573f92814691bb378e15530d9"
S = "${WORKDIR}/git"
inherit setuptools3
diff --git a/recipes-devtools/python/rpi-gpio/0001-setup.py-Use-setuptools-instead-of-distutils.patch b/recipes-devtools/python/rpi-gpio/0001-setup.py-Use-setuptools-instead-of-distutils.patch
deleted file mode 100644
index 33480b1..0000000
--- a/recipes-devtools/python/rpi-gpio/0001-setup.py-Use-setuptools-instead-of-distutils.patch
+++ /dev/null
@@ -1,28 +0,0 @@
-From df5657d772accb275a12c1b1690befa8d87305c8 Mon Sep 17 00:00:00 2001
-From: Khem Raj <raj.khem@gmail.com>
-Date: Sat, 5 Mar 2022 09:53:41 -0800
-Subject: [PATCH] setup.py: Use setuptools instead of distutils
-
-Upstream-Status: Pending
-
-Signed-off-by: Khem Raj <raj.khem@gmail.com>
----
- setup.py | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/setup.py b/setup.py
-index 2b600ca..0fa0807 100644
---- a/setup.py
-+++ b/setup.py
-@@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
- """
-
--from distutils.core import setup, Extension
-+from setuptools import setup, Extension
-
- classifiers = ['Development Status :: 5 - Production/Stable',
- 'Operating System :: POSIX :: Linux',
---
-2.35.1
-
diff --git a/recipes-devtools/python/rpi-gpio_0.7.0.bb b/recipes-devtools/python/rpi-gpio_0.7.0.bb
deleted file mode 100644
index 039d176..0000000
--- a/recipes-devtools/python/rpi-gpio_0.7.0.bb
+++ /dev/null
@@ -1,20 +0,0 @@
-DESCRIPTION = "A module to control Raspberry Pi GPIO channels"
-HOMEPAGE = "https://sourceforge.net/projects/raspberry-gpio-python/"
-SECTION = "devel/python"
-LICENSE = "MIT"
-LIC_FILES_CHKSUM = "file://LICENCE.txt;md5=9b95630a648966b142f1a0dcea001cb7"
-
-PYPI_PACKAGE = "RPi.GPIO"
-
-inherit pypi setuptools3
-
-SRC_URI += "file://0001-Remove-nested-functions.patch \
- file://0001-setup.py-Use-setuptools-instead-of-distutils.patch \
- "
-SRC_URI[sha256sum] = "7424bc6c205466764f30f666c18187a0824077daf20b295c42f08aea2cb87d3f"
-
-COMPATIBLE_MACHINE = "^rpi$"
-
-# ignore issues with -fno-common from gcc-10 until it's fixed in upstream:
-# https://sourceforge.net/p/raspberry-gpio-python/tickets/187/
-CFLAGS += "-fcommon"
diff --git a/recipes-devtools/python/rpi-gpio_0.7.1.bb b/recipes-devtools/python/rpi-gpio_0.7.1.bb
new file mode 100644
index 0000000..e7a9950
--- /dev/null
+++ b/recipes-devtools/python/rpi-gpio_0.7.1.bb
@@ -0,0 +1,15 @@
+DESCRIPTION = "A module to control Raspberry Pi GPIO channels"
+HOMEPAGE = "https://sourceforge.net/projects/raspberry-gpio-python/"
+SECTION = "devel/python"
+LICENSE = "MIT"
+LIC_FILES_CHKSUM = "file://LICENCE.txt;md5=a2294b0b1daabc30dfb5b3de73b2e00a"
+
+PYPI_PACKAGE = "RPi.GPIO"
+
+inherit pypi setuptools3
+
+SRC_URI += "file://0001-Remove-nested-functions.patch \
+ "
+SRC_URI[sha256sum] = "cd61c4b03c37b62bba4a5acfea9862749c33c618e0295e7e90aa4713fb373b70"
+
+COMPATIBLE_MACHINE = "^rpi$"
diff --git a/recipes-graphics/mesa/mesa-demos_%.bbappend b/recipes-graphics/mesa/mesa-demos_%.bbappend
index abb11ec..efcaf06 100644
--- a/recipes-graphics/mesa/mesa-demos_%.bbappend
+++ b/recipes-graphics/mesa/mesa-demos_%.bbappend
@@ -1,2 +1,3 @@
-# mesa-demos need libgles1 and userland driver does not have it
-COMPATIBLE_HOST:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '(.*)', 'null', d)}"
+# mesa-demos userland driver doesn't provide libgles1 and the EGL headers it provides break the mesa-demos build.
+# And enabling the `wayland` option without enabling `egl` is useless.
+PACKAGECONFIG:remove:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'egl gles1 wayland', d)}"
diff --git a/recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch b/recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch
index de9d5c3..076ba7e 100644
--- a/recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch
@@ -5,7 +5,7 @@ Subject: [PATCH] gitignore: add archives from lib directory
The build creates two *.a files in the lib directory, add these to .gitignore.
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
.gitignore | 1 +
diff --git a/recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch b/recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch
index c02a767..cce94a7 100644
--- a/recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch
@@ -3,7 +3,7 @@ From: Trevor Woerner <twoerner@gmail.com>
Date: Fri, 4 Dec 2020 01:54:37 -0500
Subject: [PATCH] add "install" targets to Makefiles
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
Makefile | 3 +++
diff --git a/recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch b/recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch
index 7adb12b..44ed9c3 100644
--- a/recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch
@@ -10,7 +10,7 @@ I get a build error saying:
Therefore switch to the more common and more generic "pkg-config" instead of
using a libpng-specific tool for flags and libraries.
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
game/Makefile | 4 ++--
diff --git a/recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch b/recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch
index 908be62..aa83110 100644
--- a/recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch
@@ -9,7 +9,7 @@ I end up with link errors of the type:
Which is caused by not having -lvchostif in the link.
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
game/Makefile | 2 +-
diff --git a/recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch b/recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch
index ceefd03..914ffb3 100644
--- a/recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch
@@ -10,7 +10,7 @@ linking so that it succeeds. Otherwise I get errors like the following:
...as well as undefined references to various other libpng objects.
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
game/Makefile | 2 +-
diff --git a/recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch b/recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch
index dae847d..6d2de6c 100644
--- a/recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch
@@ -3,7 +3,7 @@ From: Trevor Woerner <twoerner@gmail.com>
Date: Fri, 4 Dec 2020 03:47:17 -0500
Subject: [PATCH] game/Makefile: install sample png files
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
game/Makefile | 2 ++
diff --git a/recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch b/recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch
index b5c743e..e466a05 100644
--- a/recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch
@@ -16,7 +16,7 @@ To build simply invoke 'make' with or without a -j option.
To install simply invoke: make TARGET=install
To clean simply invoke: make TARGET=clean
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
Makefile | 19 +++++++------------
diff --git a/recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch b/recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch
index 295309c..63f6a81 100644
--- a/recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch
+++ b/recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch
@@ -7,6 +7,8 @@ This patch adds provisions in userland to
let apps callers set the next rendereing dispmanx resource.
It's useful for implementing, say, a buffer carousel.
---
+Upstream-Status: Pending
+
interface/khronos/common/khrn_client_rpc.h | 2 ++
interface/khronos/common/khrn_int_ids.h | 2 ++
interface/khronos/egl/egl_client.c | 30 +++++++++++++++++++---
diff --git a/recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch b/recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch
index 7945bff..1a9a51c 100644
--- a/recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch
+++ b/recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch
@@ -19,6 +19,8 @@ vc_vchi_dispmanx.h
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
.gitignore | 1 +
CMakeLists.txt | 11 +
README.md | 4 +
diff --git a/recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch b/recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch
index e10f9ab..a9da68a 100644
--- a/recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch
+++ b/recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch
@@ -4,6 +4,8 @@ Date: Tue, 1 Oct 2013 13:19:20 +0200
Subject: [PATCH] wayland: Add Wayland example
---
+Upstream-Status: Pending
+
.../linux/apps/hello_pi/CMakeLists.txt | 1 +
.../linux/apps/hello_pi/Makefile | 2 +
.../hello_pi/hello_wayland/CMakeLists.txt | 8 +
diff --git a/recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch b/recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch
index 19608be..5476f41 100644
--- a/recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch
+++ b/recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch
@@ -9,6 +9,8 @@ lets add the dependency on bcm_host module which should do it
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/wayland-egl/wayland-egl.pc.in | 1 +
1 file changed, 1 insertion(+)
diff --git a/recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch b/recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch
index 2772323..8119a8c 100644
--- a/recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch
+++ b/recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch
@@ -9,6 +9,8 @@ This was removed after a discussion on IRC with the weston guys
Signed-off-by: "Yann E. MORIN" <yann.morin.1998@free.fr>
---
+Upstream-Status: Pending
+
interface/vmcs_host/vc_vchi_dispmanx.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch b/recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch
index 5a1d8cf..8c37419 100644
--- a/recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch
+++ b/recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch
@@ -7,6 +7,8 @@ origins from buildroot
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/egl/egl_client_surface.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch b/recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch
index bae39e1..1e90126 100644
--- a/recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch
+++ b/recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch
@@ -7,6 +7,8 @@ origins from metrological wayland support
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/egl/egl_client_surface.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch b/recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch
index 1c15009..9e496c7 100644
--- a/recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch
+++ b/recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch
@@ -7,6 +7,8 @@ Origins from buildroot
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/ext/gl_oes_egl_image_client.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch b/recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch
index 7d28453..9d8355a 100644
--- a/recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch
+++ b/recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch
@@ -17,6 +17,8 @@ make[2]: ***
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/vcos/pthreads/CMakeLists.txt | 8 ++++++++
interface/vmcs_host/CMakeLists.txt | 8 --------
interface/vmcs_host/vc_vchi_dispmanx.h | 2 +-
diff --git a/recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch b/recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch
index b6a4c58..989f417 100644
--- a/recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch
+++ b/recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch
@@ -7,6 +7,8 @@ frame rate appears irregular and lower than expected when using nested compositi
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/egl/egl_client.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch b/recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch
index 0d8ccd1..691f476 100644
--- a/recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch
+++ b/recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch
@@ -7,6 +7,8 @@ Fixes #149
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/vmcs_host/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch b/recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch
index e652cc2..87d7161 100644
--- a/recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch
+++ b/recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch
@@ -7,6 +7,8 @@ Courtesy: Zan Dobersek
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/ext/egl_wayland.c | 42 +++++++++++++++++++++++++++++
interface/wayland/dispmanx.xml | 10 +++++++
2 files changed, 52 insertions(+)
diff --git a/recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch b/recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch
index b60928a..16cbbd7 100644
--- a/recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch
+++ b/recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch
@@ -12,6 +12,8 @@ to two vertical intervals
Signed-off-by: Jeff Wannamaker <jeff_wannamaker@cable.comcast.com>
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/egl/egl_client.c | 3 ++-
interface/khronos/egl/egl_client_surface.c | 8 ++++++++
interface/khronos/egl/egl_client_surface.h | 11 +++++++++++
diff --git a/recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch b/recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch
index fa7984c..37ca456 100644
--- a/recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch
+++ b/recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch
@@ -13,6 +13,8 @@ via embedded composition e.g. westeros
Signed-off-by: Jeff Wannamaker <jeff_wannamaker@cable.comcast.com>
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/common/khrn_client.c | 2 +-
interface/khronos/common/khrn_client.h | 11 +++++-
interface/khronos/ext/egl_wayland.c | 50 ++++++++++++++++++++++----
diff --git a/recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch b/recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch
index 8843489..94566dc 100644
--- a/recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch
+++ b/recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch
@@ -8,6 +8,8 @@ taken from Khronos headers
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/include/EGL/eglext.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch b/recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch
index 841341e..4f91c71 100644
--- a/recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch
+++ b/recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch
@@ -8,6 +8,8 @@ for multiple versions of glibc even ones which does not have this define
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
opensrc/helpers/libfdt/libfdt_env.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch b/recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch
index 65fc5eb..e23f4d9 100644
--- a/recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch
+++ b/recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch
@@ -4,6 +4,8 @@ Date: Wed, 13 Jun 2018 18:22:22 +0000
Subject: [PATCH] openmaxil: add pkg-config file
---
+Upstream-Status: Pending
+
CMakeLists.txt | 2 +-
pkgconfig/openmaxil.pc.in | 10 ++++++++++
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch b/recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch
index 595eefb..7e3de5f 100644
--- a/recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch
+++ b/recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch
@@ -7,7 +7,7 @@ The ALL_APPS symbol will optionally build an additional set of projects,
however, several of them don't exist anymore. Remove them from the list of
ALL_APPS.
-Upstream-status: submitted [https://github.com/raspberrypi/userland/pull/661]
+Upstream-Status: Submitted [https://github.com/raspberrypi/userland/pull/661]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
host_applications/linux/CMakeLists.txt | 4 ----
diff --git a/recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch b/recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch
index 642ee86..e3b093a 100644
--- a/recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch
+++ b/recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch
@@ -5,7 +5,7 @@ Subject: [PATCH] hello_pi: optionally build wayland-specific app
Only build the wayland-specific hello_pi app when building for wayland.
-Upstream-status: inappropriate [the wayland example is not part of upstream]
+Upstream-Status: Inappropriate [the wayland example is not part of upstream]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
host_applications/linux/apps/hello_pi/CMakeLists.txt | 4 +++-
diff --git a/recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch b/recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch
index ec74cc2..aff95b7 100644
--- a/recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch
+++ b/recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch
@@ -8,6 +8,8 @@ therefore import needed defines and typedefs from latest mesa
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/include/EGL/eglext.h | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/recipes-graphics/userland/files/0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch b/recipes-graphics/userland/files/0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch
new file mode 100644
index 0000000..6f4c722
--- /dev/null
+++ b/recipes-graphics/userland/files/0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch
@@ -0,0 +1,725 @@
+From 8f7fba136391e2020cd0fc9dca76932d3faa21eb Mon Sep 17 00:00:00 2001
+From: Martin Jansa <martin.jansa@gmail.com>
+Date: Fri, 8 Mar 2024 16:29:22 +0100
+Subject: [PATCH] CMakeLists.txt, *.pc: respect CMAKE_INSTALL_LIBDIR
+
+* and CMAKE_INSTALL_BINDIR, CMAKE_INSTALL_INCLUDEDIR as well
+* fixes installation paths with multilib
+ lib32-userland fails with:
+
+ERROR: QA Issue: lib32-userland: Files/directories were installed but not shipped in any package:
+ /usr/lib/libbrcmEGL.so
+ /usr/lib/libvchiq_arm.so
+...
+ /usr/lib/pkgconfig/wayland-egl.pc
+Please set FILES such that these items are packaged. Alternatively if they are unneeded, avoid installing them or delete them within do_install.
+lib32-userland: 66 installed and not shipped files. [installed-vs-shipped]
+
+Signed-off-by: Martin Jansa <martin.jansa@gmail.com>
+---
+Upstream-Status: Pending
+
+ CMakeLists.txt | 2 +-
+ containers/CMakeLists.txt | 2 +-
+ containers/test/CMakeLists.txt | 24 +++++++++----------
+ helpers/dtoverlay/CMakeLists.txt | 2 +-
+ .../linux/apps/dtmerge/CMakeLists.txt | 2 +-
+ .../linux/apps/dtoverlay/CMakeLists.txt | 6 ++---
+ .../linux/apps/gencmd/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_audio/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_dispmanx/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_encode/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_font/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_jpeg/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_teapot/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_tiger/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_triangle/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_triangle2/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_video/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_videocube/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_wayland/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_world/CMakeLists.txt | 2 +-
+ .../linux/apps/raspicam/CMakeLists.txt | 2 +-
+ .../linux/apps/smem/CMakeLists.txt | 2 +-
+ .../linux/apps/tvservice/CMakeLists.txt | 2 +-
+ .../linux/apps/vcmailbox/CMakeLists.txt | 2 +-
+ .../linux/libs/bcm_host/CMakeLists.txt | 2 +-
+ .../linux/libs/debug_sym/CMakeLists.txt | 6 ++---
+ .../linux/libs/sm/CMakeLists.txt | 4 ++--
+ interface/khronos/CMakeLists.txt | 10 ++++----
+ interface/mmal/CMakeLists.txt | 4 ++--
+ interface/mmal/components/CMakeLists.txt | 2 +-
+ interface/mmal/core/CMakeLists.txt | 4 ++--
+ interface/mmal/util/CMakeLists.txt | 4 ++--
+ interface/mmal/vc/CMakeLists.txt | 6 ++---
+ interface/vchiq_arm/CMakeLists.txt | 4 ++--
+ interface/vcos/CMakeLists.txt | 2 +-
+ interface/vcos/generic/CMakeLists.txt | 2 +-
+ interface/vcos/pthreads/CMakeLists.txt | 4 ++--
+ interface/vmcs_host/CMakeLists.txt | 2 +-
+ makefiles/cmake/vmcs.cmake | 2 +-
+ middleware/openmaxil/CMakeLists.txt | 2 +-
+ pkgconfig/bcm_host.pc.in | 2 +-
+ pkgconfig/brcmegl.pc.in | 2 +-
+ pkgconfig/brcmglesv2.pc.in | 2 +-
+ pkgconfig/brcmvg.pc.in | 2 +-
+ pkgconfig/mmal.pc.in | 2 +-
+ pkgconfig/vcsm.pc.in | 2 +-
+ 46 files changed, 73 insertions(+), 73 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 3e3c90e..0bb54b7 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -136,7 +136,7 @@ if(PKG_CONFIG_FOUND)
+ foreach(PCFILE bcm_host.pc brcmegl.pc brcmglesv2.pc brcmvg.pc vcsm.pc mmal.pc openmaxil.pc)
+ configure_file("pkgconfig/${PCFILE}.in" "${PCFILE}" @ONLY)
+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PCFILE}"
+- DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig")
++ DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+ endforeach()
+ endif()
+ # Remove cache entry, if one added by command line
+diff --git a/containers/CMakeLists.txt b/containers/CMakeLists.txt
+index 5570038..6c3d39c 100644
+--- a/containers/CMakeLists.txt
++++ b/containers/CMakeLists.txt
+@@ -66,7 +66,7 @@ set(packetizers_SRCS ${packetizers_SRCS} ${SOURCE_DIR}/h264/avc1_packetizer.c)
+
+ add_library(containers ${LIBRARY_TYPE} ${core_SRCS} ${io_SRCS} ${net_SRCS} ${packetizers_SRCS})
+ target_link_libraries(containers vcos)
+-install(TARGETS containers DESTINATION lib)
++install(TARGETS containers DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+ set(container_readers)
+ set(container_writers)
+diff --git a/containers/test/CMakeLists.txt b/containers/test/CMakeLists.txt
+index 7d36352..832ad0f 100644
+--- a/containers/test/CMakeLists.txt
++++ b/containers/test/CMakeLists.txt
+@@ -1,17 +1,17 @@
+ # Generate test application
+ add_executable(containers_test test.c)
+ target_link_libraries(containers_test -Wl,--no-whole-archive containers)
+-install(TARGETS containers_test DESTINATION bin)
++install(TARGETS containers_test DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate test application
+ add_executable(containers_check_frame_int check_frame_int.c)
+ target_link_libraries(containers_check_frame_int -Wl,--no-whole-archive containers)
+-install(TARGETS containers_check_frame_int DESTINATION bin)
++install(TARGETS containers_check_frame_int DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate autotest application
+ #add_executable(containers_autotest autotest.cpp crc_32.c)
+ #target_link_libraries(containers_autotest -Wl,--no-whole-archive containers})
+-#install(TARGETS containers_autotest DESTINATION bin)
++#install(TARGETS containers_autotest DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Helper code to provide non-blocking console input
+ if (WIN32)
+@@ -28,39 +28,39 @@ add_dependencies(containers_test containers_test_extra)
+ # Generate net test applications
+ add_executable(containers_stream_client stream_client.c ${NB_IO_SOURCE})
+ target_link_libraries(containers_stream_client containers)
+-install(TARGETS containers_stream_client DESTINATION bin)
++install(TARGETS containers_stream_client DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ add_executable(containers_stream_server stream_server.c)
+ target_link_libraries(containers_stream_server containers)
+-install(TARGETS containers_stream_server DESTINATION bin)
++install(TARGETS containers_stream_server DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ add_executable(containers_datagram_sender datagram_sender.c)
+ target_link_libraries(containers_datagram_sender containers)
+-install(TARGETS containers_datagram_sender DESTINATION bin)
++install(TARGETS containers_datagram_sender DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ add_executable(containers_datagram_receiver datagram_receiver.c)
+ target_link_libraries(containers_datagram_receiver containers)
+-install(TARGETS containers_datagram_receiver DESTINATION bin)
++install(TARGETS containers_datagram_receiver DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ add_executable(containers_rtp_decoder rtp_decoder.c ${NB_IO_SOURCE})
+ target_link_libraries(containers_rtp_decoder containers)
+-install(TARGETS containers_rtp_decoder DESTINATION bin)
++install(TARGETS containers_rtp_decoder DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate URI test application
+ add_executable(containers_test_uri test_uri.c)
+ target_link_libraries(containers_test_uri containers)
+-install(TARGETS containers_test_uri DESTINATION bin)
++install(TARGETS containers_test_uri DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate URI pipe application
+ add_executable(containers_uri_pipe uri_pipe.c ${NB_IO_SOURCE})
+ target_link_libraries(containers_uri_pipe containers)
+-install(TARGETS containers_uri_pipe DESTINATION bin)
++install(TARGETS containers_uri_pipe DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate bit stream test application
+ add_executable(containers_test_bits test_bits.c)
+ target_link_libraries(containers_test_bits containers)
+-install(TARGETS containers_test_bits DESTINATION bin)
++install(TARGETS containers_test_bits DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate packet file dump application
+ add_executable(containers_dump_pktfile dump_pktfile.c)
+-install(TARGETS containers_dump_pktfile DESTINATION bin)
++install(TARGETS containers_dump_pktfile DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/helpers/dtoverlay/CMakeLists.txt b/helpers/dtoverlay/CMakeLists.txt
+index b3bd30f..7e83780 100644
+--- a/helpers/dtoverlay/CMakeLists.txt
++++ b/helpers/dtoverlay/CMakeLists.txt
+@@ -22,4 +22,4 @@ add_library (dtovl ${SHARED}
+
+ target_link_libraries(dtovl fdt)
+
+-install (TARGETS dtovl DESTINATION lib)
++install (TARGETS dtovl DESTINATION ${CMAKE_INSTALL_LIBDIR})
+diff --git a/host_applications/linux/apps/dtmerge/CMakeLists.txt b/host_applications/linux/apps/dtmerge/CMakeLists.txt
+index d3f7e36..daa91e5 100755
+--- a/host_applications/linux/apps/dtmerge/CMakeLists.txt
++++ b/host_applications/linux/apps/dtmerge/CMakeLists.txt
+@@ -17,5 +17,5 @@ include_directories (
+ add_executable(dtmerge dtmerge.c)
+ target_link_libraries(dtmerge dtovl)
+
+-install(TARGETS dtmerge RUNTIME DESTINATION bin)
++install(TARGETS dtmerge RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES dtmerge.1 DESTINATION man/man1)
+diff --git a/host_applications/linux/apps/dtoverlay/CMakeLists.txt b/host_applications/linux/apps/dtoverlay/CMakeLists.txt
+index 97bcadc..238296d 100755
+--- a/host_applications/linux/apps/dtoverlay/CMakeLists.txt
++++ b/host_applications/linux/apps/dtoverlay/CMakeLists.txt
+@@ -16,12 +16,12 @@ include_directories (
+
+ add_executable(dtoverlay dtoverlay_main.c utils.c)
+ target_link_libraries(dtoverlay dtovl)
+-install(TARGETS dtoverlay RUNTIME DESTINATION bin)
++install(TARGETS dtoverlay RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES dtoverlay.1 DESTINATION man/man1)
+
+ add_custom_command(TARGET dtoverlay POST_BUILD COMMAND ln;-sf;dtoverlay;dtparam)
+-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/dtparam DESTINATION bin)
++install(FILES ${CMAKE_CURRENT_BINARY_DIR}/dtparam DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES dtparam.1 DESTINATION man/man1)
+
+ set(DTOVERLAY_SCRIPTS dtoverlay-pre dtoverlay-post)
+-install(PROGRAMS ${DTOVERLAY_SCRIPTS} DESTINATION bin)
++install(PROGRAMS ${DTOVERLAY_SCRIPTS} DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/gencmd/CMakeLists.txt b/host_applications/linux/apps/gencmd/CMakeLists.txt
+index 0c2c32a..fdd2f00 100644
+--- a/host_applications/linux/apps/gencmd/CMakeLists.txt
++++ b/host_applications/linux/apps/gencmd/CMakeLists.txt
+@@ -16,5 +16,5 @@ include_directories( ../../../..
+
+ add_executable(vcgencmd gencmd.c)
+ target_link_libraries(vcgencmd vcos vchiq_arm vchostif)
+-install(TARGETS vcgencmd RUNTIME DESTINATION bin)
++install(TARGETS vcgencmd RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES vcgencmd.1 DESTINATION man/man1)
+diff --git a/host_applications/linux/apps/hello_pi/hello_audio/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_audio/CMakeLists.txt
+index 03207c5..8f4d06c 100644
+--- a/host_applications/linux/apps/hello_pi/hello_audio/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_audio/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_dispmanx/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_dispmanx/CMakeLists.txt
+index 0471a1d..fd8b85e 100644
+--- a/host_applications/linux/apps/hello_pi/hello_dispmanx/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_dispmanx/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_encode/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_encode/CMakeLists.txt
+index 147623b..98a197a 100644
+--- a/host_applications/linux/apps/hello_pi/hello_encode/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_encode/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_font/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_font/CMakeLists.txt
+index 448d2cf..1d89f4c 100644
+--- a/host_applications/linux/apps/hello_pi/hello_font/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_font/CMakeLists.txt
+@@ -6,4 +6,4 @@ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+ target_link_libraries(${EXEC} vgfont freetype z)
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_jpeg/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_jpeg/CMakeLists.txt
+index a56dda5..f611f8e 100644
+--- a/host_applications/linux/apps/hello_pi/hello_jpeg/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_jpeg/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_teapot/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_teapot/CMakeLists.txt
+index cdb8413..a60da3e 100644
+--- a/host_applications/linux/apps/hello_pi/hello_teapot/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_teapot/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_tiger/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_tiger/CMakeLists.txt
+index b253f3f..1104a8b 100644
+--- a/host_applications/linux/apps/hello_pi/hello_tiger/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_tiger/CMakeLists.txt
+@@ -6,4 +6,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_triangle/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_triangle/CMakeLists.txt
+index 4e8128e..4b738bb 100644
+--- a/host_applications/linux/apps/hello_pi/hello_triangle/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_triangle/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_triangle2/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_triangle2/CMakeLists.txt
+index 390980a..c8c534f 100644
+--- a/host_applications/linux/apps/hello_pi/hello_triangle2/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_triangle2/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_video/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_video/CMakeLists.txt
+index 42187af..6b15ca2 100644
+--- a/host_applications/linux/apps/hello_pi/hello_video/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_video/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_videocube/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_videocube/CMakeLists.txt
+index d7fb059..9612ffe 100644
+--- a/host_applications/linux/apps/hello_pi/hello_videocube/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_videocube/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_wayland/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_wayland/CMakeLists.txt
+index 9a2f75c..9a468a6 100644
+--- a/host_applications/linux/apps/hello_pi/hello_wayland/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_wayland/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS} -lwayland-client -lwayland-egl)
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_world/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_world/CMakeLists.txt
+index b0120fe..97d90f6 100644
+--- a/host_applications/linux/apps/hello_pi/hello_world/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_world/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/raspicam/CMakeLists.txt b/host_applications/linux/apps/raspicam/CMakeLists.txt
+index f73a4d0..4a9cd88 100644
+--- a/host_applications/linux/apps/raspicam/CMakeLists.txt
++++ b/host_applications/linux/apps/raspicam/CMakeLists.txt
+@@ -66,6 +66,6 @@ target_link_libraries(raspiyuv ${MMAL_LIBS} vcos bcm_host m)
+ target_link_libraries(raspivid ${MMAL_LIBS} vcos bcm_host m)
+ target_link_libraries(raspividyuv ${MMAL_LIBS} vcos bcm_host m)
+
+-install(TARGETS raspistill raspiyuv raspivid raspividyuv RUNTIME DESTINATION bin)
++install(TARGETS raspistill raspiyuv raspivid raspividyuv RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES raspistill.1 raspiyuv.1 raspivid.1 raspividyuv.1 DESTINATION man/man1)
+ install(FILES raspicam.7 DESTINATION man/man7)
+diff --git a/host_applications/linux/apps/smem/CMakeLists.txt b/host_applications/linux/apps/smem/CMakeLists.txt
+index 0fa8328..60c9c61 100644
+--- a/host_applications/linux/apps/smem/CMakeLists.txt
++++ b/host_applications/linux/apps/smem/CMakeLists.txt
+@@ -16,5 +16,5 @@ include_directories (
+ add_executable(vcsmem smem.c)
+ target_link_libraries(vcsmem vcos vcsm vchostif)
+
+-install(TARGETS vcsmem RUNTIME DESTINATION bin)
++install(TARGETS vcsmem RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+diff --git a/host_applications/linux/apps/tvservice/CMakeLists.txt b/host_applications/linux/apps/tvservice/CMakeLists.txt
+index 0190774..fad5a6b 100644
+--- a/host_applications/linux/apps/tvservice/CMakeLists.txt
++++ b/host_applications/linux/apps/tvservice/CMakeLists.txt
+@@ -3,5 +3,5 @@ add_executable(tvservice tvservice.c)
+ target_link_libraries(tvservice vchostif bcm_host)
+
+ install(TARGETS tvservice
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES tvservice.1 DESTINATION man/man1)
+diff --git a/host_applications/linux/apps/vcmailbox/CMakeLists.txt b/host_applications/linux/apps/vcmailbox/CMakeLists.txt
+index d153363..2731724 100644
+--- a/host_applications/linux/apps/vcmailbox/CMakeLists.txt
++++ b/host_applications/linux/apps/vcmailbox/CMakeLists.txt
+@@ -2,6 +2,6 @@ add_executable(vcmailbox vcmailbox.c)
+ target_link_libraries(vcmailbox vchostif)
+
+ install(TARGETS vcmailbox
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES vcmailbox.1 DESTINATION man/man1)
+ install(FILES vcmailbox.7 raspiotp.7 raspirev.7 DESTINATION man/man7)
+diff --git a/host_applications/linux/libs/bcm_host/CMakeLists.txt b/host_applications/linux/libs/bcm_host/CMakeLists.txt
+index 7a4ab06..3614943 100644
+--- a/host_applications/linux/libs/bcm_host/CMakeLists.txt
++++ b/host_applications/linux/libs/bcm_host/CMakeLists.txt
+@@ -19,5 +19,5 @@ add_library(bcm_host ${SHARED} bcm_host.c)
+
+ target_link_libraries(bcm_host vcos vchostif)
+
+-install(TARGETS bcm_host DESTINATION lib)
++install(TARGETS bcm_host DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+diff --git a/host_applications/linux/libs/debug_sym/CMakeLists.txt b/host_applications/linux/libs/debug_sym/CMakeLists.txt
+index d437b99..37eb759 100644
+--- a/host_applications/linux/libs/debug_sym/CMakeLists.txt
++++ b/host_applications/linux/libs/debug_sym/CMakeLists.txt
+@@ -11,6 +11,6 @@ include_directories (
+ add_library(debug_sym ${SHARED} debug_sym.c)
+ add_library(debug_sym_static STATIC debug_sym.c)
+
+-install(TARGETS debug_sym DESTINATION lib)
+-install(TARGETS debug_sym_static DESTINATION lib)
+-install(FILES debug_sym.h DESTINATION include/interface/debug_sym)
++install(TARGETS debug_sym DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(TARGETS debug_sym_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(FILES debug_sym.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/debug_sym)
+diff --git a/host_applications/linux/libs/sm/CMakeLists.txt b/host_applications/linux/libs/sm/CMakeLists.txt
+index 5ce5aca..84d8123 100644
+--- a/host_applications/linux/libs/sm/CMakeLists.txt
++++ b/host_applications/linux/libs/sm/CMakeLists.txt
+@@ -14,5 +14,5 @@ add_library(vcsm ${SHARED} user-vcsm.c)
+
+ target_link_libraries(vcsm vcos)
+
+-install(TARGETS vcsm DESTINATION lib)
+-install(FILES user-vcsm.h DESTINATION include/interface/vcsm)
++install(TARGETS vcsm DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(FILES user-vcsm.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/vcsm)
+diff --git a/interface/khronos/CMakeLists.txt b/interface/khronos/CMakeLists.txt
+index 95c0e11..00316a5 100644
+--- a/interface/khronos/CMakeLists.txt
++++ b/interface/khronos/CMakeLists.txt
+@@ -94,11 +94,11 @@ if (BUILD_WAYLAND)
+ )
+
+ add_library(wayland-egl ${SHARED} ${WAYLAND_EGL_SOURCE})
+- install(TARGETS wayland-egl DESTINATION lib)
++ install(TARGETS wayland-egl DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+ configure_file ("wayland-egl/wayland-egl.pc.in" "wayland-egl/wayland-egl.pc" @ONLY)
+ install (FILES "${CMAKE_CURRENT_BINARY_DIR}/wayland-egl/wayland-egl.pc"
+- DESTINATION lib/pkgconfig)
++ DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+ endif ()
+
+ add_library(EGL ${SHARED} ${EGL_SOURCE})
+@@ -126,8 +126,8 @@ target_link_libraries(GLESv2 EGL khrn_client vcos)
+ target_link_libraries(WFC EGL)
+ target_link_libraries(OpenVG EGL)
+
+-install(TARGETS EGL GLESv2 OpenVG WFC khrn_client DESTINATION lib)
+-install(TARGETS EGL_static GLESv2_static khrn_static DESTINATION lib)
++install(TARGETS EGL GLESv2 OpenVG WFC khrn_client DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(TARGETS EGL_static GLESv2_static khrn_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+ # recommended names to use to avoid conflicts with mesa libs
+ add_library(brcmEGL ${SHARED} ${EGL_SOURCE})
+@@ -140,4 +140,4 @@ target_link_libraries(brcmGLESv2 brcmEGL khrn_client vcos)
+ target_link_libraries(brcmWFC brcmEGL)
+ target_link_libraries(brcmOpenVG brcmEGL)
+
+-install(TARGETS brcmEGL brcmGLESv2 brcmOpenVG brcmWFC DESTINATION lib)
++install(TARGETS brcmEGL brcmGLESv2 brcmOpenVG brcmWFC DESTINATION ${CMAKE_INSTALL_LIBDIR})
+diff --git a/interface/mmal/CMakeLists.txt b/interface/mmal/CMakeLists.txt
+index c5c1642..fe784e8 100644
+--- a/interface/mmal/CMakeLists.txt
++++ b/interface/mmal/CMakeLists.txt
+@@ -16,7 +16,7 @@ add_subdirectory(client)
+
+ target_link_libraries(mmal mmal_core mmal_util mmal_vc_client vcos mmal_components)
+
+-install(TARGETS mmal DESTINATION lib)
++install(TARGETS mmal DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ install(FILES
+ mmal.h
+ mmal_buffer.h
+@@ -36,7 +36,7 @@ install(FILES
+ mmal_pool.h mmal_port.h
+ mmal_queue.h
+ mmal_types.h
+- DESTINATION include/interface/mmal
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/mmal
+ )
+
+ # Test apps
+diff --git a/interface/mmal/components/CMakeLists.txt b/interface/mmal/components/CMakeLists.txt
+index d65fa37..4c85de0 100644
+--- a/interface/mmal/components/CMakeLists.txt
++++ b/interface/mmal/components/CMakeLists.txt
+@@ -30,5 +30,5 @@ set(container_libs ${container_libs} containers)
+ target_link_libraries(mmal_components ${container_libs} mmal_util)
+ target_link_libraries(mmal_components mmal_core)
+
+-install(TARGETS mmal_components DESTINATION lib)
++install(TARGETS mmal_components DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+diff --git a/interface/mmal/core/CMakeLists.txt b/interface/mmal/core/CMakeLists.txt
+index efa14d9..4fe0779 100644
+--- a/interface/mmal/core/CMakeLists.txt
++++ b/interface/mmal/core/CMakeLists.txt
+@@ -13,7 +13,7 @@ add_library (mmal_core ${LIBRARY_TYPE}
+
+ target_link_libraries (mmal_core vcos mmal_vc_client)
+
+-install(TARGETS mmal_core DESTINATION lib)
++install(TARGETS mmal_core DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ install(FILES
+ mmal_buffer_private.h
+ mmal_clock_private.h
+@@ -21,5 +21,5 @@ install(FILES
+ mmal_core_private.h
+ mmal_port_private.h
+ mmal_events_private.h
+- DESTINATION include/interface/mmal/core
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/mmal/core
+ )
+diff --git a/interface/mmal/util/CMakeLists.txt b/interface/mmal/util/CMakeLists.txt
+index b2a6858..e51afd0 100644
+--- a/interface/mmal/util/CMakeLists.txt
++++ b/interface/mmal/util/CMakeLists.txt
+@@ -12,7 +12,7 @@ add_library (mmal_util ${LIBRARY_TYPE}
+
+ target_link_libraries (mmal_util vcos)
+
+-install(TARGETS mmal_util DESTINATION lib)
++install(TARGETS mmal_util DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ install(FILES
+ mmal_component_wrapper.h
+ mmal_connection.h
+@@ -24,5 +24,5 @@ install(FILES
+ mmal_util.h
+ mmal_util_params.h
+ mmal_util_rational.h
+- DESTINATION include/interface/mmal/util
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/mmal/util
+ )
+diff --git a/interface/mmal/vc/CMakeLists.txt b/interface/mmal/vc/CMakeLists.txt
+index d6e80db..3b9ec64 100644
+--- a/interface/mmal/vc/CMakeLists.txt
++++ b/interface/mmal/vc/CMakeLists.txt
+@@ -8,12 +8,12 @@ target_link_libraries(mmal_vc_client vchiq_arm vcos vcsm)
+ if(BUILD_MMAL_APPS)
+ add_executable(mmal_vc_diag mmal_vc_diag.c)
+ target_link_libraries(mmal_vc_diag mmal mmal_vc_client debug_sym vcos)
+-install(TARGETS mmal_vc_diag RUNTIME DESTINATION bin)
++install(TARGETS mmal_vc_diag RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ endif(BUILD_MMAL_APPS)
+
+ include_directories ( ../../../host_applications/linux/libs/sm )
+
+-install(TARGETS mmal_vc_client DESTINATION lib)
++install(TARGETS mmal_vc_client DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ install(FILES
+ mmal_vc_api.h
+ mmal_vc_api_drm.h
+@@ -22,5 +22,5 @@ install(FILES
+ mmal_vc_msgs.h
+ mmal_vc_opaque_alloc.h
+ mmal_vc_shm.h
+- DESTINATION include/interface/mmal/vc
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/mmal/vc
+ )
+diff --git a/interface/vchiq_arm/CMakeLists.txt b/interface/vchiq_arm/CMakeLists.txt
+index 7af383d..e5a3224 100644
+--- a/interface/vchiq_arm/CMakeLists.txt
++++ b/interface/vchiq_arm/CMakeLists.txt
+@@ -5,7 +5,7 @@ add_library(vchiq_arm SHARED
+ # pull in VCHI cond variable emulation
+ target_link_libraries(vchiq_arm vcos)
+
+-install(TARGETS vchiq_arm DESTINATION lib)
++install(TARGETS vchiq_arm DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ #install(FILES etc/10-vchiq.rules DESTINATION /etc/udev/rules.d)
+
+ include_directories(../..)
+@@ -17,4 +17,4 @@ target_link_libraries(vchiq_test
+ vchiq_arm
+ vcos)
+
+-install(TARGETS vchiq_test RUNTIME DESTINATION bin)
++install(TARGETS vchiq_test RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/interface/vcos/CMakeLists.txt b/interface/vcos/CMakeLists.txt
+index 23a8d72..b0924a4 100644
+--- a/interface/vcos/CMakeLists.txt
++++ b/interface/vcos/CMakeLists.txt
+@@ -65,4 +65,4 @@ if (WIN32)
+ configure_file (build_all.bat.in build_all.bat @ONLY)
+ endif ()
+
+-#install (FILES ${HEADERS} DESTINATION include/interface/vcos)
++#install (FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/vcos)
+diff --git a/interface/vcos/generic/CMakeLists.txt b/interface/vcos/generic/CMakeLists.txt
+index c09f376..8af98fd 100644
+--- a/interface/vcos/generic/CMakeLists.txt
++++ b/interface/vcos/generic/CMakeLists.txt
+@@ -18,4 +18,4 @@ foreach (header ${HEADERS})
+ configure_file ("${header}" "${VCOS_HEADERS_BUILD_DIR}/generic/${header}" COPYONLY)
+ endforeach ()
+
+-install (FILES ${HEADERS} DESTINATION include/interface/vcos/generic)
++install (FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/vcos/generic)
+diff --git a/interface/vcos/pthreads/CMakeLists.txt b/interface/vcos/pthreads/CMakeLists.txt
+index d6cd415..821b3f3 100644
+--- a/interface/vcos/pthreads/CMakeLists.txt
++++ b/interface/vcos/pthreads/CMakeLists.txt
+@@ -50,5 +50,5 @@ else ()
+ endif ()
+
+
+-#install(FILES ${HEADERS} DESTINATION include)
+-install(TARGETS vcos DESTINATION lib)
++#install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
++install(TARGETS vcos DESTINATION ${CMAKE_INSTALL_LIBDIR})
+diff --git a/interface/vmcs_host/CMakeLists.txt b/interface/vmcs_host/CMakeLists.txt
+index 76813c9..0984d8a 100755
+--- a/interface/vmcs_host/CMakeLists.txt
++++ b/interface/vmcs_host/CMakeLists.txt
+@@ -35,5 +35,5 @@ target_link_libraries(vchostif vchiq_arm vcos)
+
+ #target_link_libraries(bufman WFC)
+
+-install(TARGETS ${INSTALL_TARGETS} DESTINATION lib)
++install(TARGETS ${INSTALL_TARGETS} DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+diff --git a/makefiles/cmake/vmcs.cmake b/makefiles/cmake/vmcs.cmake
+index 7c97463..a1eb911 100644
+--- a/makefiles/cmake/vmcs.cmake
++++ b/makefiles/cmake/vmcs.cmake
+@@ -16,7 +16,7 @@ endif()
+ SET(CMAKE_INSTALL_PREFIX "${VMCS_INSTALL_PREFIX}" CACHE INTERNAL "Prefix
+ prepended to install directories" FORCE)
+ if(NOT DEFINED VMCS_PLUGIN_DIR)
+- SET(VMCS_PLUGIN_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_SHARED_LIBRARY_PREFIX}/plugins)
++ SET(VMCS_PLUGIN_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/plugins)
+ endif()
+
+ # What kind of system are we?
+diff --git a/middleware/openmaxil/CMakeLists.txt b/middleware/openmaxil/CMakeLists.txt
+index 3e9c5f9..c063740 100644
+--- a/middleware/openmaxil/CMakeLists.txt
++++ b/middleware/openmaxil/CMakeLists.txt
+@@ -49,4 +49,4 @@ else ()
+
+ endif ()
+
+-install (TARGETS openmaxil DESTINATION lib)
++install (TARGETS openmaxil DESTINATION ${CMAKE_INSTALL_LIBDIR})
+diff --git a/pkgconfig/bcm_host.pc.in b/pkgconfig/bcm_host.pc.in
+index c7237c5..2988b42 100644
+--- a/pkgconfig/bcm_host.pc.in
++++ b/pkgconfig/bcm_host.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: bcm_host
+diff --git a/pkgconfig/brcmegl.pc.in b/pkgconfig/brcmegl.pc.in
+index 5dd3d5b..a45bf22 100644
+--- a/pkgconfig/brcmegl.pc.in
++++ b/pkgconfig/brcmegl.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: brcmEGL
+diff --git a/pkgconfig/brcmglesv2.pc.in b/pkgconfig/brcmglesv2.pc.in
+index e0e36f5..902fbf3 100644
+--- a/pkgconfig/brcmglesv2.pc.in
++++ b/pkgconfig/brcmglesv2.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: brcmGLESv2
+diff --git a/pkgconfig/brcmvg.pc.in b/pkgconfig/brcmvg.pc.in
+index 763a44b..98489ee 100644
+--- a/pkgconfig/brcmvg.pc.in
++++ b/pkgconfig/brcmvg.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: brcmOpenVG
+diff --git a/pkgconfig/mmal.pc.in b/pkgconfig/mmal.pc.in
+index 37d344c..1ffa4f5 100644
+--- a/pkgconfig/mmal.pc.in
++++ b/pkgconfig/mmal.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: MMAL
+diff --git a/pkgconfig/vcsm.pc.in b/pkgconfig/vcsm.pc.in
+index b12c56f..6f762cb 100644
+--- a/pkgconfig/vcsm.pc.in
++++ b/pkgconfig/vcsm.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: VCSM
diff --git a/recipes-graphics/userland/userland_git.bb b/recipes-graphics/userland/userland_git.bb
index d8265d4..bd50bf8 100644
--- a/recipes-graphics/userland/userland_git.bb
+++ b/recipes-graphics/userland/userland_git.bb
@@ -13,11 +13,11 @@ COMPATIBLE_MACHINE = "^rpi$"
SRCBRANCH = "master"
SRCFORK = "raspberrypi"
-SRCREV = "c4fd1b8986c6d6d4ae5cd51e65a8bbeb495dfa4e"
+SRCREV = "cc1ca18fb0689b01cc2ca2aa4b400dcee624a213"
# Use the date of the above commit as the package version. Update this when
# SRCREV is changed.
-PV = "20220323"
+PV = "20230419"
SRC_URI = "\
git://github.com/${SRCFORK}/userland.git;protocol=https;branch=${SRCBRANCH} \
@@ -46,6 +46,7 @@ SRC_URI = "\
file://0022-all-host_applications-remove-non-existent-projects.patch \
file://0023-hello_pi-optionally-build-wayland-specific-app.patch \
file://0024-userland-Sync-needed-defines-for-weston-build.patch \
+ file://0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch \
"
SRC_URI:remove:toolchain-clang = "file://0021-cmake-Disable-format-overflow-warning-as-error.patch"
diff --git a/recipes-graphics/wayland/weston_%.bbappend b/recipes-graphics/wayland/weston_%.bbappend
index 89917f0..f9ed06a 100644
--- a/recipes-graphics/wayland/weston_%.bbappend
+++ b/recipes-graphics/wayland/weston_%.bbappend
@@ -1,4 +1,4 @@
-PACKAGECONFIG:remove:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', 'fbdev', '', d)}"
+PACKAGECONFIG:remove:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', 'fbdev', 'egl clients', d)}"
EXTRA_OECONF:append:rpi = " \
--disable-xwayland-test \
diff --git a/recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb b/recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb
index b26e80e..bd5ed62 100644
--- a/recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb
+++ b/recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb
@@ -16,16 +16,18 @@ SECTION = "kernel"
# [^1]: https://github.com/RPi-Distro/bluez-firmware/issues/1
LICENSE = "Firmware-cypress-rpidistro"
LIC_FILES_CHKSUM = "\
- file://LICENCE.cypress-rpidistro;md5=c5d12ae0b24ef7177902a8e288751a4e \
+ file://LICENCE.cypress-rpidistro;md5=be80828daf682762f392131141288a74 \
"
# These are not common licenses, set NO_GENERIC_LICENSE for them
# so that the license files will be copied from fetched source
NO_GENERIC_LICENSE[Firmware-cypress-rpidistro] = "LICENCE.cypress-rpidistro"
-SRC_URI = "git://github.com/RPi-Distro/bluez-firmware;branch=master;protocol=https"
-SRCREV = "e7fd166981ab4bb9a36c2d1500205a078a35714d"
-PV = "1.2-4+rpt8"
+SRC_URI = " \
+ git://github.com/RPi-Distro/bluez-firmware;branch=bookworm;protocol=https \
+"
+SRCREV = "78d6a07730e2d20c035899521ab67726dc028e1c"
+PV = "1.2-9+rpt3"
S = "${WORKDIR}/git"
@@ -49,19 +51,21 @@ do_install() {
install -d ${D}${nonarch_base_libdir}/firmware/brcm
cp LICENCE.cypress-rpidistro ${D}${nonarch_base_libdir}/firmware
- install -m 0644 broadcom/BCM434*.hcd ${D}${nonarch_base_libdir}/firmware/brcm/
+ install -m 0644 debian/firmware/broadcom/BCM434*.hcd ${D}${nonarch_base_libdir}/firmware/brcm/
}
PACKAGES = "\
${PN}-cypress-license \
${PN}-bcm43430a1-hcd \
${PN}-bcm43430b0-hcd \
+ ${PN}-bcm4343a2-hcd \
${PN}-bcm4345c0-hcd \
${PN}-bcm4345c5-hcd \
"
LICENSE:${PN}-bcm43430a1-hcd = "Firmware-cypress-rpidistro"
LICENSE:${PN}-bcm43430b0-hcd = "Firmware-cypress-rpidistro"
+LICENSE:${PN}-bcm4343a2-hcd = "Firmware-cypress-rpidistro"
LICENSE:${PN}-bcm4345c0-hcd = "Firmware-cypress-rpidistro"
LICENSE:${PN}-bcm4345c5-hcd = "Firmware-cypress-rpidistro"
LICENSE:${PN}-cypress-license = "Firmware-cypress-rpidistro"
@@ -75,6 +79,9 @@ FILES:${PN}-bcm43430a1-hcd = "\
FILES:${PN}-bcm43430b0-hcd = "\
${nonarch_base_libdir}/firmware/brcm/BCM43430B0.hcd \
"
+FILES:${PN}-bcm4343a2-hcd = "\
+ ${nonarch_base_libdir}/firmware/brcm/BCM4343A2.hcd \
+"
FILES:${PN}-bcm4345c0-hcd = "\
${nonarch_base_libdir}/firmware/brcm/BCM4345C0.hcd \
"
@@ -84,12 +91,15 @@ FILES:${PN}-bcm4345c5-hcd = "\
RDEPENDS:${PN}-bcm43430a1-hcd += "${PN}-cypress-license"
RDEPENDS:${PN}-bcm43430b0-hcd += "${PN}-cypress-license"
+RDEPENDS:${PN}-bcm4343a2-hcd += "${PN}-cypress-license"
RDEPENDS:${PN}-bcm4345c0-hcd += "${PN}-cypress-license"
RDEPENDS:${PN}-bcm4345c5-hcd += "${PN}-cypress-license"
RCONFLICTS:${PN}-bcm43430a1-hcd = "linux-firmware-bcm43430a1-hcd"
RREPLACES:${PN}-bcm43430a1-hcd = "linux-firmware-bcm43430a1-hcd"
RCONFLICTS:${PN}-bcm43430b0-hcd = "linux-firmware-bcm43430b0-hcd"
RREPLACES:${PN}-bcm43430b0-hcd = "linux-firmware-bcm43430b0-hcd"
+RCONFLICTS:${PN}-bcm4343a2-hcd = "linux-firmware-bcm4343a2-hcd"
+RREPLACES:${PN}-bcm4343a2-hcd = "linux-firmware-bcm4343a2-hcd"
RCONFLICTS:${PN}-bcm43435c0-hcd = "linux-firmware-bcm4345c0-hcd"
RREPLACES:${PN}-bcm43435c0-hcd = "linux-firmware-bcm4345c0-hcd"
RCONFLICTS:${PN}-bcm43435c5-hcd = "linux-firmware-bcm4345c5-hcd"
diff --git a/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro/0001-Default-43455-firmware-to-standard-variant.patch b/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro/0001-Default-43455-firmware-to-standard-variant.patch
new file mode 100644
index 0000000..f67d95b
--- /dev/null
+++ b/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro/0001-Default-43455-firmware-to-standard-variant.patch
@@ -0,0 +1,28 @@
+From b9db43e36ad0942d33cb4db5b394abd722862568 Mon Sep 17 00:00:00 2001
+From: Andrei Gherzan <andrei.gherzan@huawei.com>
+Date: Fri, 9 Sep 2022 20:28:06 +0200
+Subject: [PATCH] Default 43455 firmware to standard variant
+
+The firmware for 43455 is loaded as a symlink: brcmfmac43455-sdio.bin.
+This symlink is now broken as the debian package handles the right
+target of this symlink through a postinstall. We don't have that logic
+here so we default to the standard variant.
+
+Upstream-Status: Inappropriate [issue reported at https://github.com/RPi-Distro/firmware-nonfree/issues/26]
+Signed-off-by: Andrei Gherzan <andrei.gherzan@huawei.com>
+---
+ debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin b/debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin
+index 9c39208..b914838 120000
+--- a/debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin
++++ b/debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin
+@@ -1 +1 @@
+-../cypress/cyfmac43455-sdio.bin
+\ No newline at end of file
++../cypress/cyfmac43455-sdio-standard.bin
+\ No newline at end of file
+--
+2.25.1
+
diff --git a/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb b/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb
index 81ccc2c..959513d 100644
--- a/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb
+++ b/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb
@@ -5,139 +5,103 @@ to linux-firmware for general use."
HOMEPAGE = "https://github.com/RPi-Distro/firmware-nonfree"
SECTION = "kernel"
-# In maintained upstream linux-firmware:
-# * brcmfmac43430-sdio falls under LICENSE.cypress
-# * brcmfmac43455-sdio falls under LICENSE.broadcom_bcm43xx
-# * brcmfmac43456-sdio falls under LICENSE.broadcom_bcm43xx
-#
-# It is likely[^1] that both of these should be under LICENSE.cypress.
-# Further, at this time the text of LICENSE.broadcom_bcm43xx is the same
-# in linux-firmware and RPi-Distro/firmware-nonfree, but this may
-# change.
-#
-# Rather than make assumptions about what's supposed to be what, we'll
-# use the license implied by the source of these files, named to avoid
-# conflicts with linux-firmware.
-#
-# [^1]: https://github.com/RPi-Distro/bluez-firmware/issues/1
-LICENSE = "\
- Firmware-broadcom_bcm43xx-rpidistro \
-"
+LICENSE = "GPL-2.0-only & binary-redist-Cypress-rpidistro & Synaptics-rpidistro"
LIC_FILES_CHKSUM = "\
- file://debian/config/brcm80211/LICENSE;md5=8cba1397cda6386db37210439a0da3eb \
+ file://debian/copyright;md5=291ee5385b4cf74b10c5fb5a46a7bbc6 \
"
-
-# These are not common licenses, set NO_GENERIC_LICENSE for them
-# so that the license files will be copied from fetched source
-NO_GENERIC_LICENSE[Firmware-broadcom_bcm43xx-rpidistro] = "debian/config/brcm80211/LICENSE"
-
-SRC_URI = "git://github.com/RPi-Distro/firmware-nonfree;branch=bullseye;protocol=https"
-
-SRCREV = "99d5c588e95ec9c9b86d7e88d3cf85b4f729d2bc"
-PV = "20210315-3+rpt4"
-
+# Where there are no common licenses, set NO_GENERIC_LICENSE so that the
+# license files will be copied from the fetched source.
+NO_GENERIC_LICENSE[binary-redist-Cypress-rpidistro] = "debian/copyright"
+NO_GENERIC_LICENSE[Synaptics-rpidistro] = "debian/copyright"
+LICENSE_FLAGS = "synaptics-killswitch"
+
+SRC_URI = "git://github.com/RPi-Distro/firmware-nonfree;branch=bookworm;protocol=https \
+ file://0001-Default-43455-firmware-to-standard-variant.patch \
+"
+SRCREV = "223ccf3a3ddb11b3ea829749fbbba4d65b380897"
+PV = "20230625-2+rpt2"
S = "${WORKDIR}/git"
inherit allarch
-CLEANBROKEN = "1"
-
-do_compile() {
- :
-}
+do_configure[noexec] = "1"
+do_compile[noexec] = "1"
do_install() {
install -d ${D}${nonarch_base_libdir}/firmware/brcm ${D}${nonarch_base_libdir}/firmware/cypress
- cp debian/config/brcm80211/LICENSE ${D}${nonarch_base_libdir}/firmware/LICENSE.broadcom_bcm43xx-rpidistro
+ cp debian/copyright ${D}${nonarch_base_libdir}/firmware/copyright.firmware-nonfree-rpidistro
- # Replace outdated linux-firmware files with updated ones from
- # raspbian firmware-nonfree. Raspbian adds blobs and nvram
- # definitions that are also necessary so copy those too.
- for fw in brcmfmac43430-sdio brcmfmac43436-sdio brcmfmac43436s-sdio brcmfmac43455-sdio brcmfmac43456-sdio ; do
+ for fw in \
+ brcmfmac43430-sdio \
+ brcmfmac43436-sdio \
+ brcmfmac43436s-sdio \
+ brcmfmac43455-sdio \
+ brcmfmac43456-sdio; do
cp -R --no-dereference --preserve=mode,links -v debian/config/brcm80211/brcm/${fw}.* ${D}${nonarch_base_libdir}/firmware/brcm/
done
+
cp -R --no-dereference --preserve=mode,links -v debian/config/brcm80211/cypress/* ${D}${nonarch_base_libdir}/firmware/cypress/
+
rm ${D}${nonarch_base_libdir}/firmware/cypress/README.txt
- # add compat links. Fixes errors like
- # brcmfmac mmc1:0001:1: Direct firmware load for brcm/brcmfmac43455-sdio.raspberrypi,4-model-compute-module.txt failed with error -2
- ln -s brcmfmac43455-sdio.txt ${D}${nonarch_base_libdir}/firmware/brcm/brcmfmac43455-sdio.raspberrypi,4-compute-module.txt
- # brcmfmac mmc1:0001:1: Direct firmware load for brcm/brcmfmac43455-sdio.raspberrypi,4-model-b.bin failed with error -2
- ln -s brcmfmac43455-sdio.bin ${D}${nonarch_base_libdir}/firmware/brcm/brcmfmac43455-sdio.raspberrypi,4-model-b.bin
}
PACKAGES = "\
- ${PN}-broadcom-license \
${PN}-bcm43430 \
- ${PN}-bcm43455 \
- ${PN}-bcm43456 \
${PN}-bcm43436 \
${PN}-bcm43436s \
+ ${PN}-bcm43439 \
+ ${PN}-bcm43455 \
+ ${PN}-bcm43456 \
+ ${PN}-license \
"
-LICENSE:${PN}-bcm43430 = "Firmware-broadcom_bcm43xx-rpidistro"
-LICENSE:${PN}-bcm43436 = "Firmware-broadcom_bcm43xx-rpidistro"
-LICENSE:${PN}-bcm43436s = "Firmware-broadcom_bcm43xx-rpidistro"
-LICENSE:${PN}-bcm43455 = "Firmware-broadcom_bcm43xx-rpidistro"
-LICENSE:${PN}-bcm43456 = "Firmware-broadcom_bcm43xx-rpidistro"
-LICENSE:${PN}-broadcom-license = "Firmware-broadcom_bcm43xx-rpidistro"
-FILES:${PN}-broadcom-license = "${nonarch_base_libdir}/firmware/LICENSE.broadcom_bcm43xx-rpidistro"
-FILES:${PN}-bcm43430 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43430* ${nonarch_base_libdir}/firmware/cypress/cyfmac43430-sdio.bin ${nonarch_base_libdir}/firmware/cypress/cyfmac43430-sdio.clm_blob"
+LICENSE:${PN}-bcm43430 = "binary-redist-Cypress-rpidistro"
+LICENSE:${PN}-bcm43436 = "Synaptics-rpidistro"
+LICENSE:${PN}-bcm43436s = "Synaptics-rpidistro"
+LICENSE:${PN}-bcm43439 = "Synaptics-rpidistro"
+LICENSE:${PN}-bcm43455 = "binary-redist-Cypress-rpidistro"
+LICENSE:${PN}-bcm43456 = "Synaptics-rpidistro"
+LICENSE:${PN}-license = "GPL-2.0-only"
+
+FILES:${PN}-bcm43430 = " \
+ ${nonarch_base_libdir}/firmware/brcm/brcmfmac43430* \
+ ${nonarch_base_libdir}/firmware/cypress/cyfmac43430-sdio.bin \
+ ${nonarch_base_libdir}/firmware/cypress/cyfmac43430-sdio.clm_blob \
+"
FILES:${PN}-bcm43436 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43436-*"
FILES:${PN}-bcm43436s = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43436s*"
-FILES:${PN}-bcm43455 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43455* ${nonarch_base_libdir}/firmware/cypress/cyfmac43455-sdio*"
-FILES:${PN}-bcm43456 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43456*"
-RDEPENDS:${PN}-bcm43430 += "${PN}-broadcom-license"
-RDEPENDS:${PN}-bcm43436 += "${PN}-broadcom-license"
-RDEPENDS:${PN}-bcm43436s += "${PN}-broadcom-license"
-RDEPENDS:${PN}-bcm43455 += "${PN}-broadcom-license"
-RDEPENDS:${PN}-bcm43456 += "${PN}-broadcom-license"
-RCONFLICTS:${PN}-bcm43430 = "\
- linux-firmware-bcm43430 \
- linux-firmware-raspbian-bcm43430 \
+FILES:${PN}-bcm43439 = " \
+ ${nonarch_base_libdir}/firmware/cypress/43439A0-7.95.49.00.combined \
+ ${nonarch_base_libdir}/firmware/cypress/cyfmac43439-sdio* \
"
-
-RREPLACES:${PN}-bcm43430 = "\
- linux-firmware-bcm43430 \
- linux-firmware-raspbian-bcm43430 \
-"
-
-RCONFLICTS:${PN}-bcm43436 = "\
- linux-firmware-bcm43436 \
- linux-firmware-raspbian-bcm43436 \
-"
-
-RREPLACES:${PN}-bcm43436 = "\
- linux-firmware-bcm43436 \
- linux-firmware-raspbian-bcm43436 \
-"
-
-RCONFLICTS:${PN}-bcm43436s = "\
- linux-firmware-bcm43436s \
- linux-firmware-raspbian-bcm43436s \
-"
-
-RREPLACES:${PN}-bcm43436s = "\
- linux-firmware-bcm43436s \
- linux-firmware-raspbian-bcm43436s \
-"
-
-RCONFLICTS:${PN}-bcm43455 = "\
- linux-firmware-bcm43455 \
- linux-firmware-raspbian-bcm43455 \
-"
-RREPLACES:${PN}-bcm43455 = "\
- linux-firmware-bcm43455 \
- linux-firmware-raspbian-bcm43455 \
-"
-RCONFLICTS:${PN}-bcm43456 = "\
- linux-firmware-bcm43456 \
- linux-firmware-raspbian-bcm43456 \
-"
-RREPLACES:${PN}-bcm43456 = "\
- linux-firmware-bcm43456 \
- linux-firmware-raspbian-bcm43456 \
+FILES:${PN}-bcm43455 = " \
+ ${nonarch_base_libdir}/firmware/brcm/brcmfmac43455* \
+ ${nonarch_base_libdir}/firmware/cypress/cyfmac43455-sdio* \
"
+FILES:${PN}-bcm43456 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43456*"
+FILES:${PN}-license = "${nonarch_base_libdir}/firmware/copyright.firmware-nonfree-rpidistro"
+
+RDEPENDS:${PN}-bcm43430 += "${PN}-license"
+RDEPENDS:${PN}-bcm43436 += "${PN}-license"
+RDEPENDS:${PN}-bcm43436s += "${PN}-license"
+RDEPENDS:${PN}-bcm43439 += "${PN}-license"
+RDEPENDS:${PN}-bcm43455 += "${PN}-license"
+RDEPENDS:${PN}-bcm43456 += "${PN}-license"
+
+RCONFLICTS:${PN}-bcm43430 = "linux-firmware-raspbian-bcm43430"
+RCONFLICTS:${PN}-bcm43436 = "linux-firmware-bcm43436"
+RCONFLICTS:${PN}-bcm43436s = "linux-firmware-bcm43436s"
+RCONFLICTS:${PN}-bcm43439 = "linux-firmware-bcm43439"
+RCONFLICTS:${PN}-bcm43455 = "linux-firmware-bcm43455"
+RCONFLICTS:${PN}-bcm43456 = "linux-firmware-bcm43456"
+
+RREPLACES:${PN}-bcm43430 = "linux-firmware-bcm43430"
+RREPLACES:${PN}-bcm43436 = "linux-firmware-bcm43436"
+RREPLACES:${PN}-bcm43436s = "linux-firmware-bcm43436s"
+RREPLACES:${PN}-bcm43439 = "linux-firmware-bcm43439"
+RREPLACES:${PN}-bcm43455 = "linux-firmware-bcm43455"
+RREPLACES:${PN}-bcm43456 = "linux-firmware-bcm43456"
# Firmware files are generally not run on the CPU, so they can be
# allarch despite being architecture specific
diff --git a/recipes-kernel/linux/files/0001-Revert-selftests-bpf-Skip-perf-hw-events-test-if-the.patch b/recipes-kernel/linux/files/0001-Revert-selftests-bpf-Skip-perf-hw-events-test-if-the.patch
deleted file mode 100644
index 66efde1..0000000
--- a/recipes-kernel/linux/files/0001-Revert-selftests-bpf-Skip-perf-hw-events-test-if-the.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-From 754e3030788702c1f013a88a4fc8546742d84e27 Mon Sep 17 00:00:00 2001
-From: Khem Raj <raj.khem@gmail.com>
-Date: Thu, 18 Jun 2020 13:45:04 -0700
-Subject: [PATCH] Revert "selftests/bpf: Skip perf hw events test if the setup
- disabled it"
-
-This reverts commit da43712a7262891317883d4b3a909fb18dac4b1d.
-
-Signed-off-by: Khem Raj <raj.khem@gmail.com>
----
- .../selftests/bpf/prog_tests/stacktrace_build_id_nmi.c | 8 ++------
- 1 file changed, 2 insertions(+), 6 deletions(-)
-
-diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
-index 437cb93e72ac..f62aa0eb959b 100644
---- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
-+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
-@@ -49,12 +49,8 @@ void test_stacktrace_build_id_nmi(void)
- pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
- 0 /* cpu 0 */, -1 /* group id */,
- 0 /* flags */);
-- if (pmu_fd < 0 && errno == ENOENT) {
-- printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__);
-- test__skip();
-- goto close_prog;
-- }
-- if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
-+ if (CHECK(pmu_fd < 0, "perf_event_open",
-+ "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
- pmu_fd, errno))
- goto close_prog;
-
---
-2.27.0
-
diff --git a/recipes-kernel/linux/files/0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch b/recipes-kernel/linux/files/0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch
new file mode 100644
index 0000000..4f64687
--- /dev/null
+++ b/recipes-kernel/linux/files/0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch
@@ -0,0 +1,50 @@
+From 32f53700aeef2f5c7797ddda66348fc0b29e1047 Mon Sep 17 00:00:00 2001
+From: Kees Cook <keescook@chromium.org>
+Date: Wed, 18 Jan 2023 12:21:35 -0800
+Subject: [PATCH] gcc-plugins: Reorganize gimple includes for GCC 13
+
+The gimple-iterator.h header must be included before gimple-fold.h
+starting with GCC 13. Reorganize gimple headers to work for all GCC
+versions.
+
+Reported-by: Palmer Dabbelt <palmer@rivosinc.com>
+Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
+Link: https://lore.kernel.org/all/20230113173033.4380-1-palmer@rivosinc.com/
+Cc: linux-hardening@vger.kernel.org
+Signed-off-by: Kees Cook <keescook@chromium.org>
+---
+Upstream-Status: Pending
+
+ scripts/gcc-plugins/gcc-common.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h
+index 0c087614fc3e..27770c31214c 100644
+--- a/scripts/gcc-plugins/gcc-common.h
++++ b/scripts/gcc-plugins/gcc-common.h
+@@ -77,8 +77,10 @@
+ #include "varasm.h"
+ #include "stor-layout.h"
+ #include "internal-fn.h"
++#include "gimple.h"
+ #include "gimple-expr.h"
+ #include "gimple-fold.h"
++#include "gimple-iterator.h"
+ #include "context.h"
+ #include "tree-ssa-alias.h"
+ #include "tree-ssa.h"
+@@ -91,11 +93,9 @@
+ #include "tree-eh.h"
+ #include "stmt.h"
+ #include "gimplify.h"
+-#include "gimple.h"
+ #include "tree-ssa-operands.h"
+ #include "tree-phinodes.h"
+ #include "tree-cfg.h"
+-#include "gimple-iterator.h"
+ #include "gimple-ssa.h"
+ #include "ssa-iterators.h"
+
+--
+2.39.1
+
diff --git a/recipes-kernel/linux/files/0002-Revert-selftests-bpf-Fix-perf_buffer-test-on-systems.patch b/recipes-kernel/linux/files/0002-Revert-selftests-bpf-Fix-perf_buffer-test-on-systems.patch
deleted file mode 100644
index d18b942..0000000
--- a/recipes-kernel/linux/files/0002-Revert-selftests-bpf-Fix-perf_buffer-test-on-systems.patch
+++ /dev/null
@@ -1,94 +0,0 @@
-From 366487b86a8c87954fb4ab7bd88ab49a929a32f6 Mon Sep 17 00:00:00 2001
-From: Khem Raj <raj.khem@gmail.com>
-Date: Mon, 13 Apr 2020 11:25:58 -0700
-Subject: [PATCH 2/2] Revert "selftests/bpf: Fix perf_buffer test on systems w/
- offline CPUs"
-
-This reverts commit 77bb53cb094828a31cd3c5b402899810f63073c1.
----
- .../selftests/bpf/prog_tests/perf_buffer.c | 29 ++++---------------
- 1 file changed, 5 insertions(+), 24 deletions(-)
-
-diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
-index cf6c87936c69..3003fddc0613 100644
---- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
-+++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
-@@ -4,7 +4,6 @@
- #include <sched.h>
- #include <sys/socket.h>
- #include <test_progs.h>
--#include "libbpf_internal.h"
-
- static void on_sample(void *ctx, int cpu, void *data, __u32 size)
- {
-@@ -20,7 +19,7 @@ static void on_sample(void *ctx, int cpu, void *data, __u32 size)
-
- void test_perf_buffer(void)
- {
-- int err, prog_fd, on_len, nr_on_cpus = 0, nr_cpus, i, duration = 0;
-+ int err, prog_fd, nr_cpus, i, duration = 0;
- const char *prog_name = "kprobe/sys_nanosleep";
- const char *file = "./test_perf_buffer.o";
- struct perf_buffer_opts pb_opts = {};
-@@ -30,27 +29,15 @@ void test_perf_buffer(void)
- struct bpf_object *obj;
- struct perf_buffer *pb;
- struct bpf_link *link;
-- bool *online;
-
- nr_cpus = libbpf_num_possible_cpus();
- if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus))
- return;
-
-- err = parse_cpu_mask_file("/sys/devices/system/cpu/online",
-- &online, &on_len);
-- if (CHECK(err, "nr_on_cpus", "err %d\n", err))
-- return;
--
-- for (i = 0; i < on_len; i++)
-- if (online[i])
-- nr_on_cpus++;
--
- /* load program */
- err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd);
-- if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno)) {
-- obj = NULL;
-- goto out_close;
-- }
-+ if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno))
-+ return;
-
- prog = bpf_object__find_program_by_title(obj, prog_name);
- if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name))
-@@ -77,11 +64,6 @@ void test_perf_buffer(void)
- /* trigger kprobe on every CPU */
- CPU_ZERO(&cpu_seen);
- for (i = 0; i < nr_cpus; i++) {
-- if (i >= on_len || !online[i]) {
-- printf("skipping offline CPU #%d\n", i);
-- continue;
-- }
--
- CPU_ZERO(&cpu_set);
- CPU_SET(i, &cpu_set);
-
-@@ -99,8 +81,8 @@ void test_perf_buffer(void)
- if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
- goto out_free_pb;
-
-- if (CHECK(CPU_COUNT(&cpu_seen) != nr_on_cpus, "seen_cpu_cnt",
-- "expect %d, seen %d\n", nr_on_cpus, CPU_COUNT(&cpu_seen)))
-+ if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt",
-+ "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen)))
- goto out_free_pb;
-
- out_free_pb:
-@@ -109,5 +91,4 @@ void test_perf_buffer(void)
- bpf_link__destroy(link);
- out_close:
- bpf_object__close(obj);
-- free(online);
- }
---
-2.26.0
-
diff --git a/recipes-kernel/linux/files/default-cpu-governor.cfg b/recipes-kernel/linux/files/default-cpu-governor.cfg
new file mode 100644
index 0000000..e2e201d
--- /dev/null
+++ b/recipes-kernel/linux/files/default-cpu-governor.cfg
@@ -0,0 +1,9 @@
+# The defconfigs from the RPi Kernel set "powersave" as the default CPU governor.
+# That is a bad idea as it reduces performance, so we unset that default option here.
+# The option to build the powersave governor (but not as the default) is also enabled.
+# A fix for this was sent to upstream: https://github.com/raspberrypi/linux/pull/5666
+# However, we need to carry this option override until those defconfigs are fixed on
+# *all* the kernel branches that we support. So that can be a long time depending
+# on whether the above PR gets accepted and/or backported to the stable branches.
+CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE=n
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
diff --git a/recipes-kernel/linux/files/raspberrypi4/rpi4-nvmem.cfg b/recipes-kernel/linux/files/raspberrypi4/rpi4-nvmem.cfg
new file mode 100644
index 0000000..0248162
--- /dev/null
+++ b/recipes-kernel/linux/files/raspberrypi4/rpi4-nvmem.cfg
@@ -0,0 +1 @@
+CONFIG_NVMEM_RMEM=y
diff --git a/recipes-kernel/linux/files/rpi.scc b/recipes-kernel/linux/files/rpi.scc
new file mode 100644
index 0000000..bb6fffd
--- /dev/null
+++ b/recipes-kernel/linux/files/rpi.scc
@@ -0,0 +1 @@
+patch 0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch
diff --git a/recipes-kernel/linux/linux-raspberrypi-v7.inc b/recipes-kernel/linux/linux-raspberrypi-v7.inc
new file mode 100644
index 0000000..77debc4
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi-v7.inc
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+KBUILD_DEFCONFIG:raspberrypi-armv7 = "bcm2709_defconfig"
+KERNEL_PACKAGE_NAME = "${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}"
+PROVIDES:remove = "virtual/kernel"
+
+KERNEL_IMAGETYPE_DIRECT ?= "zImage"
+
+COMPATIBLE_MACHINE = "^raspberrypi-armv7$"
+
+KERNEL_DEVICETREE = ""
diff --git a/recipes-kernel/linux/linux-raspberrypi-v7_5.15.bb b/recipes-kernel/linux/linux-raspberrypi-v7_5.15.bb
new file mode 100644
index 0000000..7883985
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi-v7_5.15.bb
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+require linux-raspberrypi-v7.inc
+require linux-raspberrypi_5.15.bb
diff --git a/recipes-kernel/linux/linux-raspberrypi-v7_6.1.bb b/recipes-kernel/linux/linux-raspberrypi-v7_6.1.bb
new file mode 100644
index 0000000..ef77b0b
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi-v7_6.1.bb
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+require linux-raspberrypi-v7.inc
+require linux-raspberrypi_6.1.bb
diff --git a/recipes-kernel/linux/linux-raspberrypi-v7_6.6.bb b/recipes-kernel/linux/linux-raspberrypi-v7_6.6.bb
new file mode 100644
index 0000000..a5695f6
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi-v7_6.6.bb
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+require linux-raspberrypi-v7.inc
+require linux-raspberrypi_6.6.bb
diff --git a/recipes-kernel/linux/linux-raspberrypi.inc b/recipes-kernel/linux/linux-raspberrypi.inc
index 6362a73..e62ff3f 100644
--- a/recipes-kernel/linux/linux-raspberrypi.inc
+++ b/recipes-kernel/linux/linux-raspberrypi.inc
@@ -3,7 +3,7 @@ SECTION = "kernel"
LICENSE = "GPL-2.0-only"
LIC_FILES_CHKSUM = "file://COPYING;md5=6bc538ed5bd9a7fc9398086aedcd7e46"
-COMPATIBLE_MACHINE = "^rpi$"
+COMPATIBLE_MACHINE ?= "^rpi$"
PE = "1"
PV = "${LINUX_VERSION}+git${SRCPV}"
@@ -15,8 +15,13 @@ SRC_URI += " \
${@bb.utils.contains("INITRAMFS_IMAGE_BUNDLE", "1", "file://initramfs-image-bundle.cfg", "", d)} \
${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "file://vc4graphics.cfg", "", d)} \
${@bb.utils.contains("MACHINE_FEATURES", "wm8960", "file://wm8960.cfg", "", d)} \
+ file://default-cpu-governor.cfg \
"
+SRC_URI:append:raspberrypi4 = " \
+ file://rpi4-nvmem.cfg \
+"
+
KCONFIG_MODE = "--alldefconfig"
KBUILD_DEFCONFIG:raspberrypi0-wifi ?= "bcmrpi_defconfig"
KBUILD_DEFCONFIG:raspberrypi ?= "bcmrpi_defconfig"
@@ -26,6 +31,9 @@ KBUILD_DEFCONFIG:raspberrypi3 ?= "bcm2709_defconfig"
KBUILD_DEFCONFIG:raspberrypi3-64 ?= "bcmrpi3_defconfig"
KBUILD_DEFCONFIG:raspberrypi4 ?= "bcm2711_defconfig"
KBUILD_DEFCONFIG:raspberrypi4-64 ?= "bcm2711_defconfig"
+KBUILD_DEFCONFIG:raspberrypi-armv7 ?= "bcm2711_defconfig"
+KBUILD_DEFCONFIG:raspberrypi-armv8 ?= "bcm2711_defconfig"
+KBUILD_DEFCONFIG:raspberrypi5 ?= "bcm2712_defconfig"
LINUX_VERSION_EXTENSION ?= ""
diff --git a/recipes-kernel/linux/linux-raspberrypi_5.10.bb b/recipes-kernel/linux/linux-raspberrypi_5.10.bb
deleted file mode 100644
index 8dade0b..0000000
--- a/recipes-kernel/linux/linux-raspberrypi_5.10.bb
+++ /dev/null
@@ -1,19 +0,0 @@
-LINUX_VERSION ?= "5.10.110"
-LINUX_RPI_BRANCH ?= "rpi-5.10.y"
-LINUX_RPI_KMETA_BRANCH ?= "yocto-5.10"
-
-SRCREV_machine = "89c0af71c9cf157a865afb526e9ebc21aadd531b"
-SRCREV_meta = "e1979ceb171bc91ef2cb71cfcde548a101dab687"
-
-KMETA = "kernel-meta"
-
-SRC_URI = " \
- git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH};protocol=https \
- git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
- file://powersave.cfg \
- file://android-drivers.cfg \
- "
-
-require linux-raspberrypi.inc
-
-KERNEL_DTC_FLAGS += "-@ -H epapr"
diff --git a/recipes-kernel/linux/linux-raspberrypi_5.15.bb b/recipes-kernel/linux/linux-raspberrypi_5.15.bb
index 77d4a98..3f167bb 100644
--- a/recipes-kernel/linux/linux-raspberrypi_5.15.bb
+++ b/recipes-kernel/linux/linux-raspberrypi_5.15.bb
@@ -1,15 +1,16 @@
-LINUX_VERSION ?= "5.15.34"
+LINUX_VERSION ?= "5.15.92"
LINUX_RPI_BRANCH ?= "rpi-5.15.y"
LINUX_RPI_KMETA_BRANCH ?= "yocto-5.15"
-SRCREV_machine = "0086da6acd41600d47b87b05874f99704216426f"
-SRCREV_meta = "e1b976ee4fb5af517cf01a9f2dd4a32f560ca894"
+SRCREV_machine = "14b35093ca68bf2c81bbc90aace5007142b40b40"
+SRCREV_meta = "509f4b9d68337f103633d48b621c1c9aa0dc975d"
KMETA = "kernel-meta"
SRC_URI = " \
git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH};protocol=https \
git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
+ file://rpi.scc \
file://powersave.cfg \
file://android-drivers.cfg \
"
@@ -17,3 +18,15 @@ SRC_URI = " \
require linux-raspberrypi.inc
KERNEL_DTC_FLAGS += "-@ -H epapr"
+
+RDEPENDS:${KERNEL_PACKAGE_NAME}:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-base:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-base"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-image:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-image"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dev:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dev"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-vmlinux:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-vmlinux"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-modules:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-modules"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dbg:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dbg"
+
+DEPLOYDEP = ""
+DEPLOYDEP:raspberrypi-armv7 = "${RASPBERRYPI_v7_KERNEL}:do_deploy"
+do_deploy[depends] += "${DEPLOYDEP}"
diff --git a/recipes-kernel/linux/linux-raspberrypi_6.1.bb b/recipes-kernel/linux/linux-raspberrypi_6.1.bb
new file mode 100644
index 0000000..5731a81
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi_6.1.bb
@@ -0,0 +1,31 @@
+LINUX_VERSION ?= "6.1.77"
+LINUX_RPI_BRANCH ?= "rpi-6.1.y"
+LINUX_RPI_KMETA_BRANCH ?= "yocto-6.1"
+
+SRCREV_machine = "77fc1fbcb5c013329af9583307dd1ff3cd4752aa"
+SRCREV_meta = "43d1723dbe0ce7b341cf32feeb35ecbe6b0ce29a"
+
+KMETA = "kernel-meta"
+
+SRC_URI = " \
+ git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH};protocol=https \
+ git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
+ file://powersave.cfg \
+ file://android-drivers.cfg \
+ "
+
+require linux-raspberrypi.inc
+
+KERNEL_DTC_FLAGS += "-@ -H epapr"
+
+RDEPENDS:${KERNEL_PACKAGE_NAME}:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-base:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-base"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-image:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-image"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dev:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dev"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-vmlinux:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-vmlinux"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-modules:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-modules"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dbg:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dbg"
+
+DEPLOYDEP = ""
+DEPLOYDEP:raspberrypi-armv7 = "${RASPBERRYPI_v7_KERNEL}:do_deploy"
+do_deploy[depends] += "${DEPLOYDEP}"
diff --git a/recipes-kernel/linux/linux-raspberrypi_6.6.bb b/recipes-kernel/linux/linux-raspberrypi_6.6.bb
new file mode 100644
index 0000000..b4d9953
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi_6.6.bb
@@ -0,0 +1,31 @@
+LINUX_VERSION ?= "6.6.22"
+LINUX_RPI_BRANCH ?= "rpi-6.6.y"
+LINUX_RPI_KMETA_BRANCH ?= "yocto-6.6"
+
+SRCREV_machine = "c04af98514c26014a4f29ec87b3ece95626059bd"
+SRCREV_meta = "6a24861d6504575a4a9f92366285332d47c7e111"
+
+KMETA = "kernel-meta"
+
+SRC_URI = " \
+ git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH};protocol=https \
+ git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
+ file://powersave.cfg \
+ file://android-drivers.cfg \
+ "
+
+require linux-raspberrypi.inc
+
+KERNEL_DTC_FLAGS += "-@ -H epapr"
+
+RDEPENDS:${KERNEL_PACKAGE_NAME}:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-base:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-base"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-image:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-image"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dev:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dev"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-vmlinux:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-vmlinux"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-modules:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-modules"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dbg:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dbg"
+
+DEPLOYDEP = ""
+DEPLOYDEP:raspberrypi-armv7 = "${RASPBERRYPI_v7_KERNEL}:do_deploy"
+do_deploy[depends] += "${DEPLOYDEP}"
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch
index c8af7da..5e206e5 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch
@@ -27,6 +27,8 @@ arrive:
gst_omx_component_wait_message()
---
+Upstream-Status: Pending
+
omx/gstomxvideodec.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch
index 4342326..db443e6 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch
@@ -3,6 +3,8 @@ From: Khem Raj <raj.khem@gmail.com>
Date: Sat, 13 Feb 2016 11:42:29 -0800
---
+Upstream-Status: Pending
+
omx/gstomxvideodec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch
index 144ced6..c0ef99d 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch
@@ -7,6 +7,8 @@ Without this commit the decoder streaming thread stops without ever attending
the drain request, leaving the decoder input thread waiting forever.
---
+Upstream-Status: Pending
+
omx/gstomx.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch
index 3245294..9914bb8 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch
@@ -11,7 +11,7 @@ Date: Fri, 4 Dec 2015 18:39:59 +0100
Subject: [PATCH] Don't abort gst_omx_video_dec_set_format() if there's a
timeout releasing the buffers taken by the egl_render out port
-Upstream-status: Pending
+Upstream-Status: Pending
Signed-off-by: Andrei Gherzan <andrei@gherzan.ro>
---
omx/gstomxvideodec.c | 2 ++
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend
index 2bf6281..5b3f945 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend
@@ -1,2 +1,2 @@
PACKAGECONFIG:append:rpi = " hls \
- ${@bb.utils.contains('LICENSE_FLAGS_ACCEPTED', 'commercial', 'gpl faad', '', d)}"
+ ${@bb.utils.contains('LICENSE_FLAGS_ACCEPTED', 'commercial', 'faad', '', d)}"
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_1.20.%.bbappend b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_%.bbappend
index f3fb144..f3fb144 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_1.20.%.bbappend
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_%.bbappend
diff --git a/recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch b/recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch
index 37d0724..f65c421 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch
@@ -5,6 +5,8 @@ Subject: [PATCH] Fix build with vc4 driver
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
SubtitleRenderer.cpp | 7 ++++++-
SubtitleRenderer.h | 1 +
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch b/recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch
index 82dfd3e..9e12bf3 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch
@@ -7,6 +7,8 @@ This helps in compiling with non-gcc compilers
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
Makefile.ffmpeg | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch b/recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch
index f6abd7b..0dd8c62 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch
@@ -10,7 +10,7 @@ The following issues break offline builds:
* Makefile.ffmpeg explicitly does a "git clone" from the internet.
Signed-off-by: Paul Barker <pbarker@toganlabs.com>
-Upstream-status: Inappropriate
+Upstream-Status: Inappropriate
---
Makefile | 6 ++----
diff --git a/recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch b/recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch
index 890adde..81dab07 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch
@@ -7,7 +7,7 @@ Additional dependency information is needed in Makefile.ffmpeg to ensure that
the configure stage is finished before the compile stage starts.
Signed-off-by: Paul Barker <pbarker@toganlabs.com>
-Upstream-status: Pending
+Upstream-Status: Pending
---
Makefile.ffmpeg | 4 ++--
diff --git a/recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch b/recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch
index a8c51d5..02844db 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch
@@ -1,3 +1,5 @@
+Upstream-Status: Pending
+
--- a/Makefile 2019-06-20 15:04:53.390282996 +0200
+++ b/Makefile 2019-06-20 15:03:45.538763872 +0200
@@ -1,4 +1,4 @@
diff --git a/recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch b/recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch
index 20ed7c7..5d7e1e0 100644
--- a/recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch
@@ -1,3 +1,5 @@
+Upstream-Status: Pending
+
Index: git/Makefile.ffmpeg
===================================================================
--- git.orig/Makefile.ffmpeg
diff --git a/recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch b/recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch
index e580470..e778561 100644
--- a/recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch
@@ -9,6 +9,8 @@ to the default value which obviously is wrong.
Signed-off-by: Andrei Gherzan <andrei@gherzan.ro>
---
+Upstream-Status: Pending
+
Makefile.ffmpeg | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/recipes-multimedia/omxplayer/omxplayer_git.bb b/recipes-multimedia/omxplayer/omxplayer_git.bb
index c15b40b..b7eaf40 100644
--- a/recipes-multimedia/omxplayer/omxplayer_git.bb
+++ b/recipes-multimedia/omxplayer/omxplayer_git.bb
@@ -9,9 +9,11 @@ LIC_FILES_CHKSUM = "file://COPYING;md5=00a27da7ac0f9bcd17320ec29ef4bbf6"
DEPENDS = "alsa-lib libpcre virtual/egl boost freetype dbus openssl libssh virtual/libomxil coreutils-native curl-native userland"
-PR = "r5"
+PR = "r6"
-SRCREV_default = "f543a0d0e707ab56415f17b0ca6d397394ee8b63"
+SRCREV_FORMAT = "_ffmpeg"
+
+SRCREV_default = "1f1d0ccd65d3a1caa86dc79d2863a8f067c8e3f8"
# omxplayer builds its own copy of ffmpeg from source instead of using the
# system's ffmpeg library. This isn't ideal but it's ok for now. We do however
@@ -83,6 +85,8 @@ export INCLUDES = "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", " -D_
export DIST = "${D}"
do_compile() {
+ bbwarn "omxplayer is being deprecated and resources are directed at improving vlc."
+
# Needed for compiler test in ffmpeg's configure
mkdir -p tmp
diff --git a/recipes-multimedia/picamera-libs/picamera-libs.bb b/recipes-multimedia/picamera-libs/picamera-libs.bb
index 2ebe413..f873a19 100644
--- a/recipes-multimedia/picamera-libs/picamera-libs.bb
+++ b/recipes-multimedia/picamera-libs/picamera-libs.bb
@@ -6,7 +6,7 @@ LIC_FILES_CHKSUM = "file://opt/vc/LICENCE;md5=86e53f5f5909ee66900418028de11780"
include recipes-bsp/common/raspberrypi-firmware.inc
-S = "${WORKDIR}/raspberrypi-firmware-1.${PV}"
+S = "${RPIFW_S}"
do_install(){
install -m 0755 -d ${D}${libdir}
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch
new file mode 100644
index 0000000..d9c07dd
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch
@@ -0,0 +1,292 @@
+From: James Cowgill <jcowgill@debian.org>
+Date: Sun, 11 Aug 2019 16:50:56 +0100
+Subject: avcodec/arm/sbcenc: avoid callee preserved vfp registers
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+When compiling FFmpeg with GCC-9, some very random segfaults were
+observed in code which had previously called down into the SBC encoder
+NEON assembly routines. This was caused by these functions clobbering
+some of the vfp callee saved registers (d8 - d15 aka q4 - q7). GCC was
+using these registers to save local variables, but after these
+functions returned, they would contain garbage.
+
+Fix by reallocating the registers in the two affected functions in
+the following way:
+ ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11
+ ff_sbc_analyze_8_neon: q2-q9 => q8-q15
+
+The reason for using these replacements is to keep closely related
+sets of registers consecutively numbered which hopefully makes the
+code more easy to follow. Since this commit only reallocates
+registers, it should have no performance impact.
+
+Signed-off-by: James Cowgill <jcowgill@debian.org>
+---
+ libavcodec/arm/sbcdsp_neon.S | 220 +++++++++++++++++++++----------------------
+ 1 file changed, 110 insertions(+), 110 deletions(-)
+
+diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
+index d83d21d..914abfb 100644
+--- a/libavcodec/arm/sbcdsp_neon.S
++++ b/libavcodec/arm/sbcdsp_neon.S
+@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
+ /* TODO: merge even and odd cases (or even merge all four calls to this
+ * function) in order to have only aligned reads from 'in' array
+ * and reduce number of load instructions */
+- vld1.16 {d4, d5}, [r0, :64]!
+- vld1.16 {d8, d9}, [r2, :128]!
++ vld1.16 {d16, d17}, [r0, :64]!
++ vld1.16 {d20, d21}, [r2, :128]!
+
+- vmull.s16 q0, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmull.s16 q1, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
++ vmull.s16 q0, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmull.s16 q1, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
+
+- vmlal.s16 q0, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q1, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
++ vmlal.s16 q0, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q1, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
+
+- vmlal.s16 q0, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q1, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
++ vmlal.s16 q0, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q1, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
+
+- vmlal.s16 q0, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q1, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
++ vmlal.s16 q0, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q1, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
+
+- vmlal.s16 q0, d4, d8
+- vmlal.s16 q1, d5, d9
++ vmlal.s16 q0, d16, d20
++ vmlal.s16 q1, d17, d21
+
+ vpadd.s32 d0, d0, d1
+ vpadd.s32 d1, d2, d3
+
+ vrshrn.s32 d0, q0, SBC_PROTO_FIXED_SCALE
+
+- vld1.16 {d2, d3, d4, d5}, [r2, :128]!
++ vld1.16 {d16, d17, d18, d19}, [r2, :128]!
+
+ vdup.i32 d1, d0[1] /* TODO: can be eliminated */
+ vdup.i32 d0, d0[0] /* TODO: can be eliminated */
+
+- vmull.s16 q3, d2, d0
+- vmull.s16 q4, d3, d0
+- vmlal.s16 q3, d4, d1
+- vmlal.s16 q4, d5, d1
++ vmull.s16 q10, d16, d0
++ vmull.s16 q11, d17, d0
++ vmlal.s16 q10, d18, d1
++ vmlal.s16 q11, d19, d1
+
+- vpadd.s32 d0, d6, d7 /* TODO: can be eliminated */
+- vpadd.s32 d1, d8, d9 /* TODO: can be eliminated */
++ vpadd.s32 d0, d20, d21 /* TODO: can be eliminated */
++ vpadd.s32 d1, d22, d23 /* TODO: can be eliminated */
+
+ vst1.32 {d0, d1}, [r1, :128]
+
+@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
+ /* TODO: merge even and odd cases (or even merge all four calls to this
+ * function) in order to have only aligned reads from 'in' array
+ * and reduce number of load instructions */
+- vld1.16 {d4, d5}, [r0, :64]!
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmull.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmull.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+- vmull.s16 q8, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmull.s16 q9, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmlal.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+- vmlal.s16 q8, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q9, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmlal.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+- vmlal.s16 q8, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q9, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmlal.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+- vmlal.s16 q8, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q9, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmlal.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+-
+- vmlal.s16 q8, d6, d10
+- vmlal.s16 q9, d7, d11
+-
+- vpadd.s32 d0, d12, d13
+- vpadd.s32 d1, d14, d15
+- vpadd.s32 d2, d16, d17
+- vpadd.s32 d3, d18, d19
++ vld1.16 {d16, d17}, [r0, :64]!
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmull.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmull.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++ vmull.s16 q14, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmull.s16 q15, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmlal.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++ vmlal.s16 q14, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q15, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmlal.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++ vmlal.s16 q14, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q15, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmlal.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++ vmlal.s16 q14, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q15, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmlal.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++
++ vmlal.s16 q14, d18, d22
++ vmlal.s16 q15, d19, d23
++
++ vpadd.s32 d0, d24, d25
++ vpadd.s32 d1, d26, d27
++ vpadd.s32 d2, d28, d29
++ vpadd.s32 d3, d30, d31
+
+ vrshr.s32 q0, q0, SBC_PROTO_FIXED_SCALE
+ vrshr.s32 q1, q1, SBC_PROTO_FIXED_SCALE
+@@ -153,38 +153,38 @@ function ff_sbc_analyze_8_neon, export=1
+ vdup.i32 d1, d0[1] /* TODO: can be eliminated */
+ vdup.i32 d0, d0[0] /* TODO: can be eliminated */
+
+- vld1.16 {d4, d5}, [r2, :128]!
+- vmull.s16 q6, d4, d0
+- vld1.16 {d6, d7}, [r2, :128]!
+- vmull.s16 q7, d5, d0
+- vmull.s16 q8, d6, d0
+- vmull.s16 q9, d7, d0
+-
+- vld1.16 {d4, d5}, [r2, :128]!
+- vmlal.s16 q6, d4, d1
+- vld1.16 {d6, d7}, [r2, :128]!
+- vmlal.s16 q7, d5, d1
+- vmlal.s16 q8, d6, d1
+- vmlal.s16 q9, d7, d1
+-
+- vld1.16 {d4, d5}, [r2, :128]!
+- vmlal.s16 q6, d4, d2
+- vld1.16 {d6, d7}, [r2, :128]!
+- vmlal.s16 q7, d5, d2
+- vmlal.s16 q8, d6, d2
+- vmlal.s16 q9, d7, d2
+-
+- vld1.16 {d4, d5}, [r2, :128]!
+- vmlal.s16 q6, d4, d3
+- vld1.16 {d6, d7}, [r2, :128]!
+- vmlal.s16 q7, d5, d3
+- vmlal.s16 q8, d6, d3
+- vmlal.s16 q9, d7, d3
+-
+- vpadd.s32 d0, d12, d13 /* TODO: can be eliminated */
+- vpadd.s32 d1, d14, d15 /* TODO: can be eliminated */
+- vpadd.s32 d2, d16, d17 /* TODO: can be eliminated */
+- vpadd.s32 d3, d18, d19 /* TODO: can be eliminated */
++ vld1.16 {d16, d17}, [r2, :128]!
++ vmull.s16 q12, d16, d0
++ vld1.16 {d18, d19}, [r2, :128]!
++ vmull.s16 q13, d17, d0
++ vmull.s16 q14, d18, d0
++ vmull.s16 q15, d19, d0
++
++ vld1.16 {d16, d17}, [r2, :128]!
++ vmlal.s16 q12, d16, d1
++ vld1.16 {d18, d19}, [r2, :128]!
++ vmlal.s16 q13, d17, d1
++ vmlal.s16 q14, d18, d1
++ vmlal.s16 q15, d19, d1
++
++ vld1.16 {d16, d17}, [r2, :128]!
++ vmlal.s16 q12, d16, d2
++ vld1.16 {d18, d19}, [r2, :128]!
++ vmlal.s16 q13, d17, d2
++ vmlal.s16 q14, d18, d2
++ vmlal.s16 q15, d19, d2
++
++ vld1.16 {d16, d17}, [r2, :128]!
++ vmlal.s16 q12, d16, d3
++ vld1.16 {d18, d19}, [r2, :128]!
++ vmlal.s16 q13, d17, d3
++ vmlal.s16 q14, d18, d3
++ vmlal.s16 q15, d19, d3
++
++ vpadd.s32 d0, d24, d25 /* TODO: can be eliminated */
++ vpadd.s32 d1, d26, d27 /* TODO: can be eliminated */
++ vpadd.s32 d2, d28, d29 /* TODO: can be eliminated */
++ vpadd.s32 d3, d30, d31 /* TODO: can be eliminated */
+
+ vst1.32 {d0, d1, d2, d3}, [r1, :128]
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch
new file mode 100644
index 0000000..f398791
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch
@@ -0,0 +1,34 @@
+From: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
+Date: Tue, 19 Jan 2021 20:35:29 +0100
+Subject: Fix build on powerpc and ppc64
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+---
+ libswscale/ppc/yuv2rgb_altivec.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/libswscale/ppc/yuv2rgb_altivec.c b/libswscale/ppc/yuv2rgb_altivec.c
+index 5365452..930ef6b 100644
+--- a/libswscale/ppc/yuv2rgb_altivec.c
++++ b/libswscale/ppc/yuv2rgb_altivec.c
+@@ -283,6 +283,16 @@ static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
+ * ------------------------------------------------------------------------------
+ */
+
++#if !HAVE_VSX
++static inline vector unsigned char vec_xl(signed long long offset, const ubyte *addr)
++{
++ const vector unsigned char *v_addr = (const vector unsigned char *) (addr + offset);
++ vector unsigned char align_perm = vec_lvsl(offset, addr);
++
++ return (vector unsigned char) vec_perm(v_addr[0], v_addr[1], align_perm);
++}
++#endif /* !HAVE_VSX */
++
+ #define DEFCSP420_CVT(name, out_pixels) \
+ static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
+ int *instrides, int srcSliceY, int srcSliceH, \
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch
new file mode 100644
index 0000000..11e3383
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch
@@ -0,0 +1,30 @@
+From: Paul B Mahol <onemda@gmail.com>
+Date: Sun, 14 Feb 2021 17:20:03 +0100
+Subject: avcodec/pngenc: remove monowhite from apng formats
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+Monowhite pixel format is not supported, and it does not make sense
+to add support for it.
+
+Fixes #7989
+---
+ libavcodec/pngenc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/libavcodec/pngenc.c b/libavcodec/pngenc.c
+index efcae8c..eebb164 100644
+--- a/libavcodec/pngenc.c
++++ b/libavcodec/pngenc.c
+@@ -1174,7 +1174,7 @@ AVCodec ff_apng_encoder = {
+ AV_PIX_FMT_PAL8,
+ AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+ AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
+- AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
++ AV_PIX_FMT_NONE
+ },
+ .priv_class = &apngenc_class,
+ };
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch
new file mode 100644
index 0000000..740ac0e
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch
@@ -0,0 +1,68341 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+--- a/configure
++++ b/configure
+@@ -207,6 +207,7 @@ External library support:
+ --disable-bzlib disable bzlib [autodetect]
+ --disable-coreimage disable Apple CoreImage framework [autodetect]
+ --enable-chromaprint enable audio fingerprinting with chromaprint [no]
++ --disable-epoxy disable epoxy [autodetect]
+ --enable-frei0r enable frei0r video filtering [no]
+ --enable-gcrypt enable gcrypt, needed for rtmp(t)e support
+ if openssl, librtmp or gmp is not used [no]
+@@ -274,6 +275,7 @@ External library support:
+ --enable-libtls enable LibreSSL (via libtls), needed for https support
+ if openssl, gnutls or mbedtls is not used [no]
+ --enable-libtwolame enable MP2 encoding via libtwolame [no]
++ --disable-libudev disable libudev [autodetect]
+ --enable-libv4l2 enable libv4l2/v4l-utils [no]
+ --enable-libvidstab enable video stabilization using vid.stab [no]
+ --enable-libvmaf enable vmaf filter via libvmaf [no]
+@@ -336,12 +338,17 @@ External library support:
+ --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no]
+ --enable-libnpp enable Nvidia Performance Primitives-based code [no]
+ --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
++ --enable-rpi enable other rpi specific stuff [no]
++ --enable-sand enable sand video formats [rpi]
++ --enable-vout-drm enable the vout_drm module - for internal testing only [no]
++ --enable-vout-egl enable the vout_egl module - for internal testing only [no]
+ --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
+ --disable-nvenc disable Nvidia video encoding code [autodetect]
+ --enable-omx enable OpenMAX IL code [no]
+ --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no]
+ --enable-rkmpp enable Rockchip Media Process Platform code [no]
+ --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect]
++ --enable-v4l2-request enable V4L2 request API code [no]
+ --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
+ --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
+ --disable-videotoolbox disable VideoToolbox code [autodetect]
+@@ -1699,7 +1706,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
+ avfoundation
+ bzlib
+ coreimage
++ epoxy
+ iconv
++ libudev
+ libxcb
+ libxcb_shm
+ libxcb_shape
+@@ -1861,7 +1870,10 @@ HWACCEL_LIBRARY_LIST="
+ mmal
+ omx
+ opencl
++ v4l2_request
+ vulkan
++ rpi4_8
++ rpi4_10
+ "
+
+ DOCUMENT_LIST="
+@@ -1877,12 +1889,16 @@ FEATURE_LIST="
+ gray
+ hardcoded_tables
+ omx_rpi
++ rpi
+ runtime_cpudetect
+ safe_bitstream_reader
++ sand
+ shared
+ small
+ static
+ swscale_alpha
++ vout_drm
++ vout_egl
+ "
+
+ # this list should be kept in linking order
+@@ -1923,6 +1939,7 @@ SUBSYSTEM_LIST="
+ pixelutils
+ network
+ rdft
++ rpi
+ "
+
+ # COMPONENT_LIST needs to come last to ensure correct dependency checking
+@@ -2405,9 +2422,11 @@ CONFIG_EXTRA="
+ rangecoder
+ riffdec
+ riffenc
++ rpi
+ rtpdec
+ rtpenc_chain
+ rv34dsp
++ sand
+ scene_sad
+ sinewin
+ snappy
+@@ -2737,6 +2756,8 @@ hap_decoder_select="snappy texturedsp"
+ hap_encoder_deps="libsnappy"
+ hap_encoder_select="texturedspenc"
+ hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp"
++hevc_rpi_decoder_deps="rpi"
++hevc_rpi_decoder_select="hevc_decoder sand"
+ huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
+ huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
+ hymt_decoder_select="huffyuv_decoder"
+@@ -2903,6 +2924,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder
+ dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
+ ffnvcodec_deps_any="libdl LoadLibrary"
+ nvdec_deps="ffnvcodec"
++v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev"
+ vaapi_x11_deps="xlib"
+ videotoolbox_hwaccel_deps="videotoolbox pthreads"
+ videotoolbox_hwaccel_extralibs="-framework QuartzCore"
+@@ -2934,6 +2956,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP
+ hevc_dxva2_hwaccel_select="hevc_decoder"
+ hevc_nvdec_hwaccel_deps="nvdec"
+ hevc_nvdec_hwaccel_select="hevc_decoder"
++hevc_v4l2request_hwaccel_deps="v4l2_request"
++hevc_v4l2request_hwaccel_select="hevc_decoder"
++hevc_rpi4_10_hwaccel_deps="rpi"
++hevc_rpi4_10_hwaccel_select="hevc_decoder"
++hevc_rpi4_8_hwaccel_deps="rpi"
++hevc_rpi4_8_hwaccel_select="hevc_decoder"
+ hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
+ hevc_vaapi_hwaccel_select="hevc_decoder"
+ hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
+@@ -3401,8 +3429,13 @@ sndio_indev_deps="sndio"
+ sndio_outdev_deps="sndio"
+ v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
+ v4l2_indev_suggest="libv4l2"
++v4l2_outdev_deps="libdrm"
+ v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
+ v4l2_outdev_suggest="libv4l2"
++vout_drm_outdev_deps="libdrm"
++vout_egl_outdev_deps="xlib epoxy"
++vout_rpi_outdev_deps="rpi"
++vout_rpi_outdev_select="sand"
+ vfwcap_indev_deps="vfw32 vfwcap_defines"
+ xcbgrab_indev_deps="libxcb"
+ xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
+@@ -3618,6 +3651,7 @@ tonemap_vaapi_filter_deps="vaapi VAProcF
+ tonemap_opencl_filter_deps="opencl const_nan"
+ transpose_opencl_filter_deps="opencl"
+ transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
++unsand_filter_select="sand"
+ unsharp_opencl_filter_deps="opencl"
+ uspp_filter_deps="gpl avcodec"
+ vaguedenoiser_filter_deps="gpl"
+@@ -6102,6 +6136,12 @@ check_func_headers glob.h glob
+ enabled xlib &&
+ check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext
+
++enabled libudev &&
++ check_pkg_config libudev libudev libudev.h udev_new
++
++enabled epoxy &&
++ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
++
+ check_headers direct.h
+ check_headers dirent.h
+ check_headers dxgidebug.h
+@@ -6430,11 +6470,12 @@ enabled mbedtls && { check_pkg
+ check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto ||
+ die "ERROR: mbedTLS not found"; }
+ enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
+-enabled mmal && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
++( enabled rpi ||
++ enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
+ { ! enabled cross_compile &&
+ add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
+ add_ldflags -L/opt/vc/lib/ &&
+- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } ||
++ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } ||
+ die "ERROR: mmal not found" &&
+ check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
+ enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
+@@ -6475,8 +6516,16 @@ enabled rkmpp && { require_p
+ { enabled libdrm ||
+ die "ERROR: rkmpp requires --enable-libdrm"; }
+ }
++enabled v4l2_request && { enabled libdrm ||
++ die "ERROR: v4l2-request requires --enable-libdrm"; } &&
++ { enabled libudev ||
++ die "ERROR: v4l2-request requires libudev"; }
+ enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
+
++enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
++
++enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
++ { enabled xlib || die "ERROR: vout_egl requires xlib"; }
+
+ if enabled gcrypt; then
+ GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
+@@ -6556,6 +6605,8 @@ if enabled v4l2_m2m; then
+ check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
+ fi
+
++check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
++check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
+ check_headers sys/videoio.h
+ test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
+
+--- a/fftools/ffmpeg.c
++++ b/fftools/ffmpeg.c
+@@ -2119,8 +2119,8 @@ static int ifilter_send_frame(InputFilte
+ ifilter->channel_layout != frame->channel_layout;
+ break;
+ case AVMEDIA_TYPE_VIDEO:
+- need_reinit |= ifilter->width != frame->width ||
+- ifilter->height != frame->height;
++ need_reinit |= ifilter->width != av_frame_cropped_width(frame) ||
++ ifilter->height != av_frame_cropped_height(frame);
+ break;
+ }
+
+@@ -2131,6 +2131,9 @@ static int ifilter_send_frame(InputFilte
+ (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data))
+ need_reinit = 1;
+
++ if (no_cvt_hw && fg->graph)
++ need_reinit = 0;
++
+ if (need_reinit) {
+ ret = ifilter_parameters_from_frame(ifilter, frame);
+ if (ret < 0)
+@@ -2401,8 +2404,7 @@ static int decode_video(InputStream *ist
+ decoded_frame->top_field_first = ist->top_field_first;
+
+ ist->frames_decoded++;
+-
+- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
++ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
+ err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
+ if (err < 0)
+ goto fail;
+@@ -2600,7 +2602,12 @@ static int process_input_packet(InputStr
+ case AVMEDIA_TYPE_VIDEO:
+ ret = decode_video (ist, repeating ? NULL : &avpkt, &got_output, &duration_pts, !pkt,
+ &decode_failed);
+- if (!repeating || !pkt || got_output) {
++ // Pi: Do not inc dts if no_cvt_hw set
++ // V4L2 H264 decode has long latency and sometimes spits out a long
++ // stream of output without input. In this case incrementing DTS is wrong.
++ // There may be cases where the condition as written is correct so only
++ // "fix" in the cases which cause problems
++ if (!repeating || !pkt || (got_output && !no_cvt_hw)) {
+ if (pkt && pkt->duration) {
+ duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
+ } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) {
+@@ -2820,6 +2827,16 @@ static enum AVPixelFormat get_format(AVC
+ } else {
+ const HWAccel *hwaccel = NULL;
+ int i;
++
++            if (no_cvt_hw) { /* -no_cvt_hw: keep hw frames, so accept a codec-internal hw format directly */
++                config = avcodec_get_hw_config(s->codec, 0); /* NULL when the codec lists no hw configs */
++                if (config && config->methods == AV_CODEC_HW_CONFIG_METHOD_INTERNAL) {
++                    av_log(s, AV_LOG_DEBUG, "no_cvt_hw so accepting pix_fmt %d with codec internal hwaccel\n", *p);
++                    ist->hwaccel_pix_fmt = *p;
++                    break; /* skips the hwaccels[] lookup below */
++                }
++            }
++
+ for (i = 0; hwaccels[i].name; i++) {
+ if (hwaccels[i].pix_fmt == *p) {
+ hwaccel = &hwaccels[i];
+@@ -2914,6 +2931,15 @@ static int init_input_stream(int ist_ind
+ return ret;
+ }
+
++#if CONFIG_HEVC_RPI_DECODER
++ ret = -1;
++ if (strcmp(codec->name, "hevc_rpi") == 0 &&
++ (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
++ ist->dec = codec = avcodec_find_decoder_by_name("hevc");
++ av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n");
++ }
++ if (ret < 0)
++#endif
+ if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
+ if (ret == AVERROR_EXPERIMENTAL)
+ abort_codec_experimental(codec, 0);
+--- a/fftools/ffmpeg.h
++++ b/fftools/ffmpeg.h
+@@ -61,6 +61,7 @@ enum HWAccelID {
+ HWACCEL_GENERIC,
+ HWACCEL_VIDEOTOOLBOX,
+ HWACCEL_QSV,
++ HWACCEL_RPI,
+ };
+
+ typedef struct HWAccel {
+@@ -590,6 +591,7 @@ extern int video_sync_method;
+ extern float frame_drop_threshold;
+ extern int do_benchmark;
+ extern int do_benchmark_all;
++extern int no_cvt_hw;
+ extern int do_deinterlace;
+ extern int do_hex_dump;
+ extern int do_pkt_dump;
+--- a/fftools/ffmpeg_filter.c
++++ b/fftools/ffmpeg_filter.c
+@@ -1186,8 +1186,8 @@ int ifilter_parameters_from_frame(InputF
+
+ ifilter->format = frame->format;
+
+- ifilter->width = frame->width;
+- ifilter->height = frame->height;
++ ifilter->width = av_frame_cropped_width(frame);
++ ifilter->height = av_frame_cropped_height(frame);
+ ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
+
+ ifilter->sample_rate = frame->sample_rate;
+--- a/fftools/ffmpeg_hw.c
++++ b/fftools/ffmpeg_hw.c
+@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum
+ char *name;
+ size_t index_pos;
+ int index, index_limit = 1000;
++ if (!type_name)
++ return NULL;
+ index_pos = strlen(type_name);
+ name = av_malloc(index_pos + 4);
+ if (!name)
+--- a/fftools/ffmpeg_opt.c
++++ b/fftools/ffmpeg_opt.c
+@@ -130,6 +130,12 @@ static const char *opt_name_enc_time_bas
+ }\
+ }
+
++#if CONFIG_RPI
++static int rpi_init(AVCodecContext *avctx) { /* init callback used by the "rpi" hwaccels[] entries */
++    return 0; /* nothing to set up here; always reports success */
++}
++#endif
++
+ const HWAccel hwaccels[] = {
+ #if CONFIG_VIDEOTOOLBOX
+ { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX },
+@@ -137,6 +143,10 @@ const HWAccel hwaccels[] = {
+ #if CONFIG_LIBMFX
+ { "qsv", qsv_init, HWACCEL_QSV, AV_PIX_FMT_QSV },
+ #endif
++#if CONFIG_RPI
++ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 },
++ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 },
++#endif
+ { 0 },
+ };
+ HWDevice *filter_hw_device;
+@@ -155,6 +165,7 @@ float frame_drop_threshold = 0;
+ int do_deinterlace = 0;
+ int do_benchmark = 0;
+ int do_benchmark_all = 0;
++int no_cvt_hw = 0;
+ int do_hex_dump = 0;
+ int do_pkt_dump = 0;
+ int copy_ts = 0;
+@@ -3460,6 +3471,8 @@ const OptionDef options[] = {
+ "add timings for benchmarking" },
+ { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all },
+ "add timings for each task" },
++ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw },
++ "do not auto-convert hw frames to sw" },
+ { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress },
+ "write program-readable progress information", "url" },
+ { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction },
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h
+ mediacodec.h \
+ packet.h \
+ qsv.h \
++ rpi_zc.h \
+ vaapi.h \
+ vdpau.h \
+ version.h \
+@@ -138,6 +139,7 @@ OBJS-$(CONFIG_QSVDEC) +
+ OBJS-$(CONFIG_QSVENC) += qsvenc.o
+ OBJS-$(CONFIG_RANGECODER) += rangecoder.o
+ OBJS-$(CONFIG_RDFT) += rdft.o
++OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o
+ OBJS-$(CONFIG_RV34DSP) += rv34dsp.o
+ OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o
+ OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o
+@@ -152,7 +154,10 @@ OBJS-$(CONFIG_VIDEODSP) +
+ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
+ OBJS-$(CONFIG_VP56DSP) += vp56dsp.o
+ OBJS-$(CONFIG_VP8DSP) += vp8dsp.o
+-OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
++OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
++ weak_link.o
++OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
++ v4l2_req_devscan.o weak_link.o
+ OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o
+ OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o
+
+@@ -391,6 +396,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER) +
+ OBJS-$(CONFIG_HEVC_QSV_ENCODER) += qsvenc_hevc.o hevc_ps_enc.o \
+ hevc_data.o
+ OBJS-$(CONFIG_HEVC_RKMPP_DECODER) += rkmppdec.o
++OBJS-$(CONFIG_RPI) += rpi_mem.o \
++ rpi_mailbox.o rpi_zc.o
++OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \
++ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \
++ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \
++ rpi_hevc_shader.o rpi_hevc_shader_template.o \
++ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \
++ rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o
+ OBJS-$(CONFIG_HEVC_VAAPI_ENCODER) += vaapi_encode_h265.o h265_profile_level.o
+ OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER) += v4l2_m2m_dec.o
+ OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER) += v4l2_m2m_enc.o
+@@ -909,6 +922,10 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)
+ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o
+ OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o
+ OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec_h2645.o
++OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o
++OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o
++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\
++ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o
+ OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o
+ OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o
+ OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o
+@@ -1261,3 +1278,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
+ $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
+ $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
+ endif
++
++ifdef CONFIG_HEVC_RPI_DECODER
++QASM_PY := ../local/bin/qasm.py
++VASMVIDCORE := ../local/bin/vasmvidcore_std
++
++ifneq ("$(wildcard $(QASM_PY))","")
++$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm
++ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
++
++$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
++ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
++endif
++
++ifneq ("$(wildcard $(VASMVIDCORE))","")
++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
++
++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
++ python pi-util/make_array.py $<
++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
++ python pi-util/make_array.py $<
++endif
++
++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h # generated transform headers
++$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_hevc_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h # object name matches the OBJS list
++endif
+--- a/libavcodec/aarch64/Makefile
++++ b/libavcodec/aarch64/Makefile
+@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED)
+ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
+ aarch64/hpeldsp_neon.o
+ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
+-NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o
++NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
++ aarch64/simple_idct_neon.o
+ NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
+ NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
+ NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
++NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o
+ NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
+
+ # decoders/encoders
+--- a/libavcodec/aarch64/idctdsp_init_aarch64.c
++++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
+@@ -27,19 +27,29 @@
+ #include "libavcodec/idctdsp.h"
+ #include "idct.h"
+
++void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++
+ av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+ unsigned high_bit_depth)
+ {
+ int cpu_flags = av_get_cpu_flags();
+
+- if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
+- if (avctx->idct_algo == FF_IDCT_AUTO ||
+- avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+- avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+- c->idct_put = ff_simple_idct_put_neon;
+- c->idct_add = ff_simple_idct_add_neon;
+- c->idct = ff_simple_idct_neon;
+- c->perm_type = FF_IDCT_PERM_PARTTRANS;
++ if (have_neon(cpu_flags)) {
++ if (!avctx->lowres && !high_bit_depth) {
++ if (avctx->idct_algo == FF_IDCT_AUTO ||
++ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
++ avctx->idct_algo == FF_IDCT_SIMPLENEON) {
++ c->idct_put = ff_simple_idct_put_neon;
++ c->idct_add = ff_simple_idct_add_neon;
++ c->idct = ff_simple_idct_neon;
++ c->perm_type = FF_IDCT_PERM_PARTTRANS;
++ }
+ }
++
++ c->add_pixels_clamped = ff_add_pixels_clamped_neon;
++ c->put_pixels_clamped = ff_put_pixels_clamped_neon;
++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+ }
+ }
+--- /dev/null
++++ b/libavcodec/aarch64/idctdsp_neon.S
+@@ -0,0 +1,130 @@
++/*
++ * IDCT AArch64 NEON optimisations
++ *
++ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// Clamp 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++//   x0 -> array of 64x 16-bit coefficients
++//   x1 -> 8-bit results
++//   x2 =  row stride for results, bytes
++function ff_put_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64     // first 64 bytes: coefficient rows 0-3
++        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]          // next 64 bytes: coefficient rows 4-7
++        sqxtun          v0.8b,  v0.8h                                   // narrow s16 -> u8 with unsigned saturation
++        sqxtun          v1.8b,  v1.8h
++        sqxtun          v2.8b,  v2.8h
++        sqxtun          v3.8b,  v3.8h
++        sqxtun          v4.8b,  v4.8h
++        st1             {v0.8b}, [x1], x2                               // store row 0, advance by stride
++        sqxtun          v0.8b,  v5.8h                                   // v0 is free again: reuse it for row 5
++        st1             {v1.8b}, [x1], x2
++        sqxtun          v1.8b,  v6.8h                                   // row 6
++        st1             {v2.8b}, [x1], x2
++        sqxtun          v2.8b,  v7.8h                                   // row 7
++        st1             {v3.8b}, [x1], x2
++        st1             {v4.8b}, [x1], x2                               // row 4
++        st1             {v0.8b}, [x1], x2                               // row 5
++        st1             {v1.8b}, [x1], x2                               // row 6
++        st1             {v2.8b}, [x1]                                   // row 7 (no post-increment needed)
++        ret
++endfunc
++
++// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
++// On entry:
++//   x0 -> array of 64x 16-bit coefficients
++//   x1 -> 8-bit results
++//   x2 =  row stride for results, bytes
++function ff_put_signed_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64     // first 64 bytes: coefficient rows 0-3
++        movi            v4.8b,  #128                                    // bias added to every narrowed byte
++        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]      // next 64 bytes: coefficient rows 4-7
++        sqxtn           v0.8b,  v0.8h                                   // narrow s16 -> s8 with signed saturation
++        sqxtn           v1.8b,  v1.8h
++        sqxtn           v2.8b,  v2.8h
++        sqxtn           v3.8b,  v3.8h
++        sqxtn           v5.8b,  v16.8h                                  // row 4
++        add             v0.8b,  v0.8b,  v4.8b                           // row 0 + 128 (modulo 256)
++        sqxtn           v6.8b,  v17.8h                                  // row 5
++        add             v1.8b,  v1.8b,  v4.8b
++        sqxtn           v7.8b,  v18.8h                                  // row 6
++        add             v2.8b,  v2.8b,  v4.8b
++        sqxtn           v16.8b, v19.8h                                  // row 7
++        add             v3.8b,  v3.8b,  v4.8b
++        st1             {v0.8b}, [x1], x2                               // store row 0, advance by stride
++        add             v0.8b,  v5.8b,  v4.8b                           // v0 reused for biased row 4
++        st1             {v1.8b}, [x1], x2
++        add             v1.8b,  v6.8b,  v4.8b                           // biased row 5
++        st1             {v2.8b}, [x1], x2
++        add             v2.8b,  v7.8b,  v4.8b                           // biased row 6
++        st1             {v3.8b}, [x1], x2
++        add             v3.8b,  v16.8b, v4.8b                           // biased row 7
++        st1             {v0.8b}, [x1], x2                               // row 4
++        st1             {v1.8b}, [x1], x2                               // row 5
++        st1             {v2.8b}, [x1], x2                               // row 6
++        st1             {v3.8b}, [x1]                                   // row 7
++        ret
++endfunc
++
++// Add 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++//   x0 -> array of 64x 16-bit coefficients
++//   x1 -> 8-bit input and results
++//   x2 =  row stride for 8-bit input and results, bytes
++function ff_add_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64     // first 64 bytes: coefficient rows 0-3
++        mov             x3,  x1                                         // x1 walks the loads, x3 walks the stores
++        ld1             {v4.8b}, [x1], x2                               // existing pixel rows 0-3
++        ld1             {v5.8b}, [x1], x2
++        ld1             {v6.8b}, [x1], x2
++        ld1             {v7.8b}, [x1], x2
++        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]      // next 64 bytes: coefficient rows 4-7
++        uaddw           v0.8h,  v0.8h,  v4.8b                           // coefficient + zero-extended pixel, 16-bit lanes
++        uaddw           v1.8h,  v1.8h,  v5.8b
++        uaddw           v2.8h,  v2.8h,  v6.8b
++        ld1             {v4.8b}, [x1], x2                               // existing pixel rows 4-7 (v4-v7 reused)
++        uaddw           v3.8h,  v3.8h,  v7.8b
++        ld1             {v5.8b}, [x1], x2
++        sqxtun          v0.8b,  v0.8h                                   // narrow s16 -> u8 with unsigned saturation
++        ld1             {v6.8b}, [x1], x2
++        sqxtun          v1.8b,  v1.8h
++        ld1             {v7.8b}, [x1]
++        sqxtun          v2.8b,  v2.8h
++        sqxtun          v3.8b,  v3.8h
++        uaddw           v4.8h,  v16.8h, v4.8b                           // row 4 sum
++        st1             {v0.8b}, [x3], x2                               // store row 0
++        uaddw           v0.8h,  v17.8h, v5.8b                           // row 5 sum (v0 reused after its store)
++        st1             {v1.8b}, [x3], x2
++        uaddw           v1.8h,  v18.8h, v6.8b                           // row 6 sum
++        st1             {v2.8b}, [x3], x2
++        uaddw           v2.8h,  v19.8h, v7.8b                           // row 7 sum
++        sqxtun          v4.8b,  v4.8h
++        sqxtun          v0.8b,  v0.8h
++        st1             {v3.8b}, [x3], x2                               // store row 3
++        sqxtun          v1.8b,  v1.8h
++        sqxtun          v2.8b,  v2.8h
++        st1             {v4.8b}, [x3], x2                               // row 4
++        st1             {v0.8b}, [x3], x2                               // row 5
++        st1             {v1.8b}, [x3], x2                               // row 6
++        st1             {v2.8b}, [x3]                                   // row 7
++        ret
++endfunc
+--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
++++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
+@@ -21,10 +21,28 @@
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/aarch64/cpu.h"
++#include "libavutil/intreadwrite.h"
+ #include "libavcodec/vc1dsp.h"
+
+ #include "config.h"
+
++void ff_vc1_inv_trans_8x8_neon(int16_t *block);
++void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++
+ void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+ void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
++
++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
++{
++    /* Dealing with starting and stopping, and removing escape bytes, are
++     * comparatively less time-sensitive, so are more clearly expressed using
++     * a C wrapper around the assembly inner loop. Note that we assume a
++     * little-endian machine that supports unaligned loads. */
++    int dsize = 0; /* number of bytes written to dst; this is the return value */
++    while (size >= 4)
++    {
++        int found = 0; /* set when src points at bytes 00 00 03 followed by a byte <= 03 */
++        while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* bytewise copy until dst is 8-byte aligned */
++        {
++            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* mask ignores the low 2 bits of the 4th byte */
++            if (!found)
++            {
++                *dst++ = *src++;
++                --size;
++                ++dsize;
++            }
++        }
++        if (!found)
++        {
++            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* NOTE(review): presumably the helper returns the count of input bytes it left unprocessed - confirm against the asm */
++            dst += skip;
++            src += skip;
++            size -= skip;
++            dsize += skip;
++            while (!found && size >= 4) /* bytewise from where the helper stopped, until an escape or < 4 bytes remain */
++            {
++                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
++                if (!found)
++                {
++                    *dst++ = *src++;
++                    --size;
++                    ++dsize;
++                }
++            }
++        }
++        if (found)
++        {
++            *dst++ = *src++; /* keep the two leading 00 bytes */
++            *dst++ = *src++;
++            ++src; /* drop the 03 escape byte */
++            size -= 3; /* three input bytes consumed ... */
++            dsize += 2; /* ... but only two emitted */
++        }
++    }
++    while (size > 0) /* fewer than 4 bytes left: copy them through unchanged */
++    {
++        *dst++ = *src++;
++        --size;
++        ++dsize;
++    }
++    return dsize;
++}
++
+ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
+ {
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
++ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
++ dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
++ dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
++ dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
++ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
++ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
++ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
++ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
++
++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;
++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon;
++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
++
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
++
++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
+ }
+ }
+--- /dev/null
++++ b/libavcodec/aarch64/vc1dsp_neon.S
+@@ -0,0 +1,1546 @@
++/*
++ * VC1 AArch64 NEON optimisations
++ *
++ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// VC-1 8x8 inverse transform
++// On entry:
++// x0 -> array of 16-bit inverse transform coefficients, in column-major order
++// On exit:
++// array at x0 updated to hold transformed block; also now held in row-major order
++function ff_vc1_inv_trans_8x8_neon, export=1
++ ld1 {v1.16b, v2.16b}, [x0], #32
++ ld1 {v3.16b, v4.16b}, [x0], #32
++ ld1 {v5.16b, v6.16b}, [x0], #32
++ shl v1.8h, v1.8h, #2 // 8/2 * src[0]
++ sub x1, x0, #3*32
++ ld1 {v16.16b, v17.16b}, [x0]
++ shl v7.8h, v2.8h, #4 // 16 * src[8]
++ shl v18.8h, v2.8h, #2 // 4 * src[8]
++ shl v19.8h, v4.8h, #4 // 16 * src[24]
++ ldr d0, .Lcoeffs_it8
++ shl v5.8h, v5.8h, #2 // 8/2 * src[32]
++ shl v20.8h, v6.8h, #4 // 16 * src[40]
++ shl v21.8h, v6.8h, #2 // 4 * src[40]
++ shl v22.8h, v17.8h, #4 // 16 * src[56]
++ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40]
++ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16]
++ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40]
++ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56]
++ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56]
++ shl v3.8h, v3.8h, #3 // 16/2 * src[16]
++ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
++ ssra v1.8h, v1.8h, #1 // 12/2 * src[0]
++ ssra v5.8h, v5.8h, #1 // 12/2 * src[32]
++ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
++ shl v21.8h, v16.8h, #3 // 16/2 * src[48]
++ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
++ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
++ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
++ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
++ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
++ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
++ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
++ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
++ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
++ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
++ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
++ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
++ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
++ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
++ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
++ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
++ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
++ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
++ neg v3.8h, v7.8h // -t1
++ neg v4.8h, v20.8h // +t2
++ neg v6.8h, v19.8h // +t3
++ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1
++ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1
++ neg v7.8h, v18.8h // +t4
++ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1
++ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1
++ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1
++ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1
++ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1
++ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1
++ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3
++ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3
++ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3
++ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3
++ srshr v2.8h, v2.8h, #2 // (t8 - t4 + 4) >> 3
++ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3
++ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3
++ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3
++ trn2 v17.8h, v3.8h, v4.8h
++ trn2 v18.8h, v5.8h, v6.8h
++ trn2 v19.8h, v2.8h, v1.8h
++ trn2 v20.8h, v7.8h, v16.8h
++ trn1 v21.4s, v17.4s, v18.4s
++ trn2 v17.4s, v17.4s, v18.4s
++ trn1 v18.4s, v19.4s, v20.4s
++ trn2 v19.4s, v19.4s, v20.4s
++ trn1 v3.8h, v3.8h, v4.8h
++ trn2 v4.2d, v21.2d, v18.2d
++ trn1 v20.2d, v17.2d, v19.2d
++ trn1 v5.8h, v5.8h, v6.8h
++ trn1 v1.8h, v2.8h, v1.8h
++ trn1 v2.8h, v7.8h, v16.8h
++ trn1 v6.2d, v21.2d, v18.2d
++ trn2 v7.2d, v17.2d, v19.2d
++ shl v16.8h, v20.8h, #4 // 16 * src[24]
++ shl v17.8h, v4.8h, #4 // 16 * src[40]
++ trn1 v18.4s, v3.4s, v5.4s
++ trn1 v19.4s, v1.4s, v2.4s
++ shl v21.8h, v7.8h, #4 // 16 * src[56]
++ shl v22.8h, v6.8h, #2 // 4 * src[8]
++ shl v23.8h, v4.8h, #2 // 4 * src[40]
++ trn2 v3.4s, v3.4s, v5.4s
++ trn2 v1.4s, v1.4s, v2.4s
++ shl v2.8h, v6.8h, #4 // 16 * src[8]
++ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40]
++ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40]
++ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56]
++ trn1 v22.2d, v18.2d, v19.2d
++ trn2 v18.2d, v18.2d, v19.2d
++ trn1 v19.2d, v3.2d, v1.2d
++ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56]
++ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
++ shl v21.8h, v22.8h, #2 // 8/2 * src[0]
++ shl v18.8h, v18.8h, #2 // 8/2 * src[32]
++ mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
++ shl v6.8h, v19.8h, #3 // 16/2 * src[16]
++ trn2 v1.2d, v3.2d, v1.2d
++ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
++ ssra v21.8h, v21.8h, #1 // 12/2 * src[0]
++ ssra v18.8h, v18.8h, #1 // 12/2 * src[32]
++ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16]
++ shl v19.8h, v1.8h, #3 // 16/2 * src[48]
++ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
++ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
++ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
++ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
++ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
++ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
++ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
++ add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
++ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
++ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
++ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
++ neg v21.8h, v17.8h // +t2
++ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
++ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
++ neg v4.8h, v5.8h // +t3
++ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
++ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
++ neg v24.8h, v16.8h // +t4
++ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
++ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
++ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1
++ neg v3.8h, v2.8h // -t1
++ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1
++ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1
++ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1
++ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1
++ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1
++ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1
++ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1
++ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7
++ srshr v3.8h, v7.8h, #6 // (t6 + t2 + 64) >> 7
++ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7
++ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7
++ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7
++ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7
++ st1 {v2.16b, v3.16b}, [x1], #32
++ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7
++ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7
++ st1 {v4.16b, v5.16b}, [x1], #32
++ st1 {v16.16b, v17.16b}, [x1], #32
++ st1 {v0.16b, v1.16b}, [x1]
++ ret
++endfunc
++
++// VC-1 8x4 inverse transform
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> array of 16-bit inverse transform coefficients, in row-major order
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x4_neon, export=1
++ ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
++ mov x3, x0
++ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
++ ld1 {v5.8b}, [x0], x1
++ trn2 v6.4h, v1.4h, v3.4h
++ trn2 v7.4h, v2.4h, v4.4h
++ trn1 v1.4h, v1.4h, v3.4h
++ trn1 v2.4h, v2.4h, v4.4h
++ trn2 v3.4h, v16.4h, v18.4h
++ trn2 v4.4h, v17.4h, v19.4h
++ trn1 v16.4h, v16.4h, v18.4h
++ trn1 v17.4h, v17.4h, v19.4h
++ ld1 {v18.8b}, [x0], x1
++ trn1 v19.2s, v6.2s, v3.2s
++ trn2 v3.2s, v6.2s, v3.2s
++ trn1 v6.2s, v7.2s, v4.2s
++ trn2 v4.2s, v7.2s, v4.2s
++ trn1 v7.2s, v1.2s, v16.2s
++ trn1 v20.2s, v2.2s, v17.2s
++ shl v21.4h, v19.4h, #4 // 16 * src[1]
++ trn2 v1.2s, v1.2s, v16.2s
++ shl v16.4h, v3.4h, #4 // 16 * src[3]
++ trn2 v2.2s, v2.2s, v17.2s
++ shl v17.4h, v6.4h, #4 // 16 * src[5]
++ ld1 {v22.8b}, [x0], x1
++ shl v23.4h, v4.4h, #4 // 16 * src[7]
++ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2]
++ ld1 {v25.8b}, [x0]
++ shl v26.4h, v19.4h, #2 // 4 * src[1]
++ shl v27.4h, v6.4h, #2 // 4 * src[5]
++ ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7]
++ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5]
++ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7]
++ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5]
++ shl v7.4h, v7.4h, #2 // 8/2 * src[0]
++ shl v20.4h, v20.4h, #2 // 8/2 * src[4]
++ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7]
++ shl v1.4h, v1.4h, #3 // 16/2 * src[2]
++ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5]
++ ssra v7.4h, v7.4h, #1 // 12/2 * src[0]
++ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5]
++ ssra v20.4h, v20.4h, #1 // 12/2 * src[4]
++ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7]
++ shl v3.4h, v2.4h, #3 // 16/2 * src[6]
++ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6]
++ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7]
++ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7]
++ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6]
++ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7]
++ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4]
++ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7]
++ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4]
++ neg v6.4h, v21.4h // -t1
++ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
++ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
++ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
++ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
++ add v26.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
++ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
++ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
++ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
++ neg v3.4h, v17.4h // +t2
++ neg v4.4h, v16.4h // +t3
++ neg v28.4h, v23.4h // +t4
++ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1
++ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1
++ ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1
++ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1
++ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1
++ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1
++ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1
++ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1
++ trn1 v1.2d, v7.2d, v1.2d
++ trn1 v2.2d, v20.2d, v2.2d
++ trn1 v3.2d, v24.2d, v27.2d
++ trn1 v4.2d, v19.2d, v26.2d
++ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
++ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
++ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
++ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
++ trn2 v6.8h, v1.8h, v2.8h
++ trn1 v1.8h, v1.8h, v2.8h
++ trn2 v2.8h, v3.8h, v4.8h
++ trn1 v3.8h, v3.8h, v4.8h
++ trn2 v4.4s, v6.4s, v2.4s
++ trn1 v7.4s, v1.4s, v3.4s
++ trn2 v1.4s, v1.4s, v3.4s
++ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24]
++ trn1 v2.4s, v6.4s, v2.4s
++ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24]
++ mul v6.8h, v7.8h, v0.h[6] // 17 * src[0]
++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16]
++ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
++ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
++ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16]
++ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16]
++ neg v2.8h, v3.8h // -t4/2
++ neg v6.8h, v4.8h // -t3/2
++ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1
++ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1
++ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1
++ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1
++ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7
++ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7
++ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7
++ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7
++ uaddw v0.8h, v0.8h, v5.8b
++ uaddw v1.8h, v1.8h, v18.8b
++ uaddw v2.8h, v2.8h, v22.8b
++ uaddw v3.8h, v3.8h, v25.8b
++ sqxtun v0.8b, v0.8h
++ sqxtun v1.8b, v1.8h
++ sqxtun v2.8b, v2.8h
++ sqxtun v3.8b, v3.8h
++ st1 {v0.8b}, [x3], x1
++ st1 {v1.8b}, [x3], x1
++ st1 {v2.8b}, [x3], x1
++ st1 {v3.8b}, [x3]
++ ret
++endfunc
++
++// VC-1 4x8 inverse transform
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x8_neon, export=1
++ mov x3, #16 // byte stride between coefficient rows (8 x 16-bit)
++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
++ mov x4, x0 // x0 is advanced by the sample loads; x4 keeps the block start for the stores
++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
++ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33
++ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43
++ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53
++ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63
++ ld1 {v4.d}[1], [x2] // 70 71 72 73
++ ld1 {v5.s}[0], [x0], x1 // sample row 0 (4 pels)
++ ld1 {v6.s}[0], [x0], x1 // sample row 1
++ ld1 {v7.s}[0], [x0], x1 // sample row 2
++ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53
++ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52
++ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73
++ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 32 60 70 62 72
++ ld1 {v4.s}[0], [x0], x1 // sample row 3
++ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73
++ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70
++ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71
++ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3]
++ ld1 {v5.s}[1], [x0], x1 // sample row 4
++ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3]
++ ld1 {v6.s}[1], [x0], x1 // sample row 5
++ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72
++ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0]
++ ld1 {v7.s}[1], [x0], x1 // sample row 6
++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2]
++ ld1 {v4.s}[1], [x0] // sample row 7
++ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
++ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
++ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2]
++ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2]
++ neg v3.8h, v16.8h // -t3/2
++ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1
++ neg v18.8h, v17.8h // -t4/2
++ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1
++ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1
++ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1
++ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 4) >> 3
++ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 4) >> 3
++ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 4) >> 3
++ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 4) >> 3
++ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73
++ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71
++ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61
++ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63
++ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53
++ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73
++ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43
++ mov d18, v3.d[1] // 50 51 52 53
++ shl v19.4h, v3.4h, #4 // 16 * src[8]
++ mov d20, v16.d[1] // 70 71 72 73
++ shl v21.4h, v16.4h, #4 // 16 * src[24]
++ mov d22, v17.d[1] // 40 41 42 43
++ shl v23.4h, v3.4h, #2 // 4 * src[8]
++ shl v24.4h, v18.4h, #4 // 16 * src[40]
++ shl v25.4h, v20.4h, #4 // 16 * src[56]
++ shl v26.4h, v18.4h, #2 // 4 * src[40]
++ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63
++ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40]
++ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56]
++ shl v17.4h, v17.4h, #2 // 8/2 * src[0]
++ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40]
++ shl v22.4h, v22.4h, #2 // 8/2 * src[32]
++ mov d23, v1.d[1] // 60 61 62 63
++ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56]
++ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16]
++ shl v1.4h, v1.4h, #3 // 16/2 * src[16]
++ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
++ ssra v17.4h, v17.4h, #1 // 12/2 * src[0]
++ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
++ ssra v22.4h, v22.4h, #1 // 12/2 * src[32]
++ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
++ shl v3.4h, v23.4h, #3 // 16/2 * src[48]
++ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
++ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
++ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
++ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
++ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
++ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
++ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
++ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
++ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
++ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
++ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
++ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
++ sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
++ neg v23.4h, v24.4h // +t2
++ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
++ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
++ neg v17.4h, v21.4h // +t3
++ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
++ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
++ neg v16.4h, v19.4h // -t1
++ neg v27.4h, v2.4h // +t4
++ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1
++ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1
++ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1
++ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1
++ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1
++ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1
++ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1
++ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1
++ trn1 v0.2d, v20.2d, v0.2d // pack output rows 0 | 4
++ trn1 v2.2d, v18.2d, v22.2d // pack output rows 1 | 5
++ trn1 v3.2d, v25.2d, v3.2d // pack output rows 2 | 6
++ trn1 v1.2d, v26.2d, v1.2d // pack output rows 3 | 7
++ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
++ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
++ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
++ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
++ uaddw v0.8h, v0.8h, v5.8b // add sample rows 0 | 4 (zero-extended to 16 bits)
++ uaddw v2.8h, v2.8h, v6.8b // add sample rows 1 | 5
++ uaddw v3.8h, v3.8h, v7.8b // add sample rows 2 | 6
++ uaddw v1.8h, v1.8h, v4.8b // add sample rows 3 | 7
++ sqxtun v0.8b, v0.8h // narrow to 8 bits with unsigned saturation
++ sqxtun v2.8b, v2.8h
++ sqxtun v3.8b, v3.8h
++ sqxtun v1.8b, v1.8h
++ st1 {v0.s}[0], [x4], x1 // row 0
++ st1 {v2.s}[0], [x4], x1 // row 1
++ st1 {v3.s}[0], [x4], x1 // row 2
++ st1 {v1.s}[0], [x4], x1 // row 3
++ st1 {v0.s}[1], [x4], x1 // row 4
++ st1 {v2.s}[1], [x4], x1 // row 5
++ st1 {v3.s}[1], [x4], x1 // row 6
++ st1 {v1.s}[1], [x4] // row 7
++ ret
++endfunc
++
++// VC-1 4x4 inverse transform
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x4_neon, export=1
++ mov x3, #16 // byte stride between coefficient rows (8 x 16-bit)
++ ldr d0, .Lcoeffs_it4 // h[0] = 5 (10/2), h[1] = 11 (22/2), h[2] = 17
++ mov x4, x0 // x0 is advanced by the sample loads; x4 keeps the block start for the stores
++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
++ ld1 {v4.d}[0], [x2] // 30 31 32 33
++ ld1 {v5.s}[0], [x0], x1 // sample row 0 (4 pels)
++ ld1 {v5.s}[1], [x0], x1 // sample row 1
++ ld1 {v6.s}[0], [x0], x1 // sample row 2
++ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13
++ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12
++ ld1 {v6.s}[1], [x0] // sample row 3
++ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33
++ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32
++ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33
++ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30
++ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31
++ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32
++ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3]
++ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3]
++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2]
++ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
++ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
++ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2]
++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2]
++ neg v7.4h, v3.4h // -t3/2
++ neg v16.4h, v4.4h // -t4/2
++ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1
++ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1
++ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1
++ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1
++ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 4) >> 3
++ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 4) >> 3
++ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 4) >> 3
++ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 4) >> 3
++ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31
++ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21
++ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33
++ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23
++ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33
++ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03
++ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13
++ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23
++ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24]
++ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24]
++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[16]
++ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
++ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
++ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16]
++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16]
++ neg v3.4h, v2.4h // -t4/2
++ neg v7.4h, v4.4h // -t3/2
++ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1
++ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1
++ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1
++ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1
++ trn1 v0.2d, v4.2d, v3.2d // pack output rows 0 | 1
++ trn1 v1.2d, v2.2d, v7.2d // pack output rows 2 | 3
++ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
++ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
++ uaddw v0.8h, v0.8h, v5.8b // add sample rows 0 | 1 (zero-extended to 16 bits)
++ uaddw v1.8h, v1.8h, v6.8b // add sample rows 2 | 3
++ sqxtun v0.8b, v0.8h // narrow to 8 bits with unsigned saturation
++ sqxtun v1.8b, v1.8h
++ st1 {v0.s}[0], [x4], x1 // row 0
++ st1 {v0.s}[1], [x4], x1 // row 1
++ st1 {v1.s}[0], [x4], x1 // row 2
++ st1 {v1.s}[1], [x4] // row 3
++ ret
++endfunc
++
++// VC-1 8x8 inverse transform, DC case
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x8_dc_neon, export=1
++ ldrsh w2, [x2] // dc (sign-extended 16-bit coefficient)
++ mov x3, x0 // x0 is advanced by the loads; x3 keeps the block start for the stores
++ ld1 {v0.8b}, [x0], x1 // sample row 0
++ ld1 {v1.8b}, [x0], x1 // sample row 1
++ ld1 {v2.8b}, [x0], x1 // sample row 2
++ add w2, w2, w2, lsl #1 // 3 * dc
++ ld1 {v3.8b}, [x0], x1 // sample row 3
++ ld1 {v4.8b}, [x0], x1 // sample row 4
++ add w2, w2, #1 // 3 * dc + 1
++ ld1 {v5.8b}, [x0], x1 // sample row 5
++ asr w2, w2, #1 // dc' = (3 * dc + 1) >> 1
++ ld1 {v6.8b}, [x0], x1 // sample row 6
++ add w2, w2, w2, lsl #1 // 3 * dc'
++ ld1 {v7.8b}, [x0] // sample row 7
++ add w0, w2, #16 // 3 * dc' + 16 (x0 is no longer needed as a pointer)
++ asr w0, w0, #5 // (3 * dc' + 16) >> 5
++ dup v16.8h, w0 // broadcast the low 16 bits to all 8 lanes
++ uaddw v0.8h, v16.8h, v0.8b // dc + row 0, samples zero-extended to 16 bits
++ uaddw v1.8h, v16.8h, v1.8b // dc + row 1
++ uaddw v2.8h, v16.8h, v2.8b // dc + row 2
++ uaddw v3.8h, v16.8h, v3.8b // dc + row 3
++ uaddw v4.8h, v16.8h, v4.8b // dc + row 4
++ uaddw v5.8h, v16.8h, v5.8b // dc + row 5
++ sqxtun v0.8b, v0.8h // narrow row 0 with unsigned saturation
++ uaddw v6.8h, v16.8h, v6.8b // dc + row 6
++ sqxtun v1.8b, v1.8h // narrow row 1
++ uaddw v7.8h, v16.8h, v7.8b // dc + row 7
++ sqxtun v2.8b, v2.8h // narrow row 2
++ sqxtun v3.8b, v3.8h // narrow row 3
++ sqxtun v4.8b, v4.8h // narrow row 4
++ st1 {v0.8b}, [x3], x1 // row 0
++ sqxtun v0.8b, v5.8h // narrow row 5 (v0 is free again after the store above)
++ st1 {v1.8b}, [x3], x1 // row 1
++ sqxtun v1.8b, v6.8h // narrow row 6
++ st1 {v2.8b}, [x3], x1 // row 2
++ sqxtun v2.8b, v7.8h // narrow row 7
++ st1 {v3.8b}, [x3], x1 // row 3
++ st1 {v4.8b}, [x3], x1 // row 4
++ st1 {v0.8b}, [x3], x1 // row 5
++ st1 {v1.8b}, [x3], x1 // row 6
++ st1 {v2.8b}, [x3] // row 7
++ ret
++endfunc
++
++// VC-1 8x4 inverse transform, DC case
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x4_dc_neon, export=1
++ ldrsh w2, [x2] // dc (sign-extended 16-bit coefficient)
++ mov x3, x0 // x0 is advanced by the loads; x3 keeps the block start for the stores
++ ld1 {v0.8b}, [x0], x1 // sample row 0
++ ld1 {v1.8b}, [x0], x1 // sample row 1
++ ld1 {v2.8b}, [x0], x1 // sample row 2
++ add w2, w2, w2, lsl #1 // 3 * dc
++ ld1 {v3.8b}, [x0] // sample row 3
++ add w0, w2, #1 // 3 * dc + 1 (x0 is no longer needed as a pointer)
++ asr w0, w0, #1 // dc' = (3 * dc + 1) >> 1
++ add w0, w0, w0, lsl #4 // 17 * dc'
++ add w0, w0, #64 // 17 * dc' + 64
++ asr w0, w0, #7 // (17 * dc' + 64) >> 7
++ dup v4.8h, w0 // broadcast the low 16 bits to all 8 lanes
++ uaddw v0.8h, v4.8h, v0.8b // dc + row 0, samples zero-extended to 16 bits
++ uaddw v1.8h, v4.8h, v1.8b // dc + row 1
++ uaddw v2.8h, v4.8h, v2.8b // dc + row 2
++ uaddw v3.8h, v4.8h, v3.8b // dc + row 3
++ sqxtun v0.8b, v0.8h // narrow to 8 bits with unsigned saturation
++ sqxtun v1.8b, v1.8h
++ sqxtun v2.8b, v2.8h
++ sqxtun v3.8b, v3.8h
++ st1 {v0.8b}, [x3], x1 // row 0
++ st1 {v1.8b}, [x3], x1 // row 1
++ st1 {v2.8b}, [x3], x1 // row 2
++ st1 {v3.8b}, [x3] // row 3
++ ret
++endfunc
++
++// VC-1 4x8 inverse transform, DC case
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x8_dc_neon, export=1
++ ldrsh w2, [x2] // dc (sign-extended 16-bit coefficient)
++ mov x3, x0 // x0 is advanced by the loads; x3 keeps the block start for the stores
++ ld1 {v0.s}[0], [x0], x1 // sample row 0 (4 pels)
++ ld1 {v1.s}[0], [x0], x1 // sample row 1
++ ld1 {v2.s}[0], [x0], x1 // sample row 2
++ add w2, w2, w2, lsl #4 // 17 * dc
++ ld1 {v3.s}[0], [x0], x1 // sample row 3
++ add w2, w2, #4 // 17 * dc + 4
++ asr w2, w2, #3 // dc' = (17 * dc + 4) >> 3
++ add w2, w2, w2, lsl #1 // 3 * dc'
++ ld1 {v0.s}[1], [x0], x1 // sample row 4
++ add w2, w2, #16 // 3 * dc' + 16
++ asr w2, w2, #5 // (3 * dc' + 16) >> 5
++ dup v4.8h, w2 // broadcast the low 16 bits to all 8 lanes
++ ld1 {v1.s}[1], [x0], x1 // sample row 5
++ ld1 {v2.s}[1], [x0], x1 // sample row 6
++ ld1 {v3.s}[1], [x0] // sample row 7
++ uaddw v0.8h, v4.8h, v0.8b // dc + rows 0 | 4, samples zero-extended to 16 bits
++ uaddw v1.8h, v4.8h, v1.8b // dc + rows 1 | 5
++ uaddw v2.8h, v4.8h, v2.8b // dc + rows 2 | 6
++ uaddw v3.8h, v4.8h, v3.8b // dc + rows 3 | 7
++ sqxtun v0.8b, v0.8h // narrow to 8 bits with unsigned saturation
++ sqxtun v1.8b, v1.8h
++ sqxtun v2.8b, v2.8h
++ sqxtun v3.8b, v3.8h
++ st1 {v0.s}[0], [x3], x1 // row 0
++ st1 {v1.s}[0], [x3], x1 // row 1
++ st1 {v2.s}[0], [x3], x1 // row 2
++ st1 {v3.s}[0], [x3], x1 // row 3
++ st1 {v0.s}[1], [x3], x1 // row 4
++ st1 {v1.s}[1], [x3], x1 // row 5
++ st1 {v2.s}[1], [x3], x1 // row 6
++ st1 {v3.s}[1], [x3] // row 7
++ ret
++endfunc
++
++// VC-1 4x4 inverse transform, DC case
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x4_dc_neon, export=1
++ ldrsh w2, [x2] // dc (sign-extended 16-bit coefficient)
++ mov x3, x0 // x0 is advanced by the loads; x3 keeps the block start for the stores
++ ld1 {v0.s}[0], [x0], x1 // sample row 0 (4 pels)
++ ld1 {v1.s}[0], [x0], x1 // sample row 1
++ ld1 {v0.s}[1], [x0], x1 // sample row 2
++ add w2, w2, w2, lsl #4 // 17 * dc
++ ld1 {v1.s}[1], [x0] // sample row 3
++ add w0, w2, #4 // 17 * dc + 4 (x0 is no longer needed as a pointer)
++ asr w0, w0, #3 // dc' = (17 * dc + 4) >> 3
++ add w0, w0, w0, lsl #4 // 17 * dc'
++ add w0, w0, #64 // 17 * dc' + 64
++ asr w0, w0, #7 // (17 * dc' + 64) >> 7
++ dup v2.8h, w0 // broadcast the low 16 bits to all 8 lanes
++ uaddw v0.8h, v2.8h, v0.8b // dc + rows 0 | 2, samples zero-extended to 16 bits
++ uaddw v1.8h, v2.8h, v1.8b // dc + rows 1 | 3
++ sqxtun v0.8b, v0.8h // narrow to 8 bits with unsigned saturation
++ sqxtun v1.8b, v1.8h
++ st1 {v0.s}[0], [x3], x1 // row 0
++ st1 {v1.s}[0], [x3], x1 // row 1
++ st1 {v0.s}[1], [x3], x1 // row 2
++ st1 {v1.s}[1], [x3] // row 3
++ ret
++endfunc
++
++.align 5 // align the constant pool to 2^5 = 32 bytes
++.Lcoeffs_it8: // 8-point transform multipliers
++.quad 0x000F00090003 // 16-bit fields, low to high: 3 (6/2), 9, 15, 0
++.Lcoeffs_it4: // 4-point transform multipliers; must stay directly after .Lcoeffs_it8 (the 4x8 transform loads both with one ldr q0)
++.quad 0x0011000B0005 // 16-bit fields, low to high: 5 (10/2), 11 (22/2), 17, 0
++.Lcoeffs: // loop filter multipliers
++.quad 0x00050002 // 16-bit fields, low to high: 2, 5, 0, 0
++
++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
++// On entry:
++// x0 -> top-left pel of lower block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter4_neon, export=1
++ sub x3, x0, w1, sxtw #2 // x3 -> P1, 4 rows above x0; NOTE(review): uses only the low 32 bits of the stride (sign-extended) while the post-indexed accesses below use all of x1 - confirm against the C prototype
++ ldr d0, .Lcoeffs // h[0] = 2, h[1] = 5
++ ld1 {v1.s}[0], [x0], x1 // P5
++ ld1 {v2.s}[0], [x3], x1 // P1
++ ld1 {v3.s}[0], [x3], x1 // P2
++ ld1 {v4.s}[0], [x0], x1 // P6
++ ld1 {v5.s}[0], [x3], x1 // P3
++ ld1 {v6.s}[0], [x0], x1 // P7
++ ld1 {v7.s}[0], [x3] // P4
++ ld1 {v16.s}[0], [x0] // P8
++ ushll v17.8h, v1.8b, #1 // 2*P5
++ dup v18.8h, w2 // pq
++ ushll v2.8h, v2.8b, #1 // 2*P1
++ uxtl v3.8h, v3.8b // P2
++ uxtl v4.8h, v4.8b // P6
++ uxtl v19.8h, v5.8b // P3
++ mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2
++ uxtl v3.8h, v6.8b // P7
++ mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6
++ ushll v5.8h, v5.8b, #1 // 2*P3
++ uxtl v6.8h, v7.8b // P4
++ mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7
++ uxtl v3.8h, v16.8b // P8
++ mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3
++ uxtl v1.8h, v1.8b // P5
++ mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4
++ mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
++ sub v3.4h, v6.4h, v1.4h // P4-P5
++ mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
++ mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5
++ mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
++ abs v4.4h, v3.4h // |P4-P5|
++ srshr v7.4h, v17.4h, #3 // (2*P5-5*P6+5*P7-2*P8+4)>>3
++ srshr v2.4h, v2.4h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3
++ sshr v4.4h, v4.4h, #1 // clip
++ srshr v5.4h, v5.4h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3
++ abs v7.4h, v7.4h // a2
++ sshr v3.4h, v3.4h, #8 // clip_sign
++ abs v2.4h, v2.4h // a1
++ cmeq v16.4h, v4.4h, #0 // test clip == 0
++ abs v17.4h, v5.4h // a0
++ sshr v5.4h, v5.4h, #8 // a0_sign
++ cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2
++ cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq
++ sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign
++ bsl v19.8b, v7.8b, v2.8b // a3
++ orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq
++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0
++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
++ orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0
++ mov w0, v5.s[1] // move 16-bit lanes 2-3 of the test result to a gp reg (bit 0 belongs to pixel pair 2)
++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ cmhs v5.4h, v0.4h, v4.4h // test d >= clip
++ tbnz w0, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered
++ bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip)
++ bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ sqxtun v0.8b, v6.8h // narrow updated P4 with unsigned saturation
++ sqxtun v1.8b, v1.8h // narrow updated P5 with unsigned saturation
++ st1 {v0.s}[0], [x3], x1 // write back P4 (x3 was left pointing at the P4 row by the loads)
++ st1 {v1.s}[0], [x3] // write back P5
++1: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++// x0 -> top-left pel of right block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter4_neon, export=1
++ sub x3, x0, #4 // where to start reading
++ ldr d0, .Lcoeffs // h[0] = 2, h[1] = 5
++ ld1 {v1.8b}, [x3], x1 // row 0: P1[0], P2[0]...
++ sub x0, x0, #1 // where to start writing
++ ld1 {v2.8b}, [x3], x1 // row 1
++ ld1 {v3.8b}, [x3], x1 // row 2
++ ld1 {v4.8b}, [x3] // row 3
++ dup v5.8h, w2 // pq
++ trn1 v6.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]...
++ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
++ trn1 v4.4h, v6.4h, v2.4h // P1, P5
++ trn1 v7.4h, v1.4h, v3.4h // P2, P6
++ trn2 v2.4h, v6.4h, v2.4h // P3, P7
++ trn2 v1.4h, v1.4h, v3.4h // P4, P8
++ ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5
++ uxtl v6.8h, v7.8b // P2, P6
++ uxtl v7.8h, v2.8b // P3, P7
++ uxtl v1.8h, v1.8b // P4, P8
++ mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6
++ ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7
++ uxtl v4.8h, v4.8b // P1, P5
++ mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
++ mov d6, v6.d[1] // P6
++ mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
++ mov d4, v4.d[1] // P5
++ mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4
++ mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5
++ sub v7.4h, v1.4h, v4.4h // P4-P5
++ mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
++ srshr v3.8h, v3.8h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3, (2*P5-5*P6+5*P7-2*P8+4)>>3
++ abs v6.4h, v7.4h // |P4-P5|
++ sshr v7.4h, v7.4h, #8 // clip_sign
++ srshr v2.4h, v2.4h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3
++ abs v3.8h, v3.8h // a1, a2
++ sshr v6.4h, v6.4h, #1 // clip
++ mov d16, v3.d[1] // a2
++ abs v17.4h, v2.4h // a0
++ cmeq v18.4h, v6.4h, #0 // test clip == 0
++ sshr v2.4h, v2.4h, #8 // a0_sign
++ cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2
++ cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq
++ sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign
++ bsl v19.8b, v16.8b, v3.8b // a3
++ orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq
++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0
++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
++ orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0
++ mov w2, v5.s[1] // move 16-bit lanes 2-3 of the test result to a gp reg (bit 0 belongs to pixel pair 2)
++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ cmhs v5.4h, v0.4h, v6.4h // test d >= clip
++ tbnz w2, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered
++ bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip)
++ bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ sqxtun v3.8b, v4.8h // narrow updated P5 with unsigned saturation
++ sqxtun v2.8b, v1.8h // narrow updated P4 with unsigned saturation
++ st2 {v2.b, v3.b}[0], [x0], x1 // row 0: P4, P5
++ st2 {v2.b, v3.b}[1], [x0], x1 // row 1: P4, P5
++ st2 {v2.b, v3.b}[2], [x0], x1 // row 2: P4, P5
++ st2 {v2.b, v3.b}[3], [x0] // row 3: P4, P5
++1: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
++// On entry:
++// x0 -> top-left pel of lower block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter8_neon, export=1
++ sub x3, x0, w1, sxtw #2 // x3 -> P1, 4 rows above x0; NOTE(review): uses only the low 32 bits of the stride (sign-extended) while the post-indexed accesses below use all of x1 - confirm against the C prototype
++ ldr d0, .Lcoeffs // h[0] = 2, h[1] = 5
++ ld1 {v1.8b}, [x0], x1 // P5
++ movi v2.2d, #0x0000ffff00000000 // mask selecting 16-bit lane 2 within each group of 4 lanes
++ ld1 {v3.8b}, [x3], x1 // P1
++ ld1 {v4.8b}, [x3], x1 // P2
++ ld1 {v5.8b}, [x0], x1 // P6
++ ld1 {v6.8b}, [x3], x1 // P3
++ ld1 {v7.8b}, [x0], x1 // P7
++ ushll v16.8h, v1.8b, #1 // 2*P5
++ ushll v3.8h, v3.8b, #1 // 2*P1
++ ld1 {v17.8b}, [x3] // P4
++ uxtl v4.8h, v4.8b // P2
++ ld1 {v18.8b}, [x0] // P8
++ uxtl v5.8h, v5.8b // P6
++ dup v19.8h, w2 // pq
++ uxtl v20.8h, v6.8b // P3
++ mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2
++ uxtl v4.8h, v7.8b // P7
++ ushll v6.8h, v6.8b, #1 // 2*P3
++ mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6
++ uxtl v7.8h, v17.8b // P4
++ uxtl v17.8h, v18.8b // P8
++ mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7
++ uxtl v1.8h, v1.8b // P5
++ mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3
++ sub v4.8h, v7.8h, v1.8h // P4-P5
++ mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4
++ mls v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
++ abs v17.8h, v4.8h // |P4-P5|
++ sshr v4.8h, v4.8h, #8 // clip_sign
++ mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
++ sshr v17.8h, v17.8h, #1 // clip
++ mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5
++ srshr v16.8h, v16.8h, #3 // (2*P5-5*P6+5*P7-2*P8+4)>>3
++ mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
++ cmeq v5.8h, v17.8h, #0 // test clip == 0
++ srshr v3.8h, v3.8h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3
++ abs v16.8h, v16.8h // a2
++ abs v3.8h, v3.8h // a1
++ srshr v6.8h, v6.8h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3
++ cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2
++ abs v20.8h, v6.8h // a0
++ sshr v6.8h, v6.8h, #8 // a0_sign
++ bsl v18.16b, v16.16b, v3.16b // a3
++ cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq
++ sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign
++ uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0
++ orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq
++ mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
++ orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
++ cmtst v2.2d, v5.2d, v2.2d // if pixel pair 2 of a group of 4 is not filtered, then none of the others in the group should be either
++ mov w0, v5.s[1] // move to gp reg: 16-bit lanes 2-3 (bit 0 belongs to pixel pair 2, first group)
++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ mov w2, v5.s[3] // move to gp reg: 16-bit lanes 6-7 (bit 0 belongs to pixel pair 6, second group)
++ orr v2.16b, v3.16b, v2.16b // per-lane "do not filter" mask, including the whole-group veto
++ cmhs v3.8h, v0.8h, v17.8h // test d >= clip
++ and w0, w0, w2 // bit 0 set only if both groups are vetoed
++ bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
++ tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case
++ bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered
++ mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ sqxtun v0.8b, v7.8h // narrow updated P4 with unsigned saturation
++ sqxtun v1.8b, v1.8h // narrow updated P5 with unsigned saturation
++ st1 {v0.8b}, [x3], x1 // write back P4 (x3 was left pointing at the P4 row by the loads)
++ st1 {v1.8b}, [x3] // write back P5
++1: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++// x0 -> top-left pel of right block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter8_neon, export=1
++ sub x3, x0, #4 // where to start reading
++ ldr d0, .Lcoeffs // h[0] = 2, h[1] = 5
++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]...
++ sub x0, x0, #1 // where to start writing
++ ld1 {v2.8b}, [x3], x1 // row 1
++ add x4, x0, x1, lsl #2 // where to start writing rows 4..7
++ ld1 {v3.8b}, [x3], x1 // row 2
++ ld1 {v4.8b}, [x3], x1 // row 3
++ ld1 {v5.8b}, [x3], x1 // row 4
++ ld1 {v6.8b}, [x3], x1 // row 5
++ ld1 {v7.8b}, [x3], x1 // row 6
++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
++ ld1 {v17.8b}, [x3] // row 7
++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]...
++ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
++ dup v4.8h, w2 // pq
++ trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]...
++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]...
++ trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]...
++ trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]...
++ trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]...
++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]...
++ trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]...
++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]...
++ trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]...
++ trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]...
++ trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]...
++ trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]...
++ trn1 v7.2s, v6.2s, v3.2s // P1
++ trn1 v18.2s, v19.2s, v16.2s // P2
++ trn2 v3.2s, v6.2s, v3.2s // P5
++ trn2 v6.2s, v19.2s, v16.2s // P6
++ trn1 v16.2s, v2.2s, v17.2s // P3
++ trn2 v2.2s, v2.2s, v17.2s // P7
++ ushll v7.8h, v7.8b, #1 // 2*P1
++ trn1 v17.2s, v1.2s, v5.2s // P4
++ ushll v19.8h, v3.8b, #1 // 2*P5
++ trn2 v1.2s, v1.2s, v5.2s // P8
++ uxtl v5.8h, v18.8b // P2
++ uxtl v6.8h, v6.8b // P6
++ uxtl v18.8h, v16.8b // P3
++ mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2
++ uxtl v2.8h, v2.8b // P7
++ ushll v5.8h, v16.8b, #1 // 2*P3
++ mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6
++ uxtl v16.8h, v17.8b // P4
++ uxtl v1.8h, v1.8b // P8
++ mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7
++ uxtl v2.8h, v3.8b // P5
++ mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3
++ sub v3.8h, v16.8h, v2.8h // P4-P5
++ mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4
++ mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
++ abs v1.8h, v3.8h // |P4-P5|
++ sshr v3.8h, v3.8h, #8 // clip_sign
++ mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
++ sshr v1.8h, v1.8h, #1 // clip
++ mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5
++ srshr v17.8h, v19.8h, #3 // (2*P5-5*P6+5*P7-2*P8+4)>>3
++ mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
++ cmeq v6.8h, v1.8h, #0 // test clip == 0
++ srshr v7.8h, v7.8h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3
++ abs v17.8h, v17.8h // a2
++ abs v7.8h, v7.8h // a1
++ srshr v5.8h, v5.8h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3
++ cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2
++ abs v19.8h, v5.8h // a0
++ sshr v5.8h, v5.8h, #8 // a0_sign
++ bsl v18.16b, v17.16b, v7.16b // a3
++ cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq
++ sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign
++ uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0
++ orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq
++ mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
++ orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0
++ mov w2, v5.s[1] // move to gp reg: 16-bit lanes 2-3 (bit 0 belongs to pixel pair 2, rows 0..3)
++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ mov w3, v5.s[3] // move to gp reg: 16-bit lanes 6-7 (bit 0 belongs to pixel pair 6, rows 4..7)
++ cmhs v5.8h, v0.8h, v1.8h // test d >= clip
++ and w5, w2, w3 // bit 0 set only if both groups of 4 are vetoed
++ bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip)
++ tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case
++ bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ sqxtun v1.8b, v2.8h // narrow updated P5 with unsigned saturation
++ sqxtun v0.8b, v16.8h // narrow updated P4 with unsigned saturation
++ tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so
++ st2 {v0.b, v1.b}[0], [x0], x1 // row 0: P4, P5
++ st2 {v0.b, v1.b}[1], [x0], x1 // row 1: P4, P5
++ st2 {v0.b, v1.b}[2], [x0], x1 // row 2: P4, P5
++ st2 {v0.b, v1.b}[3], [x0] // row 3: P4, P5
++1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so
++ st2 {v0.b, v1.b}[4], [x4], x1 // row 4: P4, P5
++ st2 {v0.b, v1.b}[5], [x4], x1 // row 5: P4, P5
++ st2 {v0.b, v1.b}[6], [x4], x1 // row 6: P4, P5
++ st2 {v0.b, v1.b}[7], [x4] // row 7: P4, P5
++2: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
++// On entry:
++// x0 -> top-left pel of lower block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter16_neon, export=1
++ sub x3, x0, w1, sxtw #2 // x3 -> row of P1, 4 rows above x0
++ ldr d0, .Lcoeffs // v0.h[0] / v0.h[1] are the multipliers for the 2* / 5* terms (per the term comments below)
++ ld1 {v1.16b}, [x0], x1 // P5
++ movi v2.2d, #0x0000ffff00000000 // mask selecting the halfword at index 2 of each group of 4
++ ld1 {v3.16b}, [x3], x1 // P1
++ ld1 {v4.16b}, [x3], x1 // P2
++ ld1 {v5.16b}, [x0], x1 // P6
++ ld1 {v6.16b}, [x3], x1 // P3
++ ld1 {v7.16b}, [x0], x1 // P7
++ ushll v16.8h, v1.8b, #1 // 2*P5[0..7]
++ ushll v17.8h, v3.8b, #1 // 2*P1[0..7]
++ ld1 {v18.16b}, [x3] // P4
++ uxtl v19.8h, v4.8b // P2[0..7]
++ ld1 {v20.16b}, [x0] // P8
++ uxtl v21.8h, v5.8b // P6[0..7]
++ dup v22.8h, w2 // pq
++ ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15]
++ mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
++ ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15]
++ uxtl2 v4.8h, v4.16b // P2[8..15]
++ mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
++ uxtl2 v5.8h, v5.16b // P6[8..15]
++ uxtl v23.8h, v6.8b // P3[0..7]
++ uxtl v24.8h, v7.8b // P7[0..7]
++ mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
++ ushll v4.8h, v6.8b, #1 // 2*P3[0..7]
++ uxtl v25.8h, v18.8b // P4[0..7]
++ mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
++ uxtl2 v26.8h, v6.16b // P3[8..15]
++ mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++ uxtl2 v7.8h, v7.16b // P7[8..15]
++ ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15]
++ mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++ uxtl2 v18.8h, v18.16b // P4[8..15]
++ uxtl v23.8h, v20.8b // P8[0..7]
++ mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]
++ uxtl v24.8h, v1.8b // P5[0..7]
++ uxtl2 v20.8h, v20.16b // P8[8..15]
++ mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++ uxtl2 v1.8h, v1.16b // P5[8..15]
++ sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7]
++ mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++ sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15]
++ mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]
++ abs v27.8h, v26.8h
++ sshr v26.8h, v26.8h, #8 // clip_sign[0..7]
++ mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++ abs v28.8h, v7.8h
++ sshr v27.8h, v27.8h, #1 // clip[0..7]
++ mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++ sshr v7.8h, v7.8h, #8 // clip_sign[8..15]
++ sshr v23.8h, v28.8h, #1 // clip[8..15]
++ mla v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++ cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0
++ srshr v17.8h, v17.8h, #3
++ mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++ cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0
++ srshr v16.8h, v16.8h, #3
++ mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++ abs v17.8h, v17.8h // a1[0..7]
++ mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++ srshr v3.8h, v3.8h, #3
++ mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++ abs v16.8h, v16.8h // a2[0..7]
++ srshr v19.8h, v19.8h, #3
++ mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++ cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7]
++ abs v3.8h, v3.8h // a1[8..15]
++ srshr v4.8h, v4.8h, #3
++ abs v19.8h, v19.8h // a2[8..15]
++ bsl v5.16b, v16.16b, v17.16b // a3[0..7]
++ srshr v6.8h, v6.8h, #3
++ cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8..15]
++ abs v17.8h, v4.8h // a0[0..7]
++ sshr v4.8h, v4.8h, #8 // a0_sign[0..7]
++ bsl v16.16b, v19.16b, v3.16b // a3[8..15]
++ uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ abs v19.8h, v6.8h // a0[8..15]
++ cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq
++ cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7]
++ sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7]
++ sshr v6.8h, v6.8h, #8 // a0_sign[8..15]
++ mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++ uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
++ cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq
++ cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15]
++ mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++ sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15]
++ orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++ ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++ orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
++ cmtst v17.2d, v5.2d, v2.2d // if the pair at index 2 of each group of 4 is not filtered, then none of the others in the group should be either
++ mov w0, v5.s[1] // move to gp reg
++ cmhs v19.8h, v3.8h, v27.8h
++ ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++ mov w2, v5.s[3]
++ orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++ orr v16.16b, v20.16b, v17.16b
++ bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
++ cmtst v2.2d, v5.2d, v2.2d
++ cmhs v3.8h, v0.8h, v23.8h
++ mov w4, v5.s[1]
++ mov w5, v5.s[3]
++ and w0, w0, w2
++ bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++ orr v2.16b, v7.16b, v2.16b
++ bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
++ mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
++ and w2, w4, w5
++ bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++ mla v24.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
++ and w0, w0, w2
++ mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
++ sqxtun v2.8b, v25.8h
++ tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case
++ mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
++ sqxtun v0.8b, v24.8h
++ sqxtun2 v2.16b, v18.8h
++ sqxtun2 v0.16b, v1.8h
++ st1 {v2.16b}, [x3], x1 // write back the P4 row (x3 was left on it by the last load)
++ st1 {v0.16b}, [x3] // write back the P5 row
++1: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++// x0 -> top-left pel of right block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter16_neon, export=1
++ sub x3, x0, #4 // where to start reading
++ ldr d0, .Lcoeffs // v0.h[0] / v0.h[1] are the multipliers for the 2* / 5* terms (per the term comments below)
++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]...
++ sub x0, x0, #1 // where to start writing
++ ld1 {v2.8b}, [x3], x1
++ add x4, x0, x1, lsl #3 // write position for row 8
++ ld1 {v3.8b}, [x3], x1
++ add x5, x0, x1, lsl #2 // write position for row 4
++ ld1 {v4.8b}, [x3], x1
++ add x6, x4, x1, lsl #2 // write position for row 12
++ ld1 {v5.8b}, [x3], x1
++ ld1 {v6.8b}, [x3], x1
++ ld1 {v7.8b}, [x3], x1
++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
++ ld1 {v17.8b}, [x3], x1
++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]...
++ ld1 {v2.8b}, [x3], x1
++ trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
++ ld1 {v19.8b}, [x3], x1
++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
++ ld1 {v4.8b}, [x3], x1
++ trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]...
++ ld1 {v21.8b}, [x3], x1
++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]...
++ ld1 {v6.8b}, [x3], x1
++ trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]...
++ ld1 {v23.8b}, [x3], x1
++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]...
++ ld1 {v17.8b}, [x3], x1
++ trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]...
++ ld1 {v25.8b}, [x3]
++ trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]...
++ trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]...
++ trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]...
++ trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]...
++ trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]...
++ trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]...
++ trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]...
++ trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]...
++ trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]...
++ trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]...
++ trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]...
++ trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]...
++ trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]...
++ trn1 v31.2s, v19.2s, v27.2s // P1[0..7]
++ trn2 v19.2s, v19.2s, v27.2s // P5[0..7]
++ trn1 v27.2s, v21.2s, v23.2s // P2[0..7]
++ trn2 v21.2s, v21.2s, v23.2s // P6[0..7]
++ trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]...
++ trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]...
++ trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]...
++ trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]...
++ trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]...
++ trn1 v24.2s, v29.2s, v23.2s // P1[8..15]
++ trn2 v23.2s, v29.2s, v23.2s // P5[8..15]
++ trn1 v26.2s, v25.2s, v18.2s // P2[8..15]
++ trn2 v18.2s, v25.2s, v18.2s // P6[8..15]
++ trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]...
++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]...
++ trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]...
++ trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]...
++ trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]...
++ ushll v5.8h, v31.8b, #1 // 2*P1[0..7]
++ ushll v6.8h, v19.8b, #1 // 2*P5[0..7]
++ trn1 v7.2s, v16.2s, v20.2s // P3[0..7]
++ uxtl v17.8h, v27.8b // P2[0..7]
++ trn2 v16.2s, v16.2s, v20.2s // P7[0..7]
++ uxtl v20.8h, v21.8b // P6[0..7]
++ trn1 v21.2s, v22.2s, v25.2s // P3[8..15]
++ ushll v24.8h, v24.8b, #1 // 2*P1[8..15]
++ trn2 v22.2s, v22.2s, v25.2s // P7[8..15]
++ ushll v25.8h, v23.8b, #1 // 2*P5[8..15]
++ trn1 v27.2s, v1.2s, v3.2s // P4[0..7]
++ uxtl v26.8h, v26.8b // P2[8..15]
++ mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
++ uxtl v17.8h, v18.8b // P6[8..15]
++ mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
++ trn1 v18.2s, v2.2s, v4.2s // P4[8..15]
++ uxtl v28.8h, v7.8b // P3[0..7]
++ mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
++ uxtl v16.8h, v16.8b // P7[0..7]
++ uxtl v26.8h, v21.8b // P3[8..15]
++ mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
++ uxtl v22.8h, v22.8b // P7[8..15]
++ ushll v7.8h, v7.8b, #1 // 2*P3[0..7]
++ uxtl v27.8h, v27.8b // P4[0..7]
++ trn2 v1.2s, v1.2s, v3.2s // P8[0..7]
++ ushll v3.8h, v21.8b, #1 // 2*P3[8..15]
++ trn2 v2.2s, v2.2s, v4.2s // P8[8..15]
++ uxtl v4.8h, v18.8b // P4[8..15]
++ mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++ uxtl v1.8h, v1.8b // P8[0..7]
++ mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++ uxtl v2.8h, v2.8b // P8[8..15]
++ uxtl v16.8h, v19.8b // P5[0..7]
++ mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++ uxtl v18.8h, v23.8b // P5[8..15]
++ dup v19.8h, w2 // pq
++ mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++ sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7]
++ sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15]
++ mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]
++ abs v23.8h, v21.8h
++ mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]
++ abs v26.8h, v22.8h
++ sshr v21.8h, v21.8h, #8 // clip_sign[0..7]
++ mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++ sshr v23.8h, v23.8h, #1 // clip[0..7]
++ sshr v26.8h, v26.8h, #1 // clip[8..15]
++ mls v6.8h, v1.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++ sshr v1.8h, v22.8h, #8 // clip_sign[8..15]
++ cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0
++ mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++ cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0
++ srshr v5.8h, v5.8h, #3
++ mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++ srshr v2.8h, v6.8h, #3
++ mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++ srshr v6.8h, v24.8h, #3
++ mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++ abs v5.8h, v5.8h // a1[0..7]
++ srshr v24.8h, v25.8h, #3
++ mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++ abs v2.8h, v2.8h // a2[0..7]
++ abs v6.8h, v6.8h // a1[8..15]
++ mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++ abs v17.8h, v24.8h // a2[8..15]
++ cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7]
++ srshr v3.8h, v3.8h, #3
++ cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8..15]
++ srshr v7.8h, v7.8h, #3
++ bsl v20.16b, v2.16b, v5.16b // a3[0..7]
++ abs v2.8h, v3.8h // a0[8..15]
++ sshr v3.8h, v3.8h, #8 // a0_sign[8..15]
++ bsl v24.16b, v17.16b, v6.16b // a3[8..15]
++ abs v5.8h, v7.8h // a0[0..7]
++ sshr v6.8h, v7.8h, #8 // a0_sign[0..7]
++ cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq
++ sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15]
++ uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15]
++ uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq
++ orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
++ sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7]
++ mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++ cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7]
++ orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
++ mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++ orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++ ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++ mov w7, v2.s[1] // bit 0 = "do not filter" flag of pair 10
++ mov w8, v2.s[3] // bit 0 = "do not filter" flag of pair 14
++ ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++ mov w2, v5.s[1] // move to gp reg
++ cmhs v2.8h, v3.8h, v26.8h
++ mov w3, v5.s[3]
++ cmhs v5.8h, v0.8h, v23.8h
++ bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
++ and w9, w7, w8
++ bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
++ and w10, w2, w3
++ bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++ and w9, w10, w9
++ bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++ mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
++ tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case
++ mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
++ mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
++ sqxtun v2.8b, v4.8h
++ mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
++ sqxtun v0.8b, v27.8h
++ sqxtun v1.8b, v16.8h
++ sqxtun v3.8b, v18.8h
++ tbnz w2, #0, 1f // skip rows 0..3 if pair 2 is flagged as not filtered
++ st2 {v0.b, v1.b}[0], [x0], x1
++ st2 {v0.b, v1.b}[1], [x0], x1
++ st2 {v0.b, v1.b}[2], [x0], x1
++ st2 {v0.b, v1.b}[3], [x0]
++1: tbnz w3, #0, 2f // skip rows 4..7 if pair 6 is flagged as not filtered
++ st2 {v0.b, v1.b}[4], [x5], x1
++ st2 {v0.b, v1.b}[5], [x5], x1
++ st2 {v0.b, v1.b}[6], [x5], x1
++ st2 {v0.b, v1.b}[7], [x5]
++2: tbnz w7, #0, 3f // skip rows 8..11 if pair 10 is flagged as not filtered
++ st2 {v2.b, v3.b}[0], [x4], x1
++ st2 {v2.b, v3.b}[1], [x4], x1
++ st2 {v2.b, v3.b}[2], [x4], x1
++ st2 {v2.b, v3.b}[3], [x4]
++3: tbnz w8, #0, 4f // skip rows 12..15 if pair 14 is flagged as not filtered
++ st2 {v2.b, v3.b}[4], [x6], x1
++ st2 {v2.b, v3.b}[5], [x6], x1
++ st2 {v2.b, v3.b}[6], [x6], x1
++ st2 {v2.b, v3.b}[7], [x6]
++4: ret
++endfunc
++
++// Copy at most the specified number of bytes from source to destination buffer,
++// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
++// On entry:
++// x0 -> source buffer
++// w1 = max number of bytes to copy
++// x2 -> destination buffer, optimally 8-byte aligned
++// On exit:
++// w0 = number of bytes not copied
++function ff_vc1_unescape_buffer_helper_neon, export=1
++ // Offset by 80 to screen out cases that are too short for us to handle,
++ // and also make it easy to test for loop termination, or to determine
++ // whether we need an odd number of half-iterations of the loop.
++ subs w1, w1, #80
++ b.mi 90f // fewer than 80 bytes: copy nothing
++
++ // Set up useful constants
++ movi v20.4s, #3, lsl #24 // 0x03000000: bits cleared from each 32-bit word before the compare
++ movi v21.4s, #3, lsl #16 // 0x00030000: after the bic, a word matches iff it equals this value
++
++ tst w1, #32
++ b.ne 1f
++
++ ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48
++ ext v25.16b, v0.16b, v1.16b, #1
++ ext v26.16b, v0.16b, v1.16b, #2
++ ext v27.16b, v0.16b, v1.16b, #3
++ ext v29.16b, v1.16b, v2.16b, #1
++ ext v30.16b, v1.16b, v2.16b, #2
++ ext v31.16b, v1.16b, v2.16b, #3
++ bic v24.16b, v0.16b, v20.16b
++ bic v25.16b, v25.16b, v20.16b
++ bic v26.16b, v26.16b, v20.16b
++ bic v27.16b, v27.16b, v20.16b
++ bic v28.16b, v1.16b, v20.16b
++ bic v29.16b, v29.16b, v20.16b
++ bic v30.16b, v30.16b, v20.16b
++ bic v31.16b, v31.16b, v20.16b
++ eor v24.16b, v24.16b, v21.16b
++ eor v25.16b, v25.16b, v21.16b
++ eor v26.16b, v26.16b, v21.16b
++ eor v27.16b, v27.16b, v21.16b
++ eor v28.16b, v28.16b, v21.16b
++ eor v29.16b, v29.16b, v21.16b
++ eor v30.16b, v30.16b, v21.16b
++ eor v31.16b, v31.16b, v21.16b
++ cmeq v24.4s, v24.4s, #0
++ cmeq v25.4s, v25.4s, #0
++ cmeq v26.4s, v26.4s, #0
++ cmeq v27.4s, v27.4s, #0
++ add w1, w1, #32 // pre-compensate for entering the loop at its second half
++ b 3f
++
++1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48
++ ext v25.16b, v3.16b, v4.16b, #1
++ ext v26.16b, v3.16b, v4.16b, #2
++ ext v27.16b, v3.16b, v4.16b, #3
++ ext v29.16b, v4.16b, v5.16b, #1
++ ext v30.16b, v4.16b, v5.16b, #2
++ ext v31.16b, v4.16b, v5.16b, #3
++ bic v24.16b, v3.16b, v20.16b
++ bic v25.16b, v25.16b, v20.16b
++ bic v26.16b, v26.16b, v20.16b
++ bic v27.16b, v27.16b, v20.16b
++ bic v28.16b, v4.16b, v20.16b
++ bic v29.16b, v29.16b, v20.16b
++ bic v30.16b, v30.16b, v20.16b
++ bic v31.16b, v31.16b, v20.16b
++ eor v24.16b, v24.16b, v21.16b
++ eor v25.16b, v25.16b, v21.16b
++ eor v26.16b, v26.16b, v21.16b
++ eor v27.16b, v27.16b, v21.16b
++ eor v28.16b, v28.16b, v21.16b
++ eor v29.16b, v29.16b, v21.16b
++ eor v30.16b, v30.16b, v21.16b
++ eor v31.16b, v31.16b, v21.16b
++ cmeq v24.4s, v24.4s, #0
++ cmeq v25.4s, v25.4s, #0
++ cmeq v26.4s, v26.4s, #0
++ cmeq v27.4s, v27.4s, #0
++ // Drop through...
++2: mov v0.16b, v5.16b
++ ld1 {v1.16b, v2.16b}, [x0], #32
++ cmeq v28.4s, v28.4s, #0
++ cmeq v29.4s, v29.4s, #0
++ cmeq v30.4s, v30.4s, #0
++ cmeq v31.4s, v31.4s, #0
++ orr v24.16b, v24.16b, v25.16b
++ orr v26.16b, v26.16b, v27.16b
++ orr v28.16b, v28.16b, v29.16b
++ orr v30.16b, v30.16b, v31.16b
++ ext v25.16b, v0.16b, v1.16b, #1
++ orr v22.16b, v24.16b, v26.16b
++ ext v26.16b, v0.16b, v1.16b, #2
++ ext v27.16b, v0.16b, v1.16b, #3
++ ext v29.16b, v1.16b, v2.16b, #1
++ orr v23.16b, v28.16b, v30.16b
++ ext v30.16b, v1.16b, v2.16b, #2
++ ext v31.16b, v1.16b, v2.16b, #3
++ bic v24.16b, v0.16b, v20.16b
++ bic v25.16b, v25.16b, v20.16b
++ bic v26.16b, v26.16b, v20.16b
++ orr v22.16b, v22.16b, v23.16b
++ bic v27.16b, v27.16b, v20.16b
++ bic v28.16b, v1.16b, v20.16b
++ bic v29.16b, v29.16b, v20.16b
++ bic v30.16b, v30.16b, v20.16b
++ bic v31.16b, v31.16b, v20.16b
++ addv s22, v22.4s
++ eor v24.16b, v24.16b, v21.16b
++ eor v25.16b, v25.16b, v21.16b
++ eor v26.16b, v26.16b, v21.16b
++ eor v27.16b, v27.16b, v21.16b
++ eor v28.16b, v28.16b, v21.16b
++ mov w3, v22.s[0] // non-zero iff any compare in the previous 32 bytes matched
++ eor v29.16b, v29.16b, v21.16b
++ eor v30.16b, v30.16b, v21.16b
++ eor v31.16b, v31.16b, v21.16b
++ cmeq v24.4s, v24.4s, #0
++ cmeq v25.4s, v25.4s, #0
++ cmeq v26.4s, v26.4s, #0
++ cmeq v27.4s, v27.4s, #0
++ cbnz w3, 90f // match found: stop without storing these 32 bytes
++ st1 {v3.16b, v4.16b}, [x2], #32 // no match: copy the 32 bytes
++3: mov v3.16b, v2.16b
++ ld1 {v4.16b, v5.16b}, [x0], #32
++ cmeq v28.4s, v28.4s, #0
++ cmeq v29.4s, v29.4s, #0
++ cmeq v30.4s, v30.4s, #0
++ cmeq v31.4s, v31.4s, #0
++ orr v24.16b, v24.16b, v25.16b
++ orr v26.16b, v26.16b, v27.16b
++ orr v28.16b, v28.16b, v29.16b
++ orr v30.16b, v30.16b, v31.16b
++ ext v25.16b, v3.16b, v4.16b, #1
++ orr v22.16b, v24.16b, v26.16b
++ ext v26.16b, v3.16b, v4.16b, #2
++ ext v27.16b, v3.16b, v4.16b, #3
++ ext v29.16b, v4.16b, v5.16b, #1
++ orr v23.16b, v28.16b, v30.16b
++ ext v30.16b, v4.16b, v5.16b, #2
++ ext v31.16b, v4.16b, v5.16b, #3
++ bic v24.16b, v3.16b, v20.16b
++ bic v25.16b, v25.16b, v20.16b
++ bic v26.16b, v26.16b, v20.16b
++ orr v22.16b, v22.16b, v23.16b
++ bic v27.16b, v27.16b, v20.16b
++ bic v28.16b, v4.16b, v20.16b
++ bic v29.16b, v29.16b, v20.16b
++ bic v30.16b, v30.16b, v20.16b
++ bic v31.16b, v31.16b, v20.16b
++ addv s22, v22.4s
++ eor v24.16b, v24.16b, v21.16b
++ eor v25.16b, v25.16b, v21.16b
++ eor v26.16b, v26.16b, v21.16b
++ eor v27.16b, v27.16b, v21.16b
++ eor v28.16b, v28.16b, v21.16b
++ mov w3, v22.s[0] // non-zero iff any compare in the previous 32 bytes matched
++ eor v29.16b, v29.16b, v21.16b
++ eor v30.16b, v30.16b, v21.16b
++ eor v31.16b, v31.16b, v21.16b
++ cmeq v24.4s, v24.4s, #0
++ cmeq v25.4s, v25.4s, #0
++ cmeq v26.4s, v26.4s, #0
++ cmeq v27.4s, v27.4s, #0
++ cbnz w3, 91f // match found: stop without storing these 32 bytes
++ st1 {v0.16b, v1.16b}, [x2], #32 // no match: copy the 32 bytes
++ subs w1, w1, #64 // one full iteration accounts for 64 bytes
++ b.pl 2b
++
++90: add w0, w1, #80 // undo the initial offset of 80 to form the return value
++ ret
++
++91: sub w1, w1, #32 // debit 32 bytes (the st1 after label 2, or the add before the direct entry at 3)
++ b 90b
++endfunc
+--- a/libavcodec/allcodecs.c
++++ b/libavcodec/allcodecs.c
+@@ -149,6 +149,7 @@ extern AVCodec ff_hap_decoder;
+ extern AVCodec ff_hevc_decoder;
+ extern AVCodec ff_hevc_qsv_decoder;
+ extern AVCodec ff_hevc_rkmpp_decoder;
++extern AVCodec ff_hevc_rpi_decoder;
+ extern AVCodec ff_hevc_v4l2m2m_decoder;
+ extern AVCodec ff_hnm4_video_decoder;
+ extern AVCodec ff_hq_hqa_decoder;
+@@ -890,6 +891,41 @@ static enum AVCodecID remap_deprecated_c
+ }
+ }
+
++static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
++{
++    const enum AVPixelFormat *candidate;
++
++    // No advertised format list: we cannot tell, so accept the codec
++    if (!p->pix_fmts)
++        return 1;
++
++    // Requesting AV_PIX_FMT_NONE never matches; otherwise scan the
++    // AV_PIX_FMT_NONE-terminated list for an exact hit
++    for (candidate = p->pix_fmts; fmt != AV_PIX_FMT_NONE && *candidate != AV_PIX_FMT_NONE; candidate++) {
++        if (*candidate == fmt)
++            return 1;
++    }
++    return 0;
++}
++
++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
++{
++    const AVCodec *p, *experimental = NULL;
++    void *i = 0;
++    // Returns the first non-experimental decoder for id that supports fmt, else the first experimental one, else NULL
++    id= remap_deprecated_codec_id(id);
++    while ((p = av_codec_iterate(&i))) {
++        if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) {
++            if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
++                experimental = p;   // remember it, but keep looking for a non-experimental match
++            } else
++                return (AVCodec *)p;
++        }
++        // No manual stepping: av_codec_iterate() advances through the opaque cursor i
++    }
++    return (AVCodec *)experimental;
++}
++
+ static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
+ {
+ const AVCodec *p, *experimental = NULL;
+--- a/libavcodec/arm/Makefile
++++ b/libavcodec/arm/Makefile
+@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) +
+ arm/sbrdsp_init_arm.o
+ OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
+ OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \
++ arm/rpi_hevcpred_init_arm.o
+ OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
+ OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
+ OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
+@@ -140,10 +142,23 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) +
+ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
+ NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
+ NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
+ arm/hevcdsp_deblock_neon.o \
+ arm/hevcdsp_idct_neon.o \
+ arm/hevcdsp_qpel_neon.o \
+ arm/hevcdsp_sao_neon.o
++NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \
++ arm/rpi_hevc_misc_neon.o \
++ arm/rpi_hevcdsp_deblock_neon.o \
++ arm/rpi_hevcdsp_idct_neon.o \
++ arm/rpi_hevcdsp_res8_neon.o \
++ arm/rpi_hevcdsp_res16_neon.o \
++ arm/rpi_hevcdsp_sao_neon.o \
++ arm/rpi_hevcpred_init_neon.o \
++ arm/rpi_hevcpred_intra_angular_neon.o \
++ arm/rpi_hevcpred_intra_dc_neon.o \
++ arm/rpi_hevcpred_intra_filter_neon.o \
++ arm/rpi_hevcpred_intra_hv_neon.o \
++ arm/rpi_hevcpred_intra_planar_neon.o
+ NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
+ NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
+ arm/rv40dsp_neon.o
+--- a/libavcodec/arm/cabac.h
++++ b/libavcodec/arm/cabac.h
+@@ -26,83 +26,209 @@
+ #include "libavutil/internal.h"
+ #include "libavcodec/cabac.h"
+
++
+ #define get_cabac_inline get_cabac_inline_arm
+ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
+- uint8_t *const state)
++ uint8_t *state)
+ {
+- int bit;
+- void *reg_b, *reg_c, *tmp;
++ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128;
++ int bit, ptr, low, tmp1, tmp2;
++ __asm__ volatile (
++ "ldr %[bit], [%[c], %[range_off]] \n\t"
++ "ldrb %[ptr], [%[state]] \n\t"
++ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t"
++ "and %[tmp2], %[bit], #0xc0 \n\t"
++ "add %[tmp1], %[tmp1], %[ptr] \n\t"
++ "ldr %[low], [%[c], %[low_off]] \n\t"
++ "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t"
++ "sub %[bit], %[bit], %[tmp2] \n\t"
++ "mov %[tmp1], %[bit] \n\t"
++ "cmp %[low], %[bit], lsl #17 \n\t"
++ "itt ge \n\t"
++ "movge %[tmp1], %[tmp2] \n\t"
++ "mvnge %[ptr], %[ptr] \n\t"
++ "clz %[tmp2], %[tmp1] \n\t"
++ "it ge \n\t"
++ "subge %[low], %[low], %[bit], lsl #17 \n\t"
++ "sub %[tmp2], %[tmp2], #23 \n\t"
++ "and %[bit], %[ptr], #1 \n\t"
++ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t"
++ "lsl %[low], %[low], %[tmp2] \n\t"
++ "lsls %[ptr], %[low], #16 \n\t"
++ "bne 1f \n\t"
++ "ldr %[ptr], [%[c], %[ptr_off]] \n\t"
++ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t"
++#if UNCHECKED_BITSTREAM_READER
++ "strb %[mlps_tables], [%[state]] \n\t"
++ "rbit %[state], %[low] \n\t"
++ "ldrh %[tmp1], [%[ptr]], #2 \n\t"
++#else
++ "ldr %[tmp1], [%[c], %[end_off]] \n\t"
++ "strb %[mlps_tables], [%[state]] \n\t"
++ "rbit %[state], %[low] \n\t"
++ "cmp %[tmp1], %[ptr] \n\t"
++#if CONFIG_THUMB
++ "it cs \n\t"
++ "ldrhcs %[tmp1], [%[ptr]], #2 \n\t"
++#else
++ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t"
++#endif
++#endif
++ "clz %[state], %[state] \n\t"
++ "movw %[mlps_tables], #0xffff \n\t"
++ "sub %[state], %[state], #16 \n\t"
++ "str %[tmp2], [%[c], %[range_off]] \n\t"
++ "rev %[tmp1], %[tmp1] \n\t"
++ "str %[ptr], [%[c], %[ptr_off]] \n\t"
++ "lsr %[tmp1], %[tmp1], #15 \n\t"
++ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t"
++#if CONFIG_THUMB
++ "lsl %[tmp1], %[tmp1], %[state] \n\t"
++ "add %[low], %[low], %[tmp1] \n\t"
++#else
++ "add %[low], %[low], %[tmp1], lsl %[state] \n\t"
++#endif
++ "str %[low], [%[c], %[low_off]] \n\t"
++ "b 2f \n\t"
++ "1: \n\t"
++ "strb %[mlps_tables], [%[state]] \n\t"
++ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t"
++ "str %[low], [%[c], %[low_off]] \n\t"
++ "str %[tmp1], [%[c], %[range_off]] \n\t"
++ "2: \n\t"
++ : // Outputs
++ [state]"+r"(state),
++ [mlps_tables]"+r"(mlps_tables),
++ [bit]"=&r"(bit),
++ [ptr]"=&r"(ptr),
++ [low]"=&r"(low),
++ [tmp1]"=&r"(tmp1),
++ [tmp2]"=&r"(tmp2)
++ : // Inputs
++ [c]"r"(c),
++ [low_off]"J"(offsetof(CABACContext, low)),
++ [range_off]"J"(offsetof(CABACContext, range)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++ [end_off]"J"(offsetof(CABACContext, bytestream_end)),
++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++ : // Clobbers
++ "cc", "memory"
++ );
++ return bit;
++}
+
+- __asm__ volatile(
+- "ldrb %[bit] , [%[state]] \n\t"
+- "add %[r_b] , %[tables] , %[lps_off] \n\t"
+- "mov %[tmp] , %[range] \n\t"
+- "and %[range] , %[range] , #0xC0 \n\t"
+- "add %[r_b] , %[r_b] , %[bit] \n\t"
+- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t"
+- "add %[r_b] , %[tables] , %[norm_off] \n\t"
+- "sub %[r_c] , %[tmp] , %[range] \n\t"
+- "lsl %[tmp] , %[r_c] , #17 \n\t"
+- "cmp %[tmp] , %[low] \n\t"
+- "it gt \n\t"
+- "movgt %[range] , %[r_c] \n\t"
+- "itt cc \n\t"
+- "mvncc %[bit] , %[bit] \n\t"
+- "subcc %[low] , %[low] , %[tmp] \n\t"
+- "add %[r_c] , %[tables] , %[mlps_off] \n\t"
+- "ldrb %[tmp] , [%[r_b], %[range]] \n\t"
+- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t"
+- "lsl %[low] , %[low] , %[tmp] \n\t"
+- "lsl %[range] , %[range] , %[tmp] \n\t"
+- "uxth %[r_c] , %[low] \n\t"
+- "strb %[r_b] , [%[state]] \n\t"
+- "tst %[r_c] , %[r_c] \n\t"
+- "bne 2f \n\t"
+- "ldr %[r_c] , [%[c], %[byte]] \n\t"
++#define get_cabac_bypass get_cabac_bypass_arm
++static inline int get_cabac_bypass_arm(CABACContext * const c)
++{
++ uint32_t low = c->low, range, ptr, tmp;
++ int rv;
++ __asm volatile (
++ "ldr %[range] , [%[c], %[range_off]] \n\t"
++ "mov %[rv] , #0 \n\t"
++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t"
++ "lsl %[low] , #1 \n\t"
++#if !UNCHECKED_BITSTREAM_READER
++ "ldr %[tmp] , [%[c], %[end_off]] \n\t"
++#endif
++ "cmp %[low] , %[range], lsl #17 \n\t"
++ "itt cs \n\t"
++ "subcs %[low] , %[low], %[range], lsl #17 \n\t"
++ "movcs %[rv] , #1 \n\t"
+ #if UNCHECKED_BITSTREAM_READER
+- "ldrh %[tmp] , [%[r_c]] \n\t"
+- "add %[r_c] , %[r_c] , #2 \n\t"
+- "str %[r_c] , [%[c], %[byte]] \n\t"
+-#else
+- "ldr %[r_b] , [%[c], %[end]] \n\t"
+- "ldrh %[tmp] , [%[r_c]] \n\t"
+- "cmp %[r_c] , %[r_b] \n\t"
+- "itt lt \n\t"
+- "addlt %[r_c] , %[r_c] , #2 \n\t"
+- "strlt %[r_c] , [%[c], %[byte]] \n\t"
+-#endif
+- "sub %[r_c] , %[low] , #1 \n\t"
+- "add %[r_b] , %[tables] , %[norm_off] \n\t"
+- "eor %[r_c] , %[low] , %[r_c] \n\t"
+- "rev %[tmp] , %[tmp] \n\t"
+- "lsr %[r_c] , %[r_c] , #15 \n\t"
+- "lsr %[tmp] , %[tmp] , #15 \n\t"
+- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t"
+- "movw %[r_b] , #0xFFFF \n\t"
+- "sub %[tmp] , %[tmp] , %[r_b] \n\t"
+- "rsb %[r_c] , %[r_c] , #7 \n\t"
+- "lsl %[tmp] , %[tmp] , %[r_c] \n\t"
+- "add %[low] , %[low] , %[tmp] \n\t"
+- "2: \n\t"
+- : [bit]"=&r"(bit),
+- [low]"+&r"(c->low),
+- [range]"+&r"(c->range),
+- [r_b]"=&r"(reg_b),
+- [r_c]"=&r"(reg_c),
+- [tmp]"=&r"(tmp)
+- : [c]"r"(c),
+- [state]"r"(state),
+- [tables]"r"(ff_h264_cabac_tables),
+- [byte]"M"(offsetof(CABACContext, bytestream)),
+- [end]"M"(offsetof(CABACContext, bytestream_end)),
+- [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
+- [lps_off]"I"(H264_LPS_RANGE_OFFSET),
+- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
+- : "memory", "cc"
+- );
++ "ldrh %[tmp] , [%[ptr]], #2 \n\t"
++#else
++ "cmp %[tmp] , %[ptr] \n\t"
++#if CONFIG_THUMB
++ "it cs \n\t"
++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t"
++#else
++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t"
++#endif
++#endif
++ "lsls %[range] , %[low], #16 \n\t"
++ "bne 1f \n\t"
+
+- return bit & 1;
++ "str %[ptr] , [%[c], %[ptr_off]] \n\t"
++ "rev %[tmp] , %[tmp] \n\t"
++ "add %[low] , %[low], %[tmp], lsr #15 \n\t"
++ "movw %[tmp] , 0xFFFF \n\t"
++ "sub %[low] , %[tmp] \n\t"
++ "1: \n\t"
++ "str %[low] , [%[c], %[low_off]] \n\t"
++ : // Outputs
++ [rv]"=&r"(rv),
++ [low]"+r"(low),
++ [range]"=&r"(range),
++ [ptr]"=&r"(ptr),
++ [tmp]"=&r"(tmp)
++ : // Inputs
++ [c]"r"(c),
++ [low_off]"J"(offsetof(CABACContext, low)),
++ [range_off]"J"(offsetof(CABACContext, range)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++ [end_off]"J"(offsetof(CABACContext, bytestream_end))
++ : // Clobbers
++ "memory", "cc"
++ );
++ return rv;
+ }
++
++
++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
++{ // Advances the bypass decoder state in c by one step; returns rv, negated when the doubled low is below range << 17
++ uint32_t low = c->low, range, ptr, tmp;
++ __asm volatile (
++ "ldr %[range] , [%[c], %[range_off]] \n\t"
++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t"
++ "lsl %[low] , #1 \n\t" // low <<= 1
++#if !UNCHECKED_BITSTREAM_READER
++ "ldr %[tmp] , [%[c], %[end_off]] \n\t" // tmp = bytestream_end, for the bounds check below
++#endif
++ "cmp %[low] , %[range], lsl #17 \n\t"
++ "it cs \n\t"
++ "subcs %[low] , %[low], %[range], lsl #17 \n\t" // if low >= range << 17: low -= range << 17
++ "it cc \n\t"
++ "rsbcc %[rv] , %[rv], #0 \n\t" // else: rv = -rv
++#if UNCHECKED_BITSTREAM_READER
++ "ldrh %[tmp] , [%[ptr]], #2 \n\t" // fetch next halfword, ptr += 2
++#else
++ "cmp %[tmp] , %[ptr] \n\t"
++#if CONFIG_THUMB
++ "it cs \n\t"
++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" // only fetch (and advance ptr) while ptr <= bytestream_end
++#else
++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" // only fetch (and advance ptr) while ptr <= bytestream_end
++#endif
++#endif
++ "lsls %[range] , %[low], #16 \n\t" // are the bottom 16 bits of low all zero? (range is only scratch from here on)
++ "bne 1f \n\t" // no: skip the refill and just store low
++
++ "str %[ptr] , [%[c], %[ptr_off]] \n\t" // refill: commit the bytestream pointer
++ "rev %[tmp] , %[tmp] \n\t"
++ "add %[low] , %[low], %[tmp], lsr #15 \n\t" // low += byte-reversed tmp >> 15
++ "movw %[tmp] , 0xFFFF \n\t"
++ "sub %[low] , %[tmp] \n\t" // low -= 0xFFFF
++ "1: \n\t"
++ "str %[low] , [%[c], %[low_off]] \n\t" // write the updated low back to the context
++ : // Outputs
++ [rv]"+r"(rv),
++ [low]"+r"(low),
++ [range]"=&r"(range),
++ [ptr]"=&r"(ptr),
++ [tmp]"=&r"(tmp)
++ : // Inputs
++ [c]"r"(c),
++ [low_off]"J"(offsetof(CABACContext, low)),
++ [range_off]"J"(offsetof(CABACContext, range)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++ [end_off]"J"(offsetof(CABACContext, bytestream_end))
++ : // Clobbers
++ "memory", "cc"
++ );
++ return rv;
++}
++
+ #endif /* HAVE_ARMV6T2_INLINE */
+
+ #endif /* AVCODEC_ARM_CABAC_H */
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_cabac.h
+@@ -0,0 +1,607 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVC_CABAC_H
++#define AVCODEC_ARM_HEVC_CABAC_H
++
++#include "config.h"
++#if HAVE_ARMV6T2_INLINE
++
++#define hevc_mem_bits32 hevc_mem_bits32_arm
++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
++{ // Loads the 32-bit word at byte offset bits / 8 from p, byte-reverses it, then shifts left by bits % 8
++ unsigned int n;
++ __asm__ (
++ "rev %[n], %[x] \n\t" // n = x with its four bytes reversed
++ : [n]"=r"(n)
++ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3))) // word starting at byte (bits >> 3); no alignment adjustment is made
++ :
++ );
++ return n << (bits & 7); // shift out the (bits & 7) most-significant bits
++}
++
++
++// ---------------------------------------------------------------------------
++//
++// Helper fns - little bits of code where ARM has an instruction that the
++// compiler doesn't know about / use
++
++#define trans_scale_sat trans_scale_sat_arm
++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{ // Returns ((level * scale * scale_m) >> shift) + 1, halved arithmetically and saturated to the signed 16-bit range
++ int rv;
++ int t = ((level * (int)(scale * scale_m)) >> shift) + 1; // the + 1 is applied before the final >> 1 done by ssat
++
++ __asm__ (
++ "ssat %[rv], #16, %[t], ASR #1 \n\t" // rv = saturate_s16(t >> 1) in a single instruction
++ : [rv]"=r"(rv)
++ : [t]"r"(t)
++ :
++ );
++ return rv;
++}
++
++#define update_rice update_rice_arm
++static inline void update_rice_arm(uint8_t * const stat_coeff, // statistic byte, updated in place
++ const unsigned int last_coeff_abs_level_remaining,
++ const unsigned int c_rice_param)
++{ // Decrements *stat_coeff when t == 0 and increments it when t >= 6, clamped to 0..255
++ int t = last_coeff_abs_level_remaining << 1;
++ __asm__ (
++ "lsrs %[t], %[t], %[shift] \n\t" // t >>= c_rice_param, Z set when the result is zero
++
++ "it eq \n\t"
++ "subeq %[stat], %[stat], #1 \n\t" // t == 0: stat -= 1
++ "cmp %[t], #6 \n\t" // carry set when t >= 6 (unsigned)
++ "adc %[stat], %[stat], #0 \n\t" // stat += carry
++ "usat %[stat], #8, %[stat] \n\t" // clamp stat to 0..255
++ : [stat]"+r"(*stat_coeff),
++ [t]"+r"(t)
++ : [shift]"r"(c_rice_param)
++ : "cc"
++ );
++}
++
++// ---------------------------------------------------------------------------
++//
++// CABAC get loops
++//
++// Where the loop is simple enough we can normally do 10-30% better than the
++// compiler
++
++// Get the residual greater than 1 bits
++// Decodes bins until i == n (so presumably n >= 1) and returns them packed, earliest bin in the highest used bit
++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
++ uint8_t * const state0)
++{
++ unsigned int i, reg_b, st, tmp, bit, rv;
++ __asm__ (
++ "mov %[i] , #0 \n\t" // i = number of bins decoded so far
++ "mov %[rv] , #0 \n\t"
++ "1: \n\t"
++ "add %[i] , %[i] , #1 \n\t"
++ "cmp %[rv] , #0 \n\t"
++ "ite eq \n\t"
++ "usateq %[st] , #2 , %[i] \n\t" // no 1 bin decoded yet: st = min(i, 3)
++ "movne %[st] , #0 \n\t" // otherwise st = 0
++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" // r_b = ff_h264_cabac_tables + H264_LPS_RANGE_OFFSET
++ "and %[tmp] , %[range] , #0xC0 \n\t"
++
++ "ldrb %[bit] , [%[state0], %[st]] \n\t" // bit = state0[st]
++ "add %[r_b] , %[r_b] , %[bit] \n\t"
++ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t" // tmp = LPS table byte at [state + 2 * (range & 0xC0)]
++ "sub %[range] , %[range] , %[tmp] \n\t"
++
++ "cmp %[low] , %[range], lsl #17 \n\t"
++ "ittt ge \n\t"
++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" // low >= range << 17: low -= range << 17,
++ "movge %[range] , %[tmp] \n\t" // range = tmp,
++ "mvnge %[bit] , %[bit] \n\t" // and invert the state value
++
++ "clz %[tmp] , %[range] \n\t"
++ "sub %[tmp] , #23 \n\t" // tmp = clz(range) - 23 = renormalisation shift
++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" // next state, indexed by the (possibly inverted) state value
++ "and %[bit] , %[bit] , #1 \n\t" // decoded bin = bottom bit
++ "strb %[r_b] , [%[state0], %[st]] \n\t" // write the new state back
++ "lsl %[low] , %[low] , %[tmp] \n\t"
++ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" // rv = (rv << 1) | bin
++ "lsl %[range] , %[range] , %[tmp] \n\t"
++
++// There is a small speed gain from combining both conditions, using a single
++// branch and then working out what that meant later
++ "lsls %[tmp] , %[low] , #16 \n\t" // tmp = low << 16, zero when a reload is needed
++ "it ne \n\t"
++ "cmpne %[n] , %[i] \n\t"
++ "bne 1b \n\t" // loop while no reload is needed and i != n
++
++// If reload is not required then we must have run out of flags to decode
++ "tst %[tmp] , %[tmp] \n\t"
++ "bne 2f \n\t"
++
++// Do reload
++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" // fetch 2 bytes, bptr += 2
++ "rbit %[bit] , %[low] \n\t"
++ "movw %[r_b] , #0xFFFF \n\t"
++ "clz %[bit] , %[bit] \n\t" // bit = number of trailing zero bits in low
++ "rev %[tmp] , %[tmp] \n\t"
++ "sub %[bit] , %[bit] , #16 \n\t"
++ "cmp %[n] , %[i] \n\t" // flags for the final bne; the instructions in between leave them alone
++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" // tmp = (byte-reversed fetch >> 15) - 0xFFFF
++
++#if CONFIG_THUMB
++ "lsl %[tmp] , %[tmp] , %[bit] \n\t"
++ "add %[low] , %[low] , %[tmp] \n\t" // low += tmp << bit
++#else
++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" // low += tmp << bit
++#endif
++
++ "bne 1b \n\t" // more bins to decode (i != n)
++ "2: \n\t"
++ : [bit]"=&r"(bit),
++ [low]"+r"(c->low),
++ [range]"+r"(c->range),
++ [r_b]"=&r"(reg_b),
++ [bptr]"+r"(c->bytestream),
++ [i]"=&r"(i),
++ [tmp]"=&r"(tmp),
++ [st]"=&r"(st),
++ [rv]"=&r"(rv)
++ : [state0]"r"(state0),
++ [n]"r"(n),
++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++ : "memory", "cc"
++ );
++ return rv;
++}
++
++
++// n must be > 0 on entry. For each index n down to 1: decodes a bin with state0[ctx_map[index]] and appends the index to p when the bin is 1; returns the advanced p
++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
++ unsigned int n,
++ const uint8_t * ctx_map,
++ uint8_t * p)
++{
++ unsigned int reg_b, tmp, st, bit;
++ __asm__ (
++// Get bin from map
++#if CONFIG_THUMB
++ "add %[ctx_map] , %[n] \n\t"
++ "ldrb %[st] , [%[ctx_map]] \n\t" // st = ctx_map[n], ctx_map left pointing at that entry
++#else
++ "ldrb %[st] , [%[ctx_map], %[n]]! \n\t" // st = ctx_map[n], ctx_map left pointing at that entry
++#endif
++ "1: \n\t"
++
++// Load state & ranges
++ "ldrb %[bit] , [%[state0], %[st]] \n\t" // bit = state0[st]
++ "and %[tmp] , %[range] , #0xC0 \n\t"
++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" // r_b = ff_h264_cabac_tables + H264_LPS_RANGE_OFFSET
++ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t"
++ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t" // tmp = LPS table byte at [state + 2 * (range & 0xC0)]
++ "sub %[range] , %[range] , %[tmp] \n\t"
++
++ "cmp %[low] , %[range], lsl #17 \n\t"
++ "ittt ge \n\t"
++ "mvnge %[bit] , %[bit] \n\t" // low >= range << 17: invert the state value,
++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" // low -= range << 17,
++ "movge %[range] , %[tmp] \n\t" // range = tmp
++
++// Renorm
++ "clz %[tmp] , %[range] \n\t"
++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" // next state
++ "sub %[tmp] , #23 \n\t" // tmp = clz(range) - 23 = renormalisation shift
++ "strb %[r_b] , [%[state0], %[st]] \n\t" // write the new state back
++ "tst %[bit] , #1 \n\t" // NE when the decoded bin is 1; consumed by the conditional store below
++ "ldrb %[st] , [%[ctx_map], #-1]! \n\t" // pre-load st for the next (lower) index
++ "lsl %[low] , %[low] , %[tmp] \n\t"
++// GCC asm seems to need strbne written differently for thumb and arm
++#if CONFIG_THUMB
++ "it ne \n\t"
++ "strbne %[n] , [%[idx]] , #1 \n\t" // bin == 1: *p++ = n
++#else
++ "strneb %[n] , [%[idx]] , #1 \n\t" // bin == 1: *p++ = n
++#endif
++
++// There is a small speed gain from combining both conditions, using a single
++// branch and then working out what that meant later
++ "subs %[n] , %[n] , #1 \n\t"
++ "lsl %[range] , %[range] , %[tmp] \n\t"
++#if CONFIG_THUMB
++ "itt ne \n\t" // covers the lslsne and the bne that follows the endif
++ "lslsne %[tmp] , %[low] , #16 \n\t"
++#else
++ "lslnes %[tmp] , %[low] , #16 \n\t"
++#endif
++ "bne 1b \n\t" // loop while n != 0 and no reload is needed
++
++// If we have bits left then n must be 0 so give up now
++ "lsls %[tmp] , %[low] , #16 \n\t"
++ "bne 2f \n\t"
++
++// Do reload
++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" // fetch 2 bytes, bptr += 2
++ "rbit %[bit] , %[low] \n\t"
++ "movw %[r_b] , #0xFFFF \n\t"
++ "clz %[bit] , %[bit] \n\t" // bit = number of trailing zero bits in low
++ "cmp %[n] , #0 \n\t" // flags for the final bne; the instructions in between leave them alone
++ "rev %[tmp] , %[tmp] \n\t"
++ "sub %[bit] , %[bit] , #16 \n\t"
++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" // tmp = (byte-reversed fetch >> 15) - 0xFFFF
++
++#if CONFIG_THUMB
++ "lsl %[tmp] , %[tmp] , %[bit] \n\t"
++ "add %[low] , %[low] , %[tmp] \n\t" // low += tmp << bit
++#else
++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" // low += tmp << bit
++#endif
++
++// Check to see if we still have more to do
++ "bne 1b \n\t"
++ "2: \n\t"
++ : [bit]"=&r"(bit),
++ [low]"+r"(c->low),
++ [range]"+r"(c->range),
++ [r_b]"=&r"(reg_b),
++ [bptr]"+r"(c->bytestream),
++ [idx]"+r"(p),
++ [n]"+r"(n),
++ [tmp]"=&r"(tmp),
++ [st]"=&r"(st),
++ [ctx_map]"+r"(ctx_map)
++ : [state0]"r"(state0),
++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++ : "memory", "cc"
++ );
++
++ return p;
++}
++
++// ---------------------------------------------------------------------------
++//
++// CABAC_BY22 functions
++
++
++#define get_cabac_by22_start get_cabac_by22_start_arm
++static inline void get_cabac_by22_start_arm(CABACContext * const c)
++{ // Rewrites c->low, c->bytestream and by22.bits (plus by22.range and c->range when !USE_BY22_DIV) for the by22 bypass routines
++ const uint8_t *ptr = c->bytestream;
++ register uint32_t low __asm__("r1"), range __asm__("r2"); // fixed registers: ldmia loads ascending registers from ascending addresses
++ uint32_t m, range8, bits;
++#if !USE_BY22_DIV
++ uintptr_t inv;
++#endif
++
++ av_assert2(offsetof (CABACContext, low) == 0); // layout the ldmia below relies on
++ av_assert2(offsetof (CABACContext, range) == 4);
++ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2); // lets one word store set bits and range
++ __asm__ volatile (
++ "ldmia %[c], {%[low], %[range]} \n\t" // low = c->low, range = c->range
++ : // Outputs
++ [low]"=r"(low),
++ [range]"=r"(range)
++ : // Inputs
++ [c]"r"(c)
++ : // Clobbers
++ );
++#if !USE_BY22_DIV
++ inv = (uintptr_t)cabac_by22_inv_range; // table base; replaced by the looked-up entry inside the asm
++#endif
++ __asm__ volatile (
++ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" // m = 4 bytes at ptr, then ptr -= CABAC_BITS / 8
++#if !USE_BY22_DIV
++ "uxtb %[range8], %[range] \n\t" // range8 = range & 0xFF
++#endif
++ "rbit %[bits], %[low] \n\t"
++ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t"
++ "clz %[bits], %[bits] \n\t" // bits = number of trailing zero bits in the original low
++ "str %[ptr], [%[c], %[ptr_off]] \n\t" // c->bytestream = adjusted ptr
++ "rev %[m], %[m] \n\t"
++ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" // ptr register reused as shift count = 9 + CABAC_BITS - bits
++ "eor %[m], %[m], #0x80000000 \n\t" // flip the top bit of m
++#if !USE_BY22_DIV
++ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t" // inv = 32-bit entry range8 of cabac_by22_inv_range
++ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t" // range = (range << 16) | (bits & 0xFFFF)
++ "str %[range], [%[c], %[bits_off]] \n\t" // one word store sets by22.bits and the by22.range halfword after it
++#else
++ "strh %[bits], [%[c], %[bits_off]] \n\t" // by22.bits = bits
++#endif
++#if CONFIG_THUMB
++ "lsr %[m], %[ptr] \n\t"
++ "eor %[range], %[low], %[m] \n\t" // range register now holds the new low = low ^ (m >> shift)
++#else
++ "eor %[range], %[low], %[m], lsr %[ptr] \n\t" // range register now holds the new low = low ^ (m >> shift)
++#endif
++ : // Outputs
++ [ptr]"+&r"(ptr),
++ [low]"+&r"(low),
++ [range]"+&r"(range),
++#if !USE_BY22_DIV
++ [inv]"+&r"(inv),
++#endif
++ [m]"=&r"(m),
++ [range8]"=&r"(range8),
++ [bits]"=&r"(bits)
++ : // Inputs
++ [c]"r"(c),
++ [bits_off]"J"(offsetof (CABACContext, by22.bits)),
++ [ptr_off]"J"(offsetof (CABACContext, bytestream))
++ : // Clobbers
++ "memory"
++ );
++ c->low = range; // the asm leaves the new low value in the range register
++#if !USE_BY22_DIV
++ c->range = inv; // c->range now carries the table entry read back by the peek routines
++#endif
++}
++
++#define get_cabac_by22_peek get_cabac_by22_peek_arm
++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
++{ // Returns twice (c->low & ~1), first replaced by the high word of its product with c->range when c->range != 0
++ uint32_t rv = c->low &~ 1, tmp;
++ __asm__ (
++ "cmp %[inv] , #0 \n\t"
++ "it ne \n\t"
++ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" // inv != 0: rv = high 32 bits of inv * rv (low word lands in tmp and is unused)
++ : // Outputs
++ [rv]"+r"(rv),
++ [tmp]"=r"(tmp)
++ : // Inputs
++ [inv]"r"(c->range)
++ : // Clobbers
++ "cc"
++ );
++ return rv << 1;
++}
++
++#define get_cabac_by22_flush get_cabac_by22_flush_arm
++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val)
++{ // Consumes n bits: adds n to by22.bits and recomputes c->low from the top n bits of val, by22.range and fresh bitstream bytes
++ uint32_t bits, ptr, tmp1, tmp2;
++ __asm__ volatile (
++ "ldrh %[bits], [%[cc], %[bits_off]] \n\t"
++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
++ "rsb %[tmp1], %[n], #32 \n\t"
++ "add %[bits], %[bits], %[n] \n\t" // bits = by22.bits + n
++ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" // tmp2 = by22.range
++ "lsr %[tmp1], %[val], %[tmp1] \n\t" // tmp1 = val >> (32 - n) = top n bits of val
++ "ldr %[val], [%[cc], %[low_off]] \n\t" // val register reused for c->low
++#if CONFIG_THUMB
++ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t"
++ "ldr %[ptr], [%[ptr]] \n\t" // ptr = 4 bytes at bytestream + bits / 8
++#else
++ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" // ptr = 4 bytes at bytestream + bits / 8
++#endif
++ "mul %[tmp1], %[tmp2], %[tmp1] \n\t" // tmp1 = by22.range * consumed bits
++ "and %[tmp2], %[bits], #7 \n\t" // tmp2 = bit offset inside the first fetched byte
++ "strh %[bits], [%[cc], %[bits_off]] \n\t" // by22.bits = bits
++ "rev %[ptr], %[ptr] \n\t"
++ "lsl %[tmp1], %[tmp1], #23 \n\t"
++#if CONFIG_THUMB
++ "lsl %[val], %[n] \n\t"
++ "sub %[val], %[tmp1] \n\t" // val = (low << n) - tmp1
++#else
++ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" // val = (low << n) - tmp1
++#endif
++ "lsl %[ptr], %[ptr], %[tmp2] \n\t"
++ "orr %[val], %[val], %[ptr], lsr #9 \n\t" // merge the new stream bits in below bit 23
++ "str %[val], [%[cc], %[low_off]] \n\t" // c->low = val
++ : // Outputs
++ [val]"+r"(val),
++ [bits]"=&r"(bits),
++ [ptr]"=&r"(ptr),
++ [tmp1]"=&r"(tmp1),
++ [tmp2]"=&r"(tmp2)
++ : // Inputs
++ [cc]"r"(c),
++ [n]"r"(n),
++ [bits_off]"J"(offsetof(CABACContext, by22.bits)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++ [range_off]"J"(offsetof(CABACContext, by22.range)),
++ [low_off]"J"(offsetof(CABACContext, low))
++ : // Clobbers
++ "memory"
++ );
++}
++
++#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm
++static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param)
++{ // Decodes one prefix/suffix coded value from the by22 bypass state; updates by22.bits and c->low. Three paths: prefix <= 2, prefix >= 3 within the peek window, and a two-step path
++ uint32_t last_coeff_abs_level_remaining;
++ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2;
++ __asm__ volatile (
++ "ldr %[remain], [%[cc], %[low_off]] \n\t"
++ "ldr %[prefix], [%[cc], %[range_off]] \n\t" // prefix register temporarily holds c->range
++ "bic %[remain], %[remain], #1 \n\t"
++ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" // tmp2 = by22.bits
++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
++ "cmp %[prefix], #0 \n\t"
++ "it ne \n\t"
++ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" // c->range != 0: remain = high word of c->range * remain
++ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t" // range = by22.range
++ "lsl %[remain], %[remain], #1 \n\t" // same computation as the by22 peek: remain = peeked bits
++ "mvn %[prefix], %[remain] \n\t"
++ "clz %[prefix], %[prefix] \n\t" // prefix = number of leading 1 bits in the peeked bits
++ "rsbs %[n1], %[prefix], #2 \n\t" // carry clear when prefix > 2
++ "bcc 1f \n\t"
++ "adc %[n1], %[rice], %[prefix] \n\t" // prefix <= 2 (carry set): n1 = rice + prefix + 1 bits consumed
++ "add %[tmp2], %[tmp2], %[n1] \n\t"
++ "rsb %[n2], %[n1], #32 \n\t"
++ "and %[tmp1], %[tmp2], #7 \n\t"
++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" // by22.bits += n1
++ "lsr %[tmp2], %[tmp2], #3 \n\t"
++ "lsr %[n2], %[remain], %[n2] \n\t" // n2 = top n1 bits of the peek
++ "mul %[n2], %[range], %[n2] \n\t"
++ "ldr %[range], [%[cc], %[low_off]] \n\t" // range register reused for c->low
++ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t" // 4 bytes at bytestream + by22.bits / 8
++ "rsb %[tmp2], %[rice], #31 \n\t"
++ "lsl %[remain], %[remain], %[prefix] \n\t"
++ "lsl %[n2], %[n2], #23 \n\t"
++#if CONFIG_THUMB
++ "lsl %[range], %[n1] \n\t"
++ "sub %[range], %[n2] \n\t"
++#else
++ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t"
++#endif
++ "rev %[ptr], %[ptr] \n\t"
++ "lsl %[n2], %[prefix], %[rice] \n\t"
++#if CONFIG_THUMB
++ "lsr %[remain], %[tmp2] \n\t"
++ "add %[remain], %[n2] \n\t" // result = (prefix << rice) + ((peek << prefix) >> (31 - rice))
++#else
++ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t" // result = (prefix << rice) + ((peek << prefix) >> (31 - rice))
++#endif
++ "b 3f \n\t" // shared tail merges the new stream bits into low
++ "1: \n\t" // prefix >= 3
++ "add %[n2], %[rice], %[prefix], lsl #1 \n\t" // n2 = rice + 2 * prefix
++ "cmp %[n2], %[peek_bits_plus_2] \n\t"
++ "bhi 2f \n\t" // n2 > CABAC_BY22_PEEK_BITS + 2: take the two-step path
++ "sub %[n1], %[n2], #2 \n\t" // n1 = rice + 2 * prefix - 2 bits consumed
++ "add %[tmp2], %[tmp2], %[n1] \n\t"
++ "rsb %[n2], %[n1], #32 \n\t"
++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" // by22.bits += n1
++ "lsr %[tmp1], %[tmp2], #3 \n\t"
++ "lsr %[n2], %[remain], %[n2] \n\t"
++ "mul %[n2], %[range], %[n2] \n\t"
++ "rsb %[range], %[rice], #34 \n\t"
++ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t"
++ "and %[tmp1], %[tmp2], #7 \n\t"
++ "lsl %[remain], %[remain], %[prefix] \n\t"
++ "ldr %[tmp2], [%[cc], %[low_off]] \n\t"
++ "rsb %[prefix], %[prefix], %[range] \n\t" // prefix register = 34 - rice - prefix = final right shift
++ "orr %[remain], %[remain], #0x80000000 \n\t" // set the top bit (where the prefix terminator sat)
++ "rev %[ptr], %[ptr] \n\t"
++ "lsl %[n2], %[n2], #23 \n\t"
++ "mov %[range], #2 \n\t"
++#if CONFIG_THUMB
++ "lsl %[tmp2], %[n1] \n\t"
++ "sub %[tmp2], %[n2] \n\t"
++#else
++ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t"
++#endif
++ "lsl %[ptr], %[ptr], %[tmp1] \n\t"
++ "lsl %[rice], %[range], %[rice] \n\t" // rice register = 2 << rice
++ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" // range register = finished new low
++#if CONFIG_THUMB
++ "lsr %[remain], %[prefix] \n\t"
++ "add %[remain], %[rice] \n\t" // result = (2 << rice) + (shifted peek >> final shift)
++#else
++ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t" // result = (2 << rice) + (shifted peek >> final shift)
++#endif
++ "b 4f \n\t" // low already complete: skip the merge at 3
++ "2: \n\t" // two-step path: consume the prefix bits, recompute the peek, then take the suffix
++ "add %[n1], %[tmp2], %[prefix] \n\t" // n1 = by22.bits + prefix
++#if CONFIG_THUMB
++ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t"
++ "ldr %[tmp2], [%[tmp2]] \n\t"
++#else
++ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t"
++#endif
++ "rsb %[tmp1], %[prefix], #32 \n\t"
++ "push {%[rice]} \n\t" // rice register is needed as scratch; restored by the pop below
++ "and %[rice], %[n1], #7 \n\t"
++ "lsr %[tmp1], %[remain], %[tmp1] \n\t"
++ "ldr %[ptr], [%[cc], %[low_off]] \n\t"
++ "mul %[remain], %[range], %[tmp1] \n\t"
++ "rev %[tmp2], %[tmp2] \n\t"
++ "rsb %[n2], %[prefix], %[n2] \n\t" // n2 = rice + prefix
++ "ldr %[tmp1], [%[cc], %[range_off]] \n\t" // tmp1 = c->range for the second peek
++ "lsl %[rice], %[tmp2], %[rice] \n\t"
++ "sub %[tmp2], %[n2], #2 \n\t" // tmp2 = rice + prefix - 2 = bits consumed by the second step
++ "lsl %[remain], %[remain], #23 \n\t"
++#if CONFIG_THUMB
++ "lsl %[ptr], %[prefix] \n\t"
++ "rsb %[remain], %[ptr] \n\t"
++#else
++ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t"
++#endif
++ "orr %[remain], %[remain], %[rice], lsr #9 \n\t" // remain = low after consuming the prefix bits
++ "add %[prefix], %[n1], %[tmp2] \n\t" // prefix register = final by22.bits value
++ "bic %[n1], %[remain], #1 \n\t"
++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
++ "cmp %[tmp1], #0 \n\t"
++ "rsb %[rice], %[tmp2], #32 \n\t"
++ "it ne \n\t"
++ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" // second peek: n1 = high word of c->range * n1
++ "and %[tmp1], %[prefix], #7 \n\t"
++#if CONFIG_THUMB
++ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t"
++ "ldr %[ptr], [%[ptr]] \n\t"
++#else
++ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t"
++#endif
++ "lsl %[n1], %[n1], #1 \n\t"
++ "lsr %[rice], %[n1], %[rice] \n\t"
++ "rsb %[n2], %[n2], #34 \n\t"
++ "mul %[range], %[range], %[rice] \n\t"
++ "pop {%[rice]} \n\t" // original rice_param back in its register
++ "rev %[ptr], %[ptr] \n\t"
++ "orr %[n1], %[n1], #0x80000000 \n\t"
++ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t" // by22.bits = final bit position
++ "mov %[prefix], #2 \n\t"
++ "lsl %[range], %[range], #23 \n\t"
++#if CONFIG_THUMB
++ "lsl %[remain], %[tmp2] \n\t"
++ "rsb %[range], %[remain] \n\t"
++#else
++ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t"
++#endif
++ "lsl %[remain], %[prefix], %[rice] \n\t" // remain = 2 << rice
++#if CONFIG_THUMB
++ "lsr %[n1], %[n2] \n\t"
++ "add %[remain], %[n1] \n\t" // result = (2 << rice) + (second peek >> n2)
++#else
++ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t" // result = (2 << rice) + (second peek >> n2)
++#endif
++ "3: \n\t" // shared tail for the first and third paths
++ "lsl %[ptr], %[ptr], %[tmp1] \n\t"
++ "orr %[range], %[range], %[ptr], lsr #9 \n\t" // merge the new stream bits into low
++ "4: \n\t"
++ "str %[range], [%[cc], %[low_off]] \n\t" // c->low = new low
++ : // Outputs
++ [remain]"=&r"(last_coeff_abs_level_remaining),
++ [rice]"+r"(rice_param),
++ [prefix]"=&r"(prefix),
++ [n1]"=&r"(n1),
++ [range]"=&r"(range),
++ [n2]"=&r"(n2),
++ [ptr]"=&r"(ptr),
++ [tmp1]"=&r"(tmp1),
++ [tmp2]"=&r"(tmp2)
++ : // Inputs
++ [cc]"r"(c),
++ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2),
++ [low_off]"J"(offsetof(CABACContext, low)),
++ [range_off]"J"(offsetof(CABACContext, range)),
++ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)),
++ [by22_range_off]"J"(offsetof(CABACContext, by22.range)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream))
++ : // Clobbers
++ "cc", "memory"
++ );
++ return last_coeff_abs_level_remaining;
++}
++
++#endif /* HAVE_ARMV6T2_INLINE */
++
++#endif /* AVCODEC_ARM_HEVC_CABAC_H */
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
+@@ -0,0 +1,183 @@
++/*
++ * ARM NEON optimised IDCT functions for HEVC decoding
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++@ Included multiple times from hevc_idct_neon.S
++@ Macros defined there
++
++#define DC_SHIFT (15 - BIT_DEPTH)
++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH)))
++#define TRN_SHIFT (20 - BIT_DEPTH)
++
++function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1 @ Fill the 16 halfwords at r0 with the biased, shifted first halfword
++ ldrsh r1, [r0] @ r1 = first halfword, sign-extended
++ add r1, #DC_ADD @ add the bias constant before shifting
++ asr r1, #DC_SHIFT
++ vdup.16 q0, r1
++ vdup.16 q1, r1 @ q0 and q1 = 16 copies of the result
++ vst1.16 {q0, q1}, [r0] @ overwrite 32 bytes at r0
++ bx lr
++endfunc
++
++function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1 @ Fill the 64 halfwords at r0 with the biased, shifted first halfword
++ ldrsh r1, [r0] @ r1 = first halfword, sign-extended
++ add r2, r0, #32 @ second store pointer, 32 bytes ahead of r0
++ mov r3, #64 @ each pointer advances 64 bytes per store
++ add r1, #DC_ADD
++ asr r1, #DC_SHIFT
++ vdup.16 q8, r1
++ vdup.16 q9, r1 @ q8 and q9 = 16 copies of the result
++ vst1.16 {q8, q9}, [r0], r3 @ bytes 0-31
++ vst1.16 {q8, q9}, [r2], r3 @ bytes 32-63
++ vst1.16 {q8, q9}, [r0] @ bytes 64-95
++ vst1.16 {q8, q9}, [r2] @ bytes 96-127
++ bx lr
++endfunc
++
++function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1 @ Fill the 256 halfwords at r0 with the biased, shifted first halfword
++ ldrsh r1, [r0] @ r1 = first halfword, sign-extended
++ add r2, r0, #32 @ second store pointer, 32 bytes ahead of r0
++ mov r3, #64 @ each pointer advances 64 bytes per store
++ add r1, #DC_ADD
++ mov ip, #16*16 @ ip = halfwords still to write
++ asr r1, #DC_SHIFT
++ vdup.16 q8, r1
++ vdup.16 q9, r1
++1: vst1.16 {q8, q9}, [r0], r3
++ subs ip, ip, #32 @ 32 halfwords are written per iteration
++ vst1.16 {q8, q9}, [r2], r3 @ the store leaves the flags from subs intact
++ bhi 1b @ loop while ip is still above zero
++ bx lr
++endfunc
++
++function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1 @ Fill the 1024 halfwords at r0 with the biased, shifted first halfword
++ ldrsh r1, [r0] @ r1 = first halfword, sign-extended
++ add r2, r0, #32 @ second store pointer, 32 bytes ahead of r0
++ mov r3, #64 @ each pointer advances 64 bytes per store
++ add r1, #DC_ADD
++ mov ip, #32*32 @ ip = halfwords still to write
++ asr r1, #DC_SHIFT
++ vdup.16 q8, r1
++ vdup.16 q9, r1
++1: vst1.16 {q8, q9}, [r0], r3
++ subs ip, ip, #32 @ 32 halfwords are written per iteration
++ vst1.16 {q8, q9}, [r2], r3 @ the store leaves the flags from subs intact
++ bhi 1b @ loop while ip is still above zero
++ bx lr
++endfunc
++
++
++function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1 @ In-place transform of the 32 bytes of coefficients at r0, as two tr4_shift passes
++ vldr.i32 s0, =0x00240053 // 36 and 83
++ vld1.16 {q14, q15}, [r0 :256] // coeffs
++
++ tr4_shift #7 @ first pass, shift 7 (macro defined in the including file)
++
++ vzip.16 d28, d29 @ the three vzip steps re-interleave q14/q15 between the passes - presumably a transpose
++ vzip.16 d30, d31
++ vzip.32 q14, q15
++
++ tr4_shift #TRN_SHIFT @ second pass, shift depends on the bit depth
++
++ vst4.16 {q14, q15}, [r0 :256] @ write the result back over the input
++ bx lr
++
++ .ltorg @ literal pool, holds the constant loaded by the vldr above
++endfunc
++
++
++
++function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1 @ In-place transform of the 32 bytes of coefficients at r0, as two tr4_luma_shift passes
++ vmov.i32 d0, #0x4a // 74
++ vld1.16 {q14, q15}, [r0 :256] // coeffs
++ vmov.i32 d1, #0x1d // 29
++ vmov.i32 d2, #0x37 // 55
++
++ tr4_luma_shift #7 @ first pass, shift 7 (macro defined in the including file)
++
++ vzip.16 d28, d29 @ the three vzip steps re-interleave q14/q15 between the passes - presumably a transpose
++ vzip.16 d30, d31
++ vzip.32 q14, q15
++
++ tr4_luma_shift #TRN_SHIFT @ second pass, shift depends on the bit depth
++
++ vst4.16 {q14, q15}, [r0 :256] @ write the result back over the input
++ bx lr
++endfunc
++
++function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1 @ 8x8 transform of the coefficients at r0; r1 <= 4 skips the second vertical pass
++ add r2, r0, #16 @ r2 = second pointer, 16 bytes past r0
++ adr r3, tr4f @ tr4f is defined outside this section - presumably the constant table
++ vpush {d8-d15} @ saved here and restored by the vpop before the return
++ vld1.16 {d0, d1}, [r3]
++ mov r3, #32
++
++ tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \
++ "sub r0, r0, #128-8", \
++ "sub r2, r2, #128-8", \
++ "cmp r1, #4"
++ ble 2f @ flags presumably from the cmp handed to tr8_vert: r1 <= 4 takes the short path at 2
++
++ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \
++ "sub r0, r0, #128+8", \
++ "sub r2, r2, #128+8+16-32", \
++ "mov r3, #64"
++
++ vzip.16 d16, d17
++ vzip.16 d18, d19
++
++ vzip.16 d20, d21
++ vzip.16 d22, d23
++ vzip.16 d28, d29
++ vzip.16 d30, d31
++ vzip.32 q10, q11
++ vzip.32 q14, q15
++1: @ both paths join here
++ vzip.16 d24, d25
++ vzip.16 d26, d27
++ vzip.32 q8, q9
++ vzip.32 q12, q13
++
++ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT
++ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT
++
++ vpop {d8-d15}
++ bx lr
++
++2: vmov.i64 q10, #0 @ short path: zero the registers the skipped tr8_vert would have produced
++ sub r0, r0, #8 @ same pointer and stride fix-ups as the skipped pass, less its 128-byte advance
++ vmov.i64 q11, #0
++ sub r2, r2, #8+16-32
++ vmov.i64 q14, #0
++ mov r3, #64
++ vmov.i64 q15, #0
++
++ vzip.16 d16, d17
++ vzip.16 d18, d19
++
++ b 1b
++
++endfunc
++
++#undef DC_SHIFT
++#undef DC_ADD
++#undef TRN_SHIFT
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_misc_neon.S
+@@ -0,0 +1,267 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Written by John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ rpi_zap_coeff_vals_neon(
++@ uint16_t * buf, [r0]
++@ unsigned int log_n_m2) [r1]
++
++function rpi_zap_coeff_vals_neon, export=1 @ Zero 4^r1 blocks of 32 bytes starting at r0
++ mov ip, #1
++ vmov.i64 q0, #0
++ teq r1, #0
++ vmov.i64 q1, #0
++ beq 2f @ r1 == 0: a single 32-byte store is enough
++
++ lsl ip, r1 @ 2, 4 or 8
++ add r2, r0, #32 @ second store pointer, 32 bytes ahead of r0
++ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero
++ mov r3, #64 @ each pointer advances 64 bytes per store
++1: vst1.8 {q0,q1}, [r0:256], r3
++ subs ip, #2 @ two blocks are zeroed per iteration
++ vst1.8 {q0,q1}, [r2:256], r3
++ bne 1b
++ bx lr
++
++2: vst1.8 {q0,q1}, [r0:256]
++ bx lr
++endfunc
++
++@ PIC jump tables are more expensive than absolute for A32 code
++.set jent_pic, CONFIG_PIC || CONFIG_THUMB
++
++@ Jump table entry - if in Thumb mode the bottom bit must be set
++@ ? There is probably a real asm instruction to do this but I haven't found it
++.macro jent lab @ emit one jump table entry for label lab, relative to table label 98
++.if jent_pic
++T .short ((0 + \lab) - (0 + 98b)) / 2 @ halfword count from the table start, the form tbh consumes
++A .short (0 + \lab) - (4 + 98b) @ byte offset; the 4 matches pc as read by the add pc dispatch
++.else
++T .word 1 + \lab @ absolute address with the bottom bit set
++A .word \lab @ absolute address
++.endif
++.endm
++
++.set expected_next, 0
++
++.macro cpy_compound val, p1, p2, drop_thru=0 @ copy a val-byte-wide block as a p1-wide stripe followed by a p2-wide one
++.if \p1 + \p2 != \val
++.error "Bad addition! \p1 + \p2 != \val"
++.endif
++.if expected_next != 0 && expected_next != \val
++.error "Drop thru failure"
++.endif
++\val\(): @ entry label, the target of the jump table entry for this width
++ push {r0-r3} @ the stripe routine changes the pointers and strides
++ bl 100\p1\()b @ call the self-contained entry of the p1-wide copy
++ pop {r0-r3}
++ add r0, #\p1 @ step dst past the stripe just copied
++ add r2, #\p1 @ step src likewise
++.if \drop_thru == 0
++ b \p2\()b @ tail-jump to the p2-wide copy
++.set expected_next, 0
++.else
++.set expected_next, \p2 @ the next cpy_compound must define label p2, checked above
++.endif
++.endm
++
++@ ff_hevc_rpi_cpy_blks8x4_neon(
++@ dst [r0]
++@ dst_stride [r1]
++@ src [r2]
++@ src_stride [r3]
++@ width [sp, #0] (bytes)
++@ height) [sp, #4]
++@
++@ Power of 2 widths are directly coded, all others are done in stripes
++@ We expect the vast majority of calls to be power of 2
++@
++@ Currently has min width of 8, but we could make that 4 without issue
++@ Min height is 4
++
++function ff_hevc_rpi_cpy_blks8x4_neon, export=1
++ ldr r12, [sp, #0] @ r12 = width in bytes (first stack argument)
++ push {r11, lr}
++.if jent_pic
++A adr lr, 98f - 2 @ table base biased by one 2-byte entry, index 0 has no A-mode entry
++.else
++A adr lr, 98f - 4 @ table base biased by one 4-byte entry
++.endif
++ lsr r12, #3 @ r12 = width / 8 = jump table index
++ ldr r11, [sp, #(8 + 4)] @ r11 = height, the stack argument at #4 plus the 8 bytes just pushed
++.if jent_pic
++A lsl r12, #1
++A ldrsh lr, [lr, r12] @ signed 16-bit offset for this width
++A add pc, lr
++T tbh [pc, r12, lsl #1] @ table branch through the halfword table that follows
++.else
++ @ A32 only, Thumb is always PIC
++ ldr pc, [lr, r12, lsl #2]
++.endif
++
++98:
++T .short 0 @ unused
++ jent 8f
++ jent 16f
++ jent 24f
++ jent 32f
++ jent 40f
++ jent 48f
++ jent 56f
++ jent 64f
++ jent 72f
++ jent 80f
++ jent 88f
++ jent 96f
++ jent 104f
++ jent 112f
++ jent 120f
++ jent 128f
++
++1008: @ call entry used by cpy_compound, pushes so the shared pop returns to the caller
++ push {r11, lr}
++8: @ 8 bytes wide, 4 rows per iteration
++ add lr, r2, r3 @ lr = second source row
++ lsl r3, #1 @ source stride doubled, r2 and lr take alternate rows
++ add r12, r0, r1 @ r12 = second destination row
++ lsl r1, #1 @ destination stride doubled likewise
++1:
++ vld1.32 {d0 }, [r2], r3
++ vld1.32 {d1 }, [lr], r3
++ vld1.32 {d2 }, [r2], r3
++ vld1.32 {d3 }, [lr], r3
++ subs r11, #4
++ vst1.32 {d0 }, [r0], r1
++ vst1.32 {d1 }, [r12], r1
++ vst1.32 {d2 }, [r0], r1
++ vst1.32 {d3 }, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10016: @ call entry used by cpy_compound
++ push {r11, lr}
++16: @ 16 bytes wide, 4 rows per iteration
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {q0 }, [r2], r3
++ vld1.32 {q1 }, [lr], r3
++ vld1.32 {q2 }, [r2], r3
++ vld1.32 {q3 }, [lr], r3
++ subs r11, #4
++ vst1.32 {q0 }, [r0], r1
++ vst1.32 {q1 }, [r12], r1
++ vst1.32 {q2 }, [r0], r1
++ vst1.32 {q3 }, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10032: @ call entry used by cpy_compound
++ push {r11, lr}
++32: @ 32 bytes wide, 4 rows per iteration
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {q8, q9 }, [r2], r3
++ vld1.32 {q10, q11}, [lr], r3
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #4
++ vst1.32 {q8, q9 }, [r0], r1
++ vst1.32 {q10, q11}, [r12], r1
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10064: @ call entry used by cpy_compound
++ push {r11, lr}
++64: @ 64 bytes wide, 2 rows per iteration
++ add lr, r2, #32 @ lr = right half of the source row
++ add r12, r0, #32 @ r12 = right half of the destination row
++1:
++ vld1.32 {q8, q9 }, [r2], r3
++ vld1.32 {q10, q11}, [lr], r3
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #2
++ vst1.32 {q8, q9 }, [r0], r1
++ vst1.32 {q10, q11}, [r12], r1
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++128: @ 128 bytes wide, 1 row per iteration
++ push {r4, r5}
++ @ We could do this with fewer registers if we jump around but I
++ @ have a primitive urge to load sequentially
++ mov r4, #64
++ add lr, r2, #32
++ add r12, r0, #32
++ sub r3, r4 @ strides reduced by 64 to cancel the 64-byte step inside each row
++ sub r1, r4
++1:
++ vld1.32 {q8, q9 }, [r2], r4
++ vld1.32 {q10, q11}, [lr], r4
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #1
++ vst1.32 {q8, q9 }, [r0], r4
++ vst1.32 {q10, q11}, [r12], r4
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r4, r5, r11, pc} @ restores r4, r5 from here plus r11 and the return address pushed at entry
++
++@ Use drop_thru where we can
++cpy_compound 104, 64, 40, 1
++cpy_compound 40, 32, 8
++
++cpy_compound 112, 64, 48, 1
++cpy_compound 48, 32, 16
++
++cpy_compound 120, 64, 56, 1
++cpy_compound 56, 32, 24, 1
++cpy_compound 24, 16, 8
++
++cpy_compound 72, 64, 8
++cpy_compound 80, 64, 16
++cpy_compound 88, 64, 24
++cpy_compound 96, 64, 32
++
++
++endfunc
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_misc_neon.h
+@@ -0,0 +1,438 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H
++#define AVCODEC_ARM_RPI_HEVC_MISC_H
++
++#include "config.h"
++#if HAVE_NEON_INLINE && !CONFIG_THUMB
++
++static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src,
++ int pixel_shift, int height,
++ ptrdiff_t stride_src)
++{
++ const uint8_t *src2 = src + stride_src;
++ stride_src <<= 1;
++ switch (pixel_shift)
++ {
++ case 2:
++ __asm__ volatile (
++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t"
++ "subs %[height], #4 \n\t"
++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.32 {q0}, [%[dst]]! \n\t"
++ "beq 3f \n\t"
++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.32 {q1}, [%[dst]]! \n\t"
++ "bne 1b \n\t"
++ "2: \n\t"
++ "vst1.32 {q0}, [%[dst]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t"
++ "vst1.32 {q1}, [%[dst]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [src]"+r"(src),
++ [src2]"+r"(src2),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ case 1:
++ __asm__ volatile (
++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t"
++ "subs %[height], #4 \n\t"
++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t"
++ "vzip.16 d0, d1 \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.16 {d0}, [%[dst]]! \n\t"
++ "beq 3f \n\t"
++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "vzip.16 d2, d3 \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.16 {d2}, [%[dst]]! \n\t"
++ "bne 1b \n\t"
++ "2: \n\t"
++ "vzip.16 d0, d1 \n\t"
++ "vst1.16 {d0}, [%[dst]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t"
++ "vzip.16 d2, d3 \n\t"
++ "vst1.16 {d2}, [%[dst]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [src]"+r"(src),
++ [src2]"+r"(src2),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ default:
++ __asm__ volatile (
++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t"
++ "subs %[height], #8 \n\t"
++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t"
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t"
++ "vzip.8 d0, d1 \n\t"
++ "subs %[height], #8 \n\t"
++ "vst1.8 {d0}, [%[dst]]! \n\t"
++ "beq 3f \n\t"
++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t"
++ "vzip.8 d2, d3 \n\t"
++ "subs %[height], #8 \n\t"
++ "vst1.8 {d2}, [%[dst]]! \n\t"
++ "bne 1b \n\t"
++ "2: \n\t"
++ "vzip.8 d0, d1 \n\t"
++ "vst1.8 {d0}, [%[dst]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t"
++ "vzip.8 d2, d3 \n\t"
++ "vst1.8 {d2}, [%[dst]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [src]"+r"(src),
++ [src2]"+r"(src2),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ }
++}
++
++static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src, // scatter a contiguous run of samples into a strided column
++ int pixel_shift, int height, // sample size is 1 << pixel_shift bytes; height counts samples
++ ptrdiff_t stride_dst) // byte step between consecutive dst samples
++{
++ uint8_t *dst2 = dst + stride_dst; // second store pointer, one sample ahead of dst
++ stride_dst <<= 1; // dst and dst2 alternate, so each advances two samples per store
++ switch (pixel_shift) // every loop below exits only when height reaches exactly 0
++ {
++ case 2: // 32-bit samples, groups of 4: height must be a positive multiple of 4
++ __asm__ volatile (
++ "subs %[height], #4 \n\t"
++ "vld1.32 {q0}, [%[src]]! \n\t" // prime the pipeline with the first group
++ "beq 2f \n\t" // only one group: go straight to the q0 tail
++ "1: \n\t"
++ "vld1.32 {q1}, [%[src]]! \n\t" // fetch the next group while q0 is stored
++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "beq 3f \n\t" // q1 holds the last group
++ "vld1.32 {q0}, [%[src]]! \n\t" // fetch the next group while q1 is stored
++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "2: \n\t" // tail: final group is in q0
++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.32 {d1[0]}, [%[dst]] \n\t" // last two stores do not advance the pointers
++ "vst1.32 {d1[1]}, [%[dst2]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t" // tail: final group is in q1
++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.32 {d3[0]}, [%[dst]] \n\t"
++ "vst1.32 {d3[1]}, [%[dst2]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [dst]"+r"(dst),
++ [dst2]"+r"(dst2),
++ [src]"+r"(src),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory", "q0", "q1" // q0/q1 are overwritten above, so they must be declared to the compiler
++ );
++ break;
++ case 1: // 16-bit samples, groups of 4: height must be a positive multiple of 4
++ __asm__ volatile (
++ "subs %[height], #4 \n\t"
++ "vld1.16 {d0}, [%[src]]! \n\t" // prime the pipeline with the first group
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.16 {d2}, [%[src]]! \n\t" // fetch the next group while d0 is stored
++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "beq 3f \n\t"
++ "vld1.16 {d0}, [%[src]]! \n\t" // fetch the next group while d2 is stored
++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "2: \n\t" // tail: final group is in d0
++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.16 {d0[2]}, [%[dst]] \n\t"
++ "vst1.16 {d0[3]}, [%[dst2]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t" // tail: final group is in d2
++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.16 {d2[2]}, [%[dst]] \n\t"
++ "vst1.16 {d2[3]}, [%[dst2]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [dst]"+r"(dst),
++ [dst2]"+r"(dst2),
++ [src]"+r"(src),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory", "d0", "d2" // d0/d2 are overwritten above, so they must be declared to the compiler
++ );
++ break;
++ default: // 8-bit samples, groups of 8: height must be a positive multiple of 8
++ __asm__ volatile (
++ "subs %[height], #8 \n\t"
++ "vld1.8 {d0}, [%[src]]! \n\t" // prime the pipeline with the first group
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.8 {d2}, [%[src]]! \n\t" // fetch the next group while d0 is stored
++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #8 \n\t"
++ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t"
++ "beq 3f \n\t"
++ "vld1.8 {d0}, [%[src]]! \n\t" // fetch the next group while d2 is stored
++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #8 \n\t"
++ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "2: \n\t" // tail: final group is in d0
++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[6]}, [%[dst]] \n\t"
++ "vst1.8 {d0[7]}, [%[dst2]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t" // tail: final group is in d2
++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[6]}, [%[dst]] \n\t"
++ "vst1.8 {d2[7]}, [%[dst2]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [dst]"+r"(dst),
++ [dst2]"+r"(dst2),
++ [src]"+r"(src),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory", "d0", "d2" // d0/d2 are overwritten above, so they must be declared to the compiler
++ );
++ break;
++ }
++}
++
++static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src, // copy a strided column of samples to another strided column
++ int pixel_shift, int height, // sample size is 1 << pixel_shift bytes; height counts samples
++ ptrdiff_t stride_dst, ptrdiff_t stride_src) // byte steps between consecutive samples
++{
++ int x, y; // scratch registers holding the two samples in flight
++ switch (pixel_shift) // each loop runs at least once and exits only on an exact zero count, so height must be even and >= 4
++ {
++ case 2: // 32-bit samples (ldr/str)
++ __asm__ volatile (
++ "ldr %[x], [%[src]], %[stride_src] \n\t" // prologue: fetch samples 0 and 1
++ "ldr %[y], [%[src]], %[stride_src] \n\t"
++ "str %[x], [%[dst]], %[stride_dst] \n\t" // store sample 0; sample 1 stays pending in y
++ "sub %[height], #2 \n\t" // account for the prologue pair
++ "1: \n\t"
++ "ldr %[x], [%[src]], %[stride_src] \n\t" // loads run one sample ahead of the stores
++ "str %[y], [%[dst]], %[stride_dst] \n\t"
++ "ldr %[y], [%[src]], %[stride_src] \n\t"
++ "subs %[height], #2 \n\t" // two samples consumed per iteration
++ "str %[x], [%[dst]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "str %[y], [%[dst]] \n\t" // epilogue: flush the last pending sample
++ : // Outputs
++ [x]"=&r"(x),
++ [y]"=&r"(y),
++ [src]"+r"(src),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src),
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ case 1: // 16-bit samples (ldrh/strh), same schedule as above
++ __asm__ volatile (
++ "ldrh %[x], [%[src]], %[stride_src] \n\t" // prologue: fetch samples 0 and 1
++ "ldrh %[y], [%[src]], %[stride_src] \n\t"
++ "strh %[x], [%[dst]], %[stride_dst] \n\t"
++ "sub %[height], #2 \n\t" // account for the prologue pair
++ "1: \n\t"
++ "ldrh %[x], [%[src]], %[stride_src] \n\t"
++ "strh %[y], [%[dst]], %[stride_dst] \n\t"
++ "ldrh %[y], [%[src]], %[stride_src] \n\t"
++ "subs %[height], #2 \n\t"
++ "strh %[x], [%[dst]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "strh %[y], [%[dst]] \n\t" // epilogue: flush the last pending sample
++ : // Outputs
++ [x]"=&r"(x),
++ [y]"=&r"(y),
++ [src]"+r"(src),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src),
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ default: // 8-bit samples (ldrb/strb), same schedule as above
++ __asm__ volatile (
++ "ldrb %[x], [%[src]], %[stride_src] \n\t" // prologue: fetch samples 0 and 1
++ "ldrb %[y], [%[src]], %[stride_src] \n\t"
++ "strb %[x], [%[dst]], %[stride_dst] \n\t"
++ "sub %[height], #2 \n\t" // account for the prologue pair
++ "1: \n\t"
++ "ldrb %[x], [%[src]], %[stride_src] \n\t"
++ "strb %[y], [%[dst]], %[stride_dst] \n\t"
++ "ldrb %[y], [%[src]], %[stride_src] \n\t"
++ "subs %[height], #2 \n\t"
++ "strb %[x], [%[dst]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "strb %[y], [%[dst]] \n\t" // epilogue: flush the last pending sample
++ : // Outputs
++ [x]"=&r"(x),
++ [y]"=&r"(y),
++ [src]"+r"(src),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src),
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ }
++}
++
++#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon
++static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src,
++ int pixel_shift, int height,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src)
++{
++ const int pel_bytes = 1 << pixel_shift; // size of one sample; a stride equal to it means that side is contiguous
++ if (stride_dst != pel_bytes && stride_src != pel_bytes) // neither side contiguous: plain strided copy
++ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src);
++ else if (stride_dst == pel_bytes) // contiguous destination takes priority, as before
++ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src);
++ else ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst); // only the source is contiguous
++}
++
++#endif /* HAVE_NEON_INLINE */
++
++#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_mv_arm.h
+@@ -0,0 +1,93 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Written by John Cox, Ben Avison
++*/
++
++#ifndef AVCODEC_ARM_RPI_HEVC_MV_H
++#define AVCODEC_ARM_RPI_HEVC_MV_H
++
++#if HAVE_ARMV6T2_INLINE
++static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) // adds a and b as two independent 16-bit lanes
++{
++ MvXY r; // NOTE(review): presumably MvXY packs x and y into the two 16-bit halves - type is declared elsewhere, confirm
++ __asm__ (
++ "sadd16 %[r], %[a], %[b] \n\t" // SADD16: low and high halfwords are summed separately, no carry between them
++ : [r]"=r"(r)
++ : [a]"r"(a),
++ [b]"r"(b)
++ :
++ );
++ return r;
++}
++#define mvxy_add mvxy_add_arm
++#endif
++
++#if HAVE_ARMV6T2_INLINE
++#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV))
++static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) // scales both 16-bit halves of xy by a factor derived from tb / td
++{
++ int t; // scratch: first the divider term, later the scaled high half
++ __asm__ (
++ "ssat %[td], #8, %[td] \n\t" // saturate td to signed 8 bits
++ "ssat %[tb], #8, %[tb] \n\t" // saturate tb to signed 8 bits
++ "eor %[t], %[td], %[td], asr #31 \n\t" // t = td ^ sign(td)
++ "adds %[t], %[t], %[td], lsr #31 \n\t" // t = abs(td); Z flag is set when td == 0
++ "asr %[t], #1 \n\t" // t = abs(td) >> 1
++ "add %[t], #0x4000 \n\t" // t = 0x4000 + (abs(td) >> 1)
++ "it ne \n\t" // flags still come from the adds above
++ "sdivne %[t], %[t], %[td] \n\t" // t /= td, skipped when td == 0
++ "mov %[td], #32 \n\t" // rounding term for the >> 6 below
++ "smlabb %[td], %[t], %[tb], %[td] \n\t" // td = t * tb + 32
++ "ssat %[td], #13, %[td], asr #6 \n\t" // td = scale factor, (td >> 6) saturated to 13 bits
++ "mov %[tb], #127 \n\t" // rounding term for the >> 8 below
++ "smlatb %[t], %[xy], %[td], %[tb] \n\t" // t = high half of xy * scale + 127
++ "smlabb %[tb], %[xy], %[td], %[tb] \n\t" // tb = low half of xy * scale + 127
++// This takes the sign of x & y for rounding at the "wrong" point
++// (i.e. after adding 127) but for the range of values (-1,-127)
++// where it does the wrong thing you get the right answer (0) anyway
++ "add %[t], %[t], %[t], lsr #31 \n\t" // add 1 when negative
++ "add %[xy], %[tb], %[tb], lsr #31 \n\t"
++ "ssat %[t], #16, %[t], asr #8 \n\t" // (value >> 8) saturated to 16 bits
++ "ssat %[xy], #16, %[xy], asr #8 \n\t"
++ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" // repack: low half from xy, high half from t
++ :
++ [t]"=&r"(t),
++ [xy]"+r"(xy),
++ [td]"+r"(td),
++ [tb]"+r"(tb)
++ :
++ :
++ "cc"
++ );
++ return xy;
++}
++#define mv_scale_xy mv_scale_xy_arm
++#endif
++#endif
++
++#endif // AVCODEC_ARM_RPI_HEVC_MV_H
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_arm.h
+@@ -0,0 +1,26 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_RPI_HEVCDSP_ARM_H
++#define AVCODEC_ARM_RPI_HEVCDSP_ARM_H
++/* Guard carries the RPI_ prefix like the other rpi_* headers, so it cannot clash with a non-rpi hevcdsp_arm.h guard */
++#include "libavcodec/rpi_hevcdsp.h"
++/* NOTE(review): presumably installs the NEON routines into c for the given bit depth - definition is not in this header */
++void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth);
++
++#endif /* AVCODEC_ARM_RPI_HEVCDSP_ARM_H */
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
+@@ -0,0 +1,1634 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++@ 8-bit chroma edge filter on one register each of P1/P0/Q0/Q1. Reads tc from r2, writes q0-q2 and updates P0a/Q0a in place. I1-I8 are caller-supplied instructions interleaved into the sequence.
++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
++ vsubl.u8 q0, \Q0a, \P0a @ q0 - p0, widened to 16 bits
++ vsubl.u8 q1, \P1a, \Q1a @ p1 - q1, widened to 16 bits
++ vdup.16 d4, r2 @ replicate the low 16 bits of r2 (two tc bytes) across d4
++ \I1
++ vshl.i16 q0, #2 @ (q0 - p0) * 4
++ \I2
++ vadd.i16 q0, q1 @ ((q0 - p0) * 4) + p1 - q1
++ \I3
++ vmovl.u8 q2, d4 @ widen the tc bytes to 16 bits
++ \I4
++ vneg.s16 q1, q2 @ -tc
++ \I5
++ vrshr.s16 q0, #3 @ (((q0 - p0) * 4) + p1 - q1 + 4) >> 3
++ \I6
++ \I7
++ \I8
++ vmin.s16 q0, q2 @ clamp to at most tc
++ vmovl.u8 q2, \Q0a @ widen q0 (tc is no longer needed)
++ vmax.s16 q0, q1 @ clamp to at least -tc: delta0
++ vaddw.u8 q1, q0, \P0a @ p0 + delta0
++ vsub.i16 q0, q2, q0 @ q0 - delta0
++ vqmovun.s16 \P0a, q1 @ saturate back to unsigned 8 bits
++ vqmovun.s16 \Q0a, q0
++.endm
++
++@ 8-bit chroma edge filter on two register sets (a and b). Reads the four tc bytes from r2, writes r12 and q0-q3, updates P0a/P0b/Q0a/Q0b in place. I1-I7 are caller-supplied instructions interleaved into the sequence.
++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
++ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a
++ lsr r12, r2, #16 @ r12 = upper 16 bits of r2 (tc bytes for the b set)
++ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b
++ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a
++ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b
++ vshl.i16 q0, #2 @ (q0a - p0a) * 4
++ vshl.i16 q1, #2 @ (q0b - p0b) * 4
++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
++ vdup.16 d4, r2 @ tc0a, tc0b
++ vdup.16 d6, r12 @ tc1a, tc1b
++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++ \I1
++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++ \I2
++ vmovl.u8 q2, d4 @ tc0a, tc0b
++ \I3
++ vmovl.u8 q3, d6 @ tc1a, tc1b
++ \I4
++ vmin.s16 q0, q2 @ clamp a to at most tc
++ \I5
++ vneg.s16 q2, q2 @ -tc0a, -tc0b
++ \I6
++ vmin.s16 q1, q3 @ clamp b to at most tc
++ \I7
++ vneg.s16 q3, q3 @ -tc1a, -tc1b
++ vmax.s16 q0, q2 @ delta0a
++ vmovl.u8 q2, \Q0a @ widen q0a
++ vmax.s16 q1, q3 @ delta0b
++ vaddw.u8 q3, q0, \P0a @ p0a + delta0a
++ vsub.i16 q0, q2, q0 @ q0a - delta0a
++ vmovl.u8 q2, \Q0b @ widen q0b
++ vsub.i16 q2, q1 @ q0b - delta0b
++ vaddw.u8 q1, \P0b @ p0b + delta0b
++ vqmovun.s16 \Q0a, q0 @ saturate all four results back to unsigned 8 bits
++ vqmovun.s16 \P0a, q3
++ vqmovun.s16 \Q0b, q2
++ vqmovun.s16 \P0b, q1
++.endm
++
++
++@ Preserves r12
++@ Clobbers r2
++@ P0a et al all contain UVUVUVUV
++@ r2 (tc4) contains
++@ [0..7] tc U a
++@ [8..15] tc V a
++@ Chroma edge filter for samples already held as 16 bits; results are clipped to 0 .. (1 << bit_depth) - 1. Writes q0-q2. I1-I8 are caller-supplied instructions interleaved into the sequence.
++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
++ vsub.i16 q0, \Q0a, \P0a @ q0 - p0
++ vsub.i16 q1, \P1a, \Q1a @ p1 - q1
++ vdup.16 d4, r2 @ replicate the low 16 bits of r2 (two tc bytes) across d4
++ \I1
++ vshl.i16 q0, #2 @ (q0 - p0) * 4
++ \I2
++ vadd.i16 q0, q1 @ ((q0 - p0) * 4) + p1 - q1
++ \I3
++ vshll.u8 q2, d4, #\bit_depth - 8 @ widen tc and scale it up to the bit depth
++ \I4
++ vneg.s16 q1, q2 @ -tc
++ \I5
++ vrshr.s16 q0, #3 @ (((q0 - p0) * 4) + p1 - q1 + 4) >> 3
++ \I6
++ \I7
++ \I8
++ vmin.s16 q0, q2 @ clamp to at most tc
++ vmov.i16 q2, #0 @ lower clip bound for the samples
++ vmax.s16 q0, q1 @ clamp to at least -tc: delta0
++ vadd.i16 \P0a, q0 @ p0 + delta0
++ vsub.i16 \Q0a, q0 @ q0 - delta0
++ vmov.i16 q1, #(1 << \bit_depth) - 1 @ upper clip bound: largest sample value
++ vmax.s16 \P0a, q2 @ clip both results to the sample range
++ vmax.s16 \Q0a, q2
++ vmin.s16 \P0a, q1
++ vmin.s16 \Q0a, q1
++.endm
++
++@ Clobbers r2, r12
++@ P0a et al all contain UVUVUVUV
++@ r2 (tc4) contains
++@ [0..7] tc U a
++@ [8..15] tc V a
++@ [16..23] tc U b
++@ [24..31] tc V b
++@ Two-set (a and b) chroma edge filter for samples already held as 16 bits; results are clipped to 0 .. (1 << bit_depth) - 1. Writes q0-q3. I1-I7 are caller-supplied instructions interleaved into the sequence.
++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
++ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a
++ lsr r12, r2, #16 @ r12 = upper 16 bits of r2 (tc bytes for the b set)
++ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b
++ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a
++ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b
++ vshl.i16 q0, #2 @ (q0a - p0a) * 4
++ vshl.i16 q1, #2 @ (q0b - p0b) * 4
++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
++ vdup.16 d4, r2 @ tc0a, tc0b
++ vdup.16 d6, r12 @ tc1a, tc1b
++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++ \I1
++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++ \I2
++ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b
++ \I3
++ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b
++ \I4
++ vmin.s16 q0, q2 @ clamp a to at most tc
++ \I5
++ vneg.s16 q2, q2 @ -tc0a, -tc0b
++ \I6
++ vmin.s16 q1, q3 @ clamp b to at most tc
++ \I7
++ vneg.s16 q3, q3 @ -tc1a, -tc1b
++ vmax.s16 q0, q2 @ delta0a
++ vadd.i16 \P0a, q0 @ p0a + delta0a
++ vsub.i16 \Q0a, q0 @ q0a - delta0a
++ vmax.s16 q1, q3 @ delta0b
++ vadd.i16 \P0b, q1 @ p0b + delta0b
++ vsub.i16 \Q0b, q1 @ q0b - delta0b
++ vmov.i16 q2, #0 @ lower clip bound for the samples
++ vmov.i16 q3, #(1 << \bit_depth) - 1 @ upper clip bound: largest sample value
++ vmax.s16 \P0a, q2 @ clip all four results to the sample range
++ vmax.s16 \Q0a, q2
++ vmax.s16 \P0b, q2
++ vmax.s16 \Q0b, q2
++ vmin.s16 \P0a, q3
++ vmin.s16 \Q0a, q3
++ vmin.s16 \P0b, q3
++ vmin.s16 \Q0b, q3
++.endm
++
++
++
++@ uint8_t *_no_p, [sp+0]
++@ uint8_t *_no_q) [sp+4]
++@ Common entry for the pointer-argument luma filters: packs tc[0]/tc[1] into r3, returns at once if both are zero, otherwise saves r4-r10,lr and builds the no_p/no_q flag word in r10.
++.macro hevc_loop_filter_luma_start
++ ldr r12, [r3] @ r12 = tc[0]
++ ldr r3, [r3, #4] @ r3 = tc[1]
++ orrs r3, r12, r3, lsl #16 @ r3 = tc[0] | (tc[1] << 16); Z is set when the result is zero
++ it eq
++ bxeq lr @ nothing to filter: return before touching the stack
++ push {r4-r10,lr} @ 32 bytes
++ ldrd r4, r5, [sp, #32] @ &_no_p
++ ldrb r4, [r4] @ r4 = *_no_p
++ ldrb r5, [r5] @ r5 = *_no_q
++ movs r10, r4
++ it ne
++ movne r10, #1 @ r10 bit 0 = (*_no_p != 0)
++ cmp r5, #0
++ it ne
++ orrne r10, #2 @ r10 bit 1 = (*_no_q != 0)
++.endm
++
++@ Input:
++@ r2 beta (raw: needs shift for bitdepth > 8)
++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8)
++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8)
++@
++@ Input & output
++@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3)
++@ 16-bit: q8-q15
++@
++@ r1 -r1
++@ r10 b1->C, b0->N (r10 junk)
++@
++@ Junks:
++@ r5, r6, r7, r8, r9
++@ Shared luma deblock core: builds per-block masks for filtering (r7) and strong filtering (r8), applies the strong and/or normal filter to the 16-bit working values, then narrows (8-bit) or clips (deeper) them and returns with bx lr.
++.macro m_filter_luma bit_depth, Q11, Q15
++.if \bit_depth == 8
++ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2
++ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1
++ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0
++ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0
++ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1
++ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2
++.endif
++ vadd.i16 q0, q9, \Q11 @ P2 + P0
++.if \bit_depth > 8
++ lsl r3, r3, #(\bit_depth - 8) @ scale both packed tc values up to the bit depth
++.endif
++ vadd.i16 q1, q14, q12 @ Q2 + Q0
++.if \bit_depth > 8
++ lsl r2, r2, #(\bit_depth - 8) @ scale beta up to the bit depth
++.endif
++ vsub.i16 q0, q10 @ P2 - P1 + P0
++ lsr r5, r3, #16 @ r5 = tc[1]
++ vsub.i16 q1, q13 @ Q2 - Q1 + Q0
++.if \bit_depth == 8
++ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3
++ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3
++.endif
++ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0)
++ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0)
++ vmov.i64 q2, #0xffffffff0000 @ mask covering 16-bit lanes 1 and 2 of every group of four lines
++ vbic q0, q2 @ only dp0(') and dp3(')
++ vbic q1, q2 @ only dq0(') and dq3(')
++ vsra.u64 q0, #16 @ add a copy shifted down one lane so the line-3 value also sits in lane 2, ready for vmovn.i32
++ vsra.u64 q1, #16
++ vdup.16 q3, r2 @ beta
++ vdup.16 d14, r3 @ tC[0]
++ vdup.16 d15, r5 @ tC[1]
++ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0)
++ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0
++ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0
++ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0
++ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0)
++ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0)
++ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3
++ vshl.s16 q6, q7, #2 @ tC[] * 4
++ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1
++ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta)
++ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block)
++ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3
++ cmp r7, #0
++ beq .Lbypasswrite @ no block passes the beta test: leave through the shared exit that skips write-back
++
++ vcgt.s16 q5, q6, q5 @ if < tc25
++ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3)
++ vand q4, q5 @ both tests must pass
++ vbic d8, d4 @ keep only the line 0 and line 3 results (d4 is the lane 1/2 mask)
++ vbic d9, d4
++ vshr.s16 q3, #2 @ beta_2 = beta >> 2
++ vsra.u64 q4, #16 @ same lane shuffle as above, ready for vmovn.i32
++ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1
++ vshl.i16 q7, #1 @ tc2 = tC[] << 1
++ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc
++ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half
++ vand d6, d8 @ && beta_2 tests, prime in ms half
++ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3
++ vneg.s16 q6, q7 @ -tc2
++ vmovn.i32 d8, q3 @ even 16-bit lanes of the combined tests
++ vshrn.i32 d6, q3, #16 @ odd 16-bit lanes of the combined tests
++ vand d6, d8 @ a block is strong only if both of its tested lines pass
++ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3
++ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block)
++ vadd.i16 q0, \Q11, q12 @ p0 + q0
++ ands r9, r7, r8 @ r9 = blocks that are filtered and strong
++ beq 1f @ none: skip the strong filter
++
++ vadd.i16 q2, q0, q10 @ p1 + p0 + q0
++ vadd.i16 q3, q0, q13 @ p0 + q0 + q1
++ lsr r3, r9, #16 @ r3 = strong mask of the second block
++ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping)
++ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping)
++ vadd.i16 q0, q8, q9 @ p3 + p2
++ vadd.i16 q5, \Q15, q14 @ q2 + q3
++ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0
++ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2
++ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2
++ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3
++ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping)
++ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping)
++ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping)
++ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping)
++ vrshr.s16 q0, #3 @ scale, with rounding
++ vrshr.s16 q5, #3
++ vrshr.s16 q1, #2
++ vrshr.s16 q4, #2
++ vrshr.s16 q2, #3
++ vrshr.s16 q3, #3
++ vsub.i16 q0, q9 @ find difference
++ vsub.i16 q5, q14
++ vsub.i16 q1, q10
++ vsub.i16 q4, q13
++ vsub.i16 q2, \Q11
++ vsub.i16 q3, q12
++ vmax.s16 q0, q6 @ clip difference to -tc2 .. tc2
++ vmax.s16 q5, q6
++ vmax.s16 q1, q6
++ vmax.s16 q4, q6
++ vmax.s16 q2, q6
++ vmax.s16 q3, q6
++ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure
++ vdup.16 d13, r3
++ vmin.s16 q0, q7 @ upper half of the clip (q7 = tc2)
++ vmin.s16 q5, q7
++ vmin.s16 q1, q7
++ vmin.s16 q4, q7
++ vmin.s16 q2, q7
++ vmin.s16 q3, q7
++ vadd.i16 q0, q9 @ apply difference
++ vadd.i16 q5, q14
++ vadd.i16 q1, q10
++ vadd.i16 q4, q13
++ vadd.i16 q2, \Q11
++ vadd.i16 q3, q12
++ vbit q9, q0, q6 @ apply filtered values according to mask
++ vbit q14, q5, q6
++ vbit q10, q1, q6
++ vbit q13, q4, q6
++ vbit \Q11, q2, q6
++ vbit q12, q3, q6
++ vneg.s16 q6, q7 @ restore -tc2
++
++1:
++ bics r9, r7, r8 @ r9 = blocks that are filtered but not strong
++ beq 2f @ none: skip the normal filter
++
++ vsub.i16 q0, q12, \Q11 @ q0 - p0
++ vsub.i16 q1, q13, q10 @ q1 - p1
++ lsr r3, r9, #16 @ r3 = normal-filter mask of the second block
++ vshl.i16 q2, q0, #3 @ 8 * (q0 - p0)
++ lsr r7, r5, #16 @ r7 = upper half of r5 (second block dp sum)
++ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0)
++ lsr r8, r6, #16 @ r8 = upper half of r6 (second block dq sum)
++ vshl.i16 q2, q1, #1 @ 2 * (q1 - p1)
++ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1)
++ vshr.s16 q6, #1 @ -tc = -tc2 >> 1
++ vsub.i16 q5, q3, q4 @ 9 * (q0 - p0) - 3 * (q1 - p1)
++ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1
++ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1
++ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
++ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1
++ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1
++ vmax.s16 q6, q5 @ max(-tc, delta0)
++ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1
++ vdup.16 q0, r2 @ beta
++ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc]
++ vshr.s16 q4, #1 @ tc_2 = tc >> 1
++ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
++ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
++ vshr.s16 q2, q0, #1 @ beta >> 1
++ vadd.i16 q2, q0 @ beta + (beta >> 1)
++ vneg.s16 q0, q4 @ -tc_2
++ vabs.s16 q5, q5 @ abs(original delta0)
++ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3
++ vmax.s16 q1, q0 @ lower clip at -tc_2
++ vmax.s16 q3, q0
++ vshl.s16 q0, q7, #2 @ 8 * tc
++ vadd.i16 q7, q0 @ 10 * tc
++ vdup.16 d0, r9
++ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering
++ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
++ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2)
++ vdup.16 d8, r5 @ dp0 + dp3
++ vdup.16 d9, r7 @ dp0' + dp3'
++ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0))
++ vdup.16 d10, r6 @ dq0 + dq3
++ vdup.16 d11, r8 @ dq0' + dq3'
++ vand q7, q0 @ AND block and line masks
++ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1)
++ vadd.i16 q0, q1, q10 @ p1 + deltap1
++ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1)
++ vadd.i16 q3, q3, q13 @ q1 + deltaq1
++ vadd.i16 q1, \Q11, q6 @ p0 + delta0
++ vsub.i16 q2, q12, q6 @ q0 - delta0
++ vand q4, q7 @ AND nd_p test with block/line masks
++ vand q5, q7 @ AND nd_q test with block/line masks
++ vbit q10, q0, q4 @ insert the new p1/p0/q0/q1 only where the masks are set
++ vbit \Q11, q1, q7
++ vbit q12, q2, q7
++ vbit q13, q3, q5
++
++2:
++.if \bit_depth == 8
++ vmovn.i16 d16, q8 @ p3 and q3 are never modified, so a plain narrow suffices
++ vmovn.i16 d23, \Q15
++ neg r1, r1 @ hand the stride back negated
++ vqmovun.s16 d17, q9 @ saturate the filtered values back to unsigned 8 bits
++ vqmovun.s16 d18, q10
++ vqmovun.s16 d19, \Q11
++ lsls r10, #31 @ N = bit 0 of r10, C = bit 1 of r10, for the write-back code that follows the call
++ vqmovun.s16 d20, q12
++ vqmovun.s16 d21, q13
++ vqmovun.s16 d22, q14
++.else
++ vmov.i16 q0, #0 @ lower clip bound
++ vmov.i16 q1, #(1 << \bit_depth - 1) @ NOTE(review): relies on gas giving << higher precedence than -, i.e. (1 << bit_depth) - 1 as the uv macros spell it - confirm
++ @ q8 & q15 should be unaltered and so don't require clipping
++ neg r1, r1 @ hand the stride back negated
++ vmax.s16 q9, q0
++ vmax.s16 q10, q0
++ vmax.s16 q11, q0 @ NOTE(review): hard-coded q11 assumes deeper-than-8-bit instantiations pass Q11=q11 - confirm
++ vmax.s16 q12, q0
++ vmax.s16 q13, q0
++ vmax.s16 q14, q0
++ lsls r10, #31 @ N = bit 0 of r10, C = bit 1 of r10, for the write-back code that follows the call
++ vmin.s16 q9, q1
++ vmin.s16 q10, q1
++ vmin.s16 q11, q1
++ vmin.s16 q12, q1
++ vmin.s16 q13, q1
++ vmin.s16 q14, q1
++.endif
++ bx lr
++.endm
++@ 8-bit instance of m_filter_luma. The arguments bind Q11 to q15 and Q15 to q11, so the widened P0 goes to q15 and the widened Q3 to q11.
++function hevc_loop_filter_luma_body
++ m_filter_luma 8, q15, q11
++endfunc
++
++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
++@ uint8_t *_pix, [r0]
++@ ptrdiff_t _stride, [r1]
++@ int _beta, [r2]
++@ int *_tc, [r3]
++@ uint8_t *_no_p, [sp+0]
++@ uint8_t *_no_q) [sp+4]
++
++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
++ hevc_loop_filter_luma_start
++@ The start macro has already returned if both tc values were zero; r10 now holds the no_p/no_q bits.
++ sub r4, r0, #4 @ r4 = pix - 4: the four bytes before pix on each row
++ b .Lv_loop_luma_common @ shared body lives in ff_hevc_rpi_v_loop_filter_luma2_neon_8
++endfunc
++
++@ void ff_hevc_rpi_v_loop_filter_luma2_neon_8(
++@ uint8_t * pix_r, [r0]
++@ ptrdiff_t _stride, [r1]
++@ int _beta, [r2]
++@ int tc2, [r3]
++@ int no_f, [sp+0]
++@ uint8_t * pix_l) [sp+4]
++
++function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1
++ cmp r3, #0 @ r3 = tc2, passed by value here
++ it eq
++ bxeq lr @ zero tc2: nothing to filter
++ push {r4-r10,lr} @ 32 bytes
++ ldr r4, [sp, #36] @ r4 = pix_l (stack argument at +4 before the push)
++ ldr r10, [sp, #32] @ r10 = no_f (stack argument at +0 before the push)
++
++.Lv_loop_luma_common:
++ vpush {d8-d15} @ the filter body overwrites q4-q7
++
++ @ It's slightly faster to do unlaned loads and transpose in the
++ @ 8-bit case, even though it needs more instructions, because
++ @ VLD4.8 is a really slow way to read from memory.
++ vld1.32 {d16[0]}, [r4:32], r1 @ 4 bytes per row from r4 into d16-d19 ...
++ vld1.32 {d20[0]}, [r0:32], r1 @ ... and 4 bytes per row from r0 into d20-d23
++ vld1.32 {d16[1]}, [r4:32], r1
++ vld1.32 {d20[1]}, [r0:32], r1
++ vld1.32 {d17[0]}, [r4:32], r1
++ vld1.32 {d21[0]}, [r0:32], r1
++ vld1.32 {d17[1]}, [r4:32], r1
++ vld1.32 {d21[1]}, [r0:32], r1
++ vld1.32 {d18[0]}, [r4:32], r1
++ vld1.32 {d22[0]}, [r0:32], r1
++ vld1.32 {d18[1]}, [r4:32], r1
++ vld1.32 {d22[1]}, [r0:32], r1
++ vld1.32 {d19[0]}, [r4:32], r1
++ vld1.32 {d23[0]}, [r0:32], r1
++ vld1.32 {d19[1]}, [r4:32] @ eighth row: r4 and r0 are left pointing at it
++ vld1.32 {d23[1]}, [r0:32]
++ vuzp.16 q8, q9 @ transpose: afterwards each d register holds one pixel column
++ vuzp.16 q10, q11
++ vuzp.8 q8, q9
++ vuzp.8 q10, q11
++ vswp d17, d18
++ vswp d21, d22
++
++ bl hevc_loop_filter_luma_body
++
++ add r6, r4, r1 @ the body negated r1, so r6 is the row above r4
++ add r2, r0, r1 @ likewise r2 is the row above r0
++ lsl r1, #1 @ each pointer now steps back two rows per store
++
++ vpop {d8-d15}
++
++ @ no_p[1]
++ bmi 1f @ N (bit 0 of r10) set: skip the r4-side write-back
++ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1
++ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1
++
++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1
++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32]
++1:
++ @ no_q[1]
++ bcs 1f @ C (bit 1 of r10) set: skip the r0-side write-back
++ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1
++ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1
++
++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
++1:
++ pop {r4-r10,pc}
++
++.Lbypasswrite: @ early exit branched to from m_filter_luma when no block needs filtering
++ vpop {d8-d15}
++ pop {r4-r10,pc}
++endfunc
++@ Vertical-edge luma filter body for 16-bit samples: loads 8 rows of 4 samples from each of r4 and r0, calls the bit-depth specific filter body, then writes the rows back unless the r10 flag bits forbid it.
++.macro m_filter_v_luma_16 bit_depth
++ vpush {d8-d15} @ the filter body overwrites q4-q7
++
++ @ Uses slightly fewer instructions to do laned loads than unlaned
++ @ and transpose. This also means that we can use the same code for
++ @ both split & unsplit deblock
++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 @ row 0: four samples from r4 de-interleaved into q8-q11
++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 @ row 0: four samples from r0 de-interleaved into q12-q15
++
++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
++
++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
++
++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
++
++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
++
++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
++
++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
++
++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] @ row 7: r4 and r0 are left pointing at it
++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
++
++ bl hevc_loop_filter_luma_body_\bit_depth
++
++ add r6, r4, r1 @ NOTE(review): assumes the called body negates r1 as m_filter_luma does, making r6 the row above r4 - confirm
++ add r2, r0, r1
++ lsl r1, #1 @ each pointer now steps two rows per store
++
++ vpop {d8-d15}
++
++ @ p[1]
++ bmi 1f @ N set: skip the r4-side write-back
++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6]
++1:
++ @ q[1]
++ bcs 1f @ C set: skip the r0-side write-back
++ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2]
++1:
++ pop {r4-r10,pc} @ matches the push {r4-r10,lr} done by the caller of this macro
++.endm
++
++
++
++
++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0]
++@ ptrdiff_t stride, [r1]
++@ int beta, [r2]
++@ int32_t *tc, [r3]
++@ uint8_t *no_p, sp[0]
++@ uint8_t *no_q); sp[4]
++@
++@ Src should always be on an 8 byte boundary & all in the same slice
++
++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
++ hevc_loop_filter_luma_start
++ b .Lh_loop_filter_luma_common_8 @ continue in the body shared with ff_hevc_rpi_h_loop_filter_luma2_neon_8
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1
++ cmp r3, #0 @ r3 == 0: nothing to filter, return at once
++ it eq
++ bxeq lr
++ push {r4-r10,lr} @ 32 bytes
++ ldr r10, [sp, #32] @ first stack argument (sits just above the 32 bytes pushed)
++
++.Lh_loop_filter_luma_common_8:
++ sub r4, r0, r1, lsl #2 @ r4 = r0 - 4 * stride (row -4)
++ add r0, r4, r1 @ r0 = row -3
++ lsl r1, #1 @ r1 = 2 * stride: r4 walks the even rows, r0 the odd rows
++ vpush {d8-d15}
++
++ vld1.8 {d16}, [r4], r1 @ d16..d23 = rows -4..+3, 8 bytes each
++ vld1.8 {d17}, [r0], r1
++ vld1.8 {d18}, [r4], r1
++ vld1.8 {d19}, [r0], r1
++ vld1.8 {d20}, [r4], r1
++ vld1.8 {d21}, [r0], r1
++ vld1.8 {d22}, [r4] @ no post-increment: r4 stays on row +2
++ vld1.8 {d23}, [r0] @ no post-increment: r0 stays on row +3
++
++ bl hevc_loop_filter_luma_body @ NOTE(review): expected to leave C/N and r1 set up for the stores below - confirm in the body
++
++ add r0, r0, r1, lsl #1
++ add r2, r4, r1, lsl #1
++ add r6, r4, r1, asr #1
++ vpop {d8-d15}
++
++ @ Q2-Q0 (d22..d20 were loaded from rows +2..0); skipped when C is set
++ bcs 1f
++ vst1.8 {d22}, [r4], r1
++ vst1.8 {d21}, [r6]
++ vst1.8 {d20}, [r4]
++1:
++ @ P0-P2 (d19..d17 were loaded from rows -1..-3); skipped when N is set
++ bmi 1f
++ vst1.8 {d19}, [r0], r1
++ vst1.8 {d18}, [r2]
++ vst1.8 {d17}, [r0]
++1:
++ pop {r4-r10,pc}
++endfunc
++
++
++.macro m_filter_h_luma_16 bit_depth
++ sub r4, r0, r1, lsl #2 @ r4 = r0 - 4 * stride (row -4)
++ add r0, r4, r1 @ r0 = row -3
++ lsl r1, #1 @ r1 = 2 * stride: r4 walks the even rows, r0 the odd rows
++ vpush {d8-d15}
++
++ vld1.16 { q8}, [r4], r1 @ q8..q15 = rows -4..+3, 16 bytes each
++ vld1.16 { q9}, [r0], r1
++ vld1.16 {q10}, [r4], r1
++ vld1.16 {q11}, [r0], r1
++ vld1.16 {q12}, [r4], r1
++ vld1.16 {q13}, [r0], r1
++ vld1.16 {q14}, [r4] @ no post-increment: r4 stays on row +2
++ vld1.16 {q15}, [r0] @ no post-increment: r0 stays on row +3
++
++ bl hevc_loop_filter_luma_body_\bit_depth
++
++ add r0, r0, r1, lsl #1
++ add r2, r4, r1, lsl #1
++ add r6, r4, r1, asr #1
++ vpop {d8-d15}
++
++ @ Q2-Q0 (q14..q12 were loaded from rows +2..0); skipped when C is set
++ bcs 1f
++ vst1.16 {q14}, [r4], r1
++ vst1.16 {q13}, [r6]
++ vst1.16 {q12}, [r4]
++1:
++ bmi 1f @ P0-P2 (q11..q9, rows -1..-3) are skipped when N is set
++ vst1.16 {q11}, [r0], r1
++ vst1.16 {q10}, [r2]
++ vst1.16 { q9}, [r0]
++1:
++ pop {r4-r10,pc}
++.endm
++
++
++@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0
++@ unsigned int stride, // r1
++@ uint32_t tc4, // r2
++@ unsigned int no_f); // r3
++@
++@ no_f
++@ 0 tl P0
++@ 1 tr P1
++@ 2 bl Q0
++@ 3 br Q1
++@
++@ Probably not worth having the P/Qa only special case in this direction
++@ Given layout we won't save any memory reads or avoid any cache dirtying
++@ We would save a bit of computation but I expect the partials to be less
++@ common in the H direction than V due to how we arrange deblock.
++
++function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1
++ sub r12, r0, r1 @ r12 = src - stride
++ cmp r2, #0 @ tc4 == 0: return without touching memory
++ it eq
++ bxeq lr
++ vld1.8 {d26,d27}, [r0] @ row at src
++ lsl r1, #1 @ r1 = 2 * stride
++ sub r0, r1 @ r0 = src - 2 * stride
++ vld1.8 {d18,d19}, [r12], r1 @ row at src - stride, then r12 = src + stride
++ vld1.8 {d16,d17}, [r0], r1 @ row at src - 2 * stride, then r0 = src again
++ vld1.8 {d28,d29}, [r12] @ row at src + stride
++
++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \
++ "sub r12, r0, r1, asr #1"
++
++ lsls r3, #29 @ b2 -> N, b3 -> C
++ it pl
++ vstrpl d26, [r0, #0] @ written only when no_f bit 2 is clear
++ it cc
++ vstrcc d27, [r0, #8] @ written only when no_f bit 3 is clear
++ lsls r3, #2 @ b0 -> N, b1 -> C
++ it pl
++ vstrpl d18, [r12, #0] @ written only when no_f bit 0 is clear
++ it cc
++ vstrcc d19, [r12, #8] @ written only when no_f bit 1 is clear
++ bx lr
++
++endfunc
++
++
++@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0
++@ unsigned int stride, // r1
++@ uint32_t tc4, // r2
++@ unsigned int no_f); // r3
++@
++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
++@
++@ Macro here actual function near bottom
++
++.macro m_filter_h_uv_16 bit_depth
++ sub r12, r0, r1
++ cmp r2, #0
++ it eq
++ bxeq lr
++ vld1.16 {q12, q13}, [r0]
++ lsl r1, #1
++ sub r0, r1
++ vld1.16 {q10, q11}, [r12], r1
++ vld1.16 {q8, q9 }, [r0], r1
++ vld1.16 {q14, q15}, [r12]
++
++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \
++ "sub r12, r0, r1, asr #1", \
++ "cmp r3, #0"
++
++ bne 1f
++ vst1.16 {q10, q11}, [r12]
++ vst1.16 {q12, q13}, [r0]
++ bx lr
++
++ @ At least one no_f bit is set
++ @ Which means we need to break this apart in an ugly fashion
++1:
++ lsls r3, #29 @ b2 -> N, b3 -> C
++ itt pl
++ vstrpl d24, [r0, #0]
++ vstrpl d25, [r0, #8]
++ itt cc
++ vstrcc d26, [r0, #16]
++ vstrcc d27, [r0, #24]
++ lsls r3, #2 @ b0 -> N, b1 -> C
++ itt pl
++ vstrpl d20, [r12, #0]
++ vstrpl d21, [r12, #8]
++ itt cc
++ vstrcc d22, [r12, #16]
++ vstrcc d23, [r12, #24]
++ bx lr
++.endm
++
++
++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
++@ unsigned int stride, // r1
++@ uint32_t tc4, // r2
++@ uint8_t * src_l, // r3
++@ unsigned int no_f); // sp[0]
++@
++@ no_f:
++@ 0 tl P0a
++@ 1 tr Q0a
++@ 2 bl P0b
++@ 3 br Q0b
++
++function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1
++ cmp r2, #0
++ it eq
++ bxeq lr
++ push {lr}
++ vld2.16 {d16[0], d18[0]}, [r3], r1
++ vld2.16 {d20[0], d22[0]}, [r0], r1
++
++ cmp r2, #0x10000
++ vld2.16 {d16[1], d18[1]}, [r3], r1
++ vld2.16 {d20[1], d22[1]}, [r0], r1
++
++ vld2.16 {d16[2], d18[2]}, [r3], r1
++ vld2.16 {d20[2], d22[2]}, [r0], r1
++
++ vld2.16 {d16[3], d18[3]}, [r3], r1
++ vld2.16 {d20[3], d22[3]}, [r0], r1
++ blo 10f
++
++ vld2.16 {d17[0], d19[0]}, [r3], r1
++ vld2.16 {d21[0], d23[0]}, [r0], r1
++
++ sub ip, r0, r3
++ vld2.16 {d17[1], d19[1]}, [r3], r1
++ vld2.16 {d21[1], d23[1]}, [r0], r1
++
++ cmp ip, #4
++ vld2.16 {d17[2], d19[2]}, [r3], r1
++ vld2.16 {d21[2], d23[2]}, [r0], r1
++
++ vld2.16 {d17[3], d19[3]}, [r3]
++ vld2.16 {d21[3], d23[3]}, [r0]
++
++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \
++ "ldr lr, [sp, #4]", \
++ "neg r1, r1", \
++ "it eq; cmpeq lr, #0", \
++ "add r3, #2", \
++ "add ip, r3, r1", \
++ "add r2, r0, r1", \
++ "lsl r1, #1"
++
++ bne 1f
++
++@ Much/most of the time r0 == r3 + 4 and no_f == 0
++@ so it is worth having this special case
++ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b
++ vst2.16 {d19[2], d21[2]}, [ip], r1
++ vst2.16 {d19[1], d21[1]}, [r3], r1
++ vst2.16 {d19[0], d21[0]}, [ip], r1
++ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a
++ vst2.16 {d18[2], d20[2]}, [ip], r1
++ vst2.16 {d18[1], d20[1]}, [r3]
++ vst2.16 {d18[0], d20[0]}, [ip]
++ pop {pc}
++
++@ Either split or partial
++1:
++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++ ittt cs
++ addcs r0, r0, r1, lsl #1
++ addcs r2, r2, r1, lsl #1
++ bcs 1f
++ @ Q0b
++ vst1.16 {d21[3]}, [r0], r1
++ vst1.16 {d21[2]}, [r2], r1
++ vst1.16 {d21[1]}, [r0], r1
++ vst1.16 {d21[0]}, [r2], r1
++1:
++ ittt mi
++ addmi r3, r3, r1, lsl #1
++ addmi ip, ip, r1, lsl #1
++ bmi 1f
++ @ P0b
++ vst1.16 {d19[3]}, [r3], r1
++ vst1.16 {d19[2]}, [ip], r1
++ vst1.16 {d19[1]}, [r3], r1
++ vst1.16 {d19[0]}, [ip], r1
++1:
++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++ bcs 1f
++ @ Q0a
++ vst1.16 {d20[3]}, [r0], r1
++ vst1.16 {d20[2]}, [r2], r1
++ vst1.16 {d20[1]}, [r0]
++ vst1.16 {d20[0]}, [r2]
++1:
++ it mi
++ popmi {pc}
++ @ P0a
++ vst1.16 {d18[3]}, [r3], r1
++ vst1.16 {d18[2]}, [ip], r1
++ vst1.16 {d18[1]}, [r3]
++ vst1.16 {d18[0]}, [ip]
++ pop {pc}
++
++@ Single lump (rather than double)
++10:
++ @ As we have post inced r0/r3 in the load the easiest thing to do is
++ @ to subtract and write forwards, rather than backwards (as above)
++ @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \
++ "ldr lr, [sp, #4]", \
++ "add r3, #2", \
++ "sub r0, r0, r1, lsl #2", \
++ "sub r3, r3, r1, lsl #2", \
++ "lsls lr, #31", \
++ "add r2, r0, r1", \
++ "add ip, r3, r1", \
++ "lsl r1, #1"
++
++ bcs 3f
++ @ Q0a
++ vst1.16 {d20[0]}, [r0], r1
++ vst1.16 {d20[1]}, [r2], r1
++ vst1.16 {d20[2]}, [r0]
++ vst1.16 {d20[3]}, [r2]
++3:
++ it mi
++ popmi {pc}
++ @ P0a
++ vst1.16 {d18[0]}, [r3], r1
++ vst1.16 {d18[1]}, [ip], r1
++ vst1.16 {d18[2]}, [r3]
++ vst1.16 {d18[3]}, [ip]
++ pop {pc}
++
++endfunc
++
++
++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
++@ unsigned int stride, // r1
++@ uint32_t tc4, // r2
++@ uint8_t * src_l, // r3
++@ unsigned int no_f); // sp[0]
++@
++
++@ no_f
++@ 0 tl P0a
++@ 1 tr Q0a
++@ 2 bl P0b
++@ 3 br Q0b
++
++@ P1: q8, q12
++@ P0: q9, q13
++@ Q0: q10, q14
++@ Q1: q11, q15
++
++.macro m_filter_v_uv2_16 bit_depth
++ cmp r2, #0
++ it eq
++ bxeq lr
++ push {lr}
++ vld2.32 {d16[0], d18[0]}, [r3], r1
++ vld2.32 {d20[0], d22[0]}, [r0], r1
++
++ cmp r2, #0x10000
++ vld2.32 {d16[1], d18[1]}, [r3], r1
++ vld2.32 {d20[1], d22[1]}, [r0], r1
++
++ vld2.32 {d17[0], d19[0]}, [r3], r1
++ vld2.32 {d21[0], d23[0]}, [r0], r1
++
++ vld2.32 {d17[1], d19[1]}, [r3], r1
++ vld2.32 {d21[1], d23[1]}, [r0], r1
++ blo 10f
++
++ vld2.32 {d24[0], d26[0]}, [r3], r1
++ vld2.32 {d28[0], d30[0]}, [r0], r1
++
++ sub ip, r0, r3
++ vld2.32 {d24[1], d26[1]}, [r3], r1
++ vld2.32 {d28[1], d30[1]}, [r0], r1
++
++ cmp ip, #8
++ vld2.32 {d25[0], d27[0]}, [r3], r1
++ vld2.32 {d29[0], d31[0]}, [r0], r1
++
++ vld2.32 {d25[1], d27[1]}, [r3]
++ vld2.32 {d29[1], d31[1]}, [r0]
++
++ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \
++ "ldr lr, [sp, #4]", \
++ "neg r1, r1", \
++ "it eq; cmpeq lr, #0", \
++ "add r3, #4", \
++ "add ip, r3, r1", \
++ "add r2, r0, r1", \
++ "lsl r1, #1"
++
++ bne 1f
++
++@ Much/most of the time r0 == r3 + 8 and no_f == 0
++@ so it is worth having this special case
++ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b
++ vst2.32 {d27[0], d29[0]}, [ip], r1
++ vst2.32 {d26[1], d28[1]}, [r3], r1
++ vst2.32 {d26[0], d28[0]}, [ip], r1
++ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a
++ vst2.32 {d19[0], d21[0]}, [ip], r1
++ vst2.32 {d18[1], d20[1]}, [r3]
++ vst2.32 {d18[0], d20[0]}, [ip]
++ pop {pc}
++
++@ Either split or partial
++1:
++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++ ittt cs
++ addcs r0, r0, r1, lsl #1
++ addcs r2, r2, r1, lsl #1
++ bcs 1f
++ @ Q0b
++ vst1.32 {d29[1]}, [r0], r1
++ vst1.32 {d29[0]}, [r2], r1
++ vst1.32 {d28[1]}, [r0], r1
++ vst1.32 {d28[0]}, [r2], r1
++1:
++ ittt mi
++ addmi r3, r3, r1, lsl #1
++ addmi ip, ip, r1, lsl #1
++ bmi 1f
++ @ P0b
++ vst1.32 {d27[1]}, [r3], r1
++ vst1.32 {d27[0]}, [ip], r1
++ vst1.32 {d26[1]}, [r3], r1
++ vst1.32 {d26[0]}, [ip], r1
++1:
++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++ bcs 1f
++ @ Q0a
++ vst1.32 {d21[1]}, [r0], r1
++ vst1.32 {d21[0]}, [r2], r1
++ vst1.32 {d20[1]}, [r0]
++ vst1.32 {d20[0]}, [r2]
++1:
++ it mi
++ popmi {pc}
++ @ P0a
++ vst1.32 {d19[1]}, [r3], r1
++ vst1.32 {d19[0]}, [ip], r1
++ vst1.32 {d18[1]}, [r3]
++ vst1.32 {d18[0]}, [ip]
++ pop {pc}
++
++@ Single lump (rather than double)
++10:
++ @ As we have post inced r0/r3 in the load the easiest thing to do is
++ @ to subtract and write forwards, rather than backwards (as above)
++ @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \
++ "ldr lr, [sp, #4]", \
++ "add r3, #4", \
++ "sub r0, r0, r1, lsl #2", \
++ "sub r3, r3, r1, lsl #2", \
++ "lsls lr, #31", \
++ "add r2, r0, r1", \
++ "add ip, r3, r1", \
++ "lsl r1, #1"
++
++ bcs 3f
++ @ Q0a
++ vst1.32 {d20[0]}, [r0], r1
++ vst1.32 {d20[1]}, [r2], r1
++ vst1.32 {d21[0]}, [r0]
++ vst1.32 {d21[1]}, [r2]
++3:
++ it mi
++ popmi {pc}
++ @ P0a
++ vst1.32 {d18[0]}, [r3], r1
++ vst1.32 {d18[1]}, [ip], r1
++ vst1.32 {d19[0]}, [r3]
++ vst1.32 {d19[1]}, [ip]
++ pop {pc}
++.endm
++
++
++@ The NEON version is faster under ideal circumstances (i.e. everything in L1)
++@ But in real world testing it is ~20% slower, presumably due to code size
++
++#if 0 // NEON version
++
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ * int in_inc0, int in_inc1)
++ */
++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
++ mov ip, sp
++ push {a1-a3,v1-v8,lr}
++ ldm ip, {v1-v6}
++ cmp a1, #2
++ bls 2f
++ vpush {d8-d13}
++ sub v5, v5, #10
++ sub v6, v6, #10
++1:
++ vld2.32 {d0[0], d2[0]}, [a3]!
++ vld2.32 {d4[0], d6[0]}, [a4]!
++ vmov.u8 q12, #0
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb v8, [a3], #1
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[0]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[0]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d16[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d20[0]}, [ip]
++ vld1.32 {d18[0]}, [v8]
++ vld1.32 {d22[0]}, [lr]
++
++ vld2.32 {d0[1], d2[1]}, [a3]!
++ vld2.32 {d4[1], d6[1]}, [a4]!
++ ldrb a2, [a3], #1
++ vmov.u16 d12, #1
++ ldrb ip, [a4], #1
++ vmov.u16 d13, #2
++ ldrb v8, [a3], #1
++ vmov.u16 d27, #4
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[2]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[2]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d16[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d20[1]}, [ip]
++ vld1.32 {d18[1]}, [v8]
++ vld1.32 {d22[1]}, [lr]
++
++ vld2.32 {d1[0], d3[0]}, [a3]!
++ vld2.32 {d5[0], d7[0]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb lr, [a4], #1
++ ldrb v8, [a3], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[4]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[4]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d17[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d21[0]}, [ip]
++ vld1.32 {d19[0]}, [v8]
++ vld1.32 {d23[0]}, [lr]
++
++ vld2.32 {d1[1], d3[1]}, [a3]!
++ vld2.32 {d5[1], d7[1]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb v8, [a3], #1
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[6]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[6]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d17[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d21[1]}, [ip]
++ vld1.32 {d19[1]}, [v8]
++ vld1.32 {d23[1]}, [lr]
++
++ @ So now we have:
++ @ q0.32[i] = curr[i].mv[0]
++ @ q1.32[i] = curr[i].mv[1]
++ @ q2.32[i] = neigh[i].mv[0]
++ @ q3.32[i] = neigh[i].mv[1]
++ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]]
++ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]]
++ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++ @ d24.16[i] = curr[i].pred_flag
++ @ d25.16[i] = neigh[i].pred_flag
++
++ vtst.16 d28, d24, d12
++ vtst.16 d29, d24, d13
++ vadd.i16 d8, d24, d12
++ vadd.i16 d9, d25, d12
++ vtst.16 d30, d25, d12
++ vtst.16 d31, d25, d13
++ veor d26, d8, d9
++ ldr lr, [sp, 6*8 + 1*4]
++ vmovl.s16 q4, d28
++ vmovl.s16 q5, d29
++ teq lr, #1
++ vmovl.s16 q14, d30
++ it ne
++ lslne v1, lr, #1
++ vmovl.s16 q15, d31
++ it ne
++ rsbne v2, v1, #32
++ vbif q0, q1, q4
++ vbif q2, q3, q14
++ vbif q1, q0, q5
++ vbif q3, q2, q15
++ vabd.s16 q12, q0, q2
++ vabd.s16 q2, q1
++ vabd.s16 q0, q3
++ vabd.s16 q1, q3
++ vbif q8, q9, q4
++ vbif q10, q11, q14
++ vbif q9, q8, q5
++ vbif q11, q10, q15
++ vclt.u16 d6, d24, d27
++ vclt.u16 d8, d2, d27
++ vclt.u16 d7, d25, d27
++ vclt.u16 d9, d3, d27
++ vclt.u16 d2, d0, d27
++ vclt.u16 d0, d4, d27
++ vclt.u16 d3, d1, d27
++ vclt.u16 d1, d5, d27
++ vceq.i32 q12, q10, q8
++ vceq.i32 q10, q9
++ vceq.i32 q8, q11
++ vceq.i32 q9, q11
++ vshrn.i32 d6, q3, #8
++ vshrn.i32 d7, q4, #8
++ vshrn.i32 d8, q1, #8
++ vshrn.i32 d9, q0, #8
++ vmovn.i32 d4, q12
++ vmovn.i32 d2, q10
++ vmovn.i32 d3, q8
++ vmovn.i32 d5, q9
++ vand q2, q3
++ vrev16.8 q3, q3
++ vand q2, q3
++ vand q1, q4
++ vrev16.8 q4, q4
++ vand q1, q4
++ vand d4, d5
++ vand d2, d3
++ vbic d0, d12, d4
++ vshr.u16 d26, #2
++ vbic d0, d2
++ vmov.i16 d1, #0x5555
++ vorr d0, d26
++ bne 10f
++
++ @ Merge results into result word, no duplicates
++ vmov a2, s0
++ vmov v8, s1
++ vmov.u16 ip, d0[1]
++ vmov.u16 lr, d0[3]
++ lsl a2, #30
++ lsl v8, #30
++ lsl ip, #30
++ lsl lr, #30
++ orr a2, ip, a2, lsr #2
++ orr v8, lr, v8, lsr #2
++ orr a2, v8, a2, lsr #4
++ subs a1, #4
++ orr v7, a2, v7, lsr #8
++ bhi 1b
++
++ mov a1, #32
++ ldr a3, [sp, #6*8]
++ vpop {d8-d13}
++ sub a1, a1, a3, lsl #1
++ mov a1, v7, lsr a1
++ pop {a2-a4,v1-v8,pc}
++10:
++ @ Merge results into result word, with duplicates
++ vmul.i16 d0, d1
++ vmov a2, s0
++ vmov v8, s1
++ vmov.u16 ip, d0[1]
++ vmov.u16 lr, d0[3]
++ lsl a2, v2
++ subs a1, #4
++ lsl v8, v2
++ lsl ip, v2
++ lsl lr, v2
++ ldr v2, [sp, #6*8 + 12*4 + 1*4]
++T lsr a2, v1
++T orr a2, ip, a2
++A orr a2, ip, a2, lsr v1
++ lsl ip, v1, #1
++T lsr v8, v1
++T orr v8, lr, v8
++A orr v8, lr, v8, lsr v1
++ lsl lr, v1, #2
++T lsr a2, ip
++T orr a2, v8, a2
++A orr a2, v8, a2, lsr ip
++ ldr v1, [sp, #6*8 + 12*4]
++T lsr v7, lr
++T orr v7, a2, v7
++A orr v7, a2, v7, lsr lr
++ bhi 1b
++
++ mov a1, #32
++ ldrd a3, a4, [sp, #6*8]
++ vpop {d8-d13}
++ mls a1, a3, a4, a1
++ mls a1, a3, a4, a1
++ mov a1, v7, lsr a1
++ pop {a2-a4,v1-v8,pc}
++
++
++2:
++ sub v5, v5, #10
++ sub v6, v6, #10
++ vmov.u8 d16, #0
++ blo 3f
++ vld2.32 {d0[0], d1[0]}, [a3]!
++ vld2.32 {d2[0], d3[0]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb lr, [a4], #1
++ ldrb v8, [a3], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d16[0]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d16[4]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d4[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d5[0]}, [ip]
++ vld1.32 {d6[0]}, [v8]
++ vld1.32 {d7[0]}, [lr]
++
++3:
++ vld2.32 {d0[1], d1[1]}, [a3]!
++ vld2.32 {d2[1], d3[1]}, [a4]!
++ ldrb a2, [a3], #1
++ vmov.u16 d17, #1
++ ldrb ip, [a4], #1
++ vmov.u16 d18, #2
++ ldrb v8, [a3], #1
++ vmov.u16 d19, #4
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d16[2]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d16[6]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d4[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d5[1]}, [ip]
++ vld1.32 {d6[1]}, [v8]
++ vld1.32 {d7[1]}, [lr]
++
++ @ So now we have:
++ @ d0.32[i] = curr[i].mv[0]
++ @ d1.32[i] = curr[i].mv[1]
++ @ d2.32[i] = neigh[i].mv[0]
++ @ d3.32[i] = neigh[i].mv[1]
++ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]]
++ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]]
++ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++ @ d16.16[i] = curr[i].pred_flag
++ @ d16.16[2+i] = neigh[i].pred_flag
++
++ vtst.16 d20, d16, d17
++ vtst.16 d22, d16, d18
++ vadd.i16 d30, d16, d17
++ vswp d2, d3
++ ldr lr, [sp, #1*4]
++ vmovl.s16 q10, d20
++ teq lr, #1
++ vmovl.s16 q11, d22
++ it ne
++ lslne v1, lr, #1
++ vbif d0, d1, d20
++ vbif d4, d6, d20
++ vbif d3, d2, d21
++ vbif d5, d7, d21
++ vbif d1, d0, d22
++ vbif d6, d4, d22
++ vbif d2, d3, d23
++ vbif d7, d5, d23
++ vshr.u16 d30, #2
++ vabd.s16 d24, d0, d3
++ vabd.s16 d25, d1, d2
++ vabd.s16 q0, q0, q1
++ vceq.i32 d2, d4, d5
++ vceq.i32 d20, d5, d6
++ vceq.i32 d21, d4, d7
++ vceq.i32 d3, d6, d7
++ vclt.u16 d6, d24, d19
++ vclt.u16 d7, d25, d19
++ vclt.u16 d22, d1, d19
++ vclt.u16 d23, d0, d19
++ vshrn.i32 d6, q3, #8
++ vmovn.i32 d2, q1
++ vshrn.i32 d7, q11, #8
++ vmovn.i32 d3, q10
++ vand q0, q3, q1
++ it ne
++ rsbne v2, v1, #32
++ vrev16.8 q3, q3
++ vand q0, q3
++ vsra.u64 d30, #32
++ vshr.u64 q1, q0, #32
++ vand q0, q1
++ vbic d0, d17, d0
++ vand d30, d30, d17
++ vbic d0, d1
++ vmov.i16 d1, #0x5555
++ vorr d0, d30
++ bne 10f
++
++ @ Construct result word, no duplicates
++ cmp a1, #2
++ vmov.u16 a1, d0[1]
++ vmov.u16 a2, d0[0]
++ it eq
++ orreq a1, a2, a1, lsl #2
++ pop {a2-a4,v1-v8,pc}
++10:
++ @ Construct result word, with duplicates
++ cmp a1, #2
++ vmul.i16 d0, d1
++ vmov a2, s0
++ vmov.u16 a1, d0[1]
++ lsl a2, #16
++ pkhbt a1, a1, a1, lsl #16
++ lsr a2, v2
++ lsr a1, v2
++T itt eq
++T lsleq a1, v1
++T orreq a1, a2, a1
++A orreq a1, a2, a1, lsl v1
++ pop {a2-a4,v1-v8,pc}
++endfunc
++
++
++
++#else // non-NEON version
++
++
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ * int in_inc0, int in_inc1)
++ */
++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
++ add ip, sp, #4*4
++ push {a2-a4,v1-v8,lr}
++ mov v6, #32
++1: ldmdb ip, {v1-v4}
++ ldrsb v5, [a3, #8] @ curr->ref_idx
++ ldrsb v8, [a3, #9]
++ ldrsb ip, [a4, #8] @ neigh->ref_idx
++ ldrsb lr, [a4, #9]
++ ldr v1, [v1, v5, lsl #2]
++ ldrb v5, [a3, #10] @ curr->pred_flag
++ ldr v2, [v2, v8, lsl #2]
++ ldrb v8, [a4, #10] @ neigh->pred_flag
++ ldr v3, [v3, ip, lsl #2]
++ ldr v4, [v4, lr, lsl #2]
++ teq v5, #3
++ beq 20f
++ teq v8, #3
++ beq 90f
++
++ tst v5, #1
++ itee ne
++ ldrne v5, [a3, #0] @ curr->mv[0]
++ moveq v1, v2
++ ldreq v5, [a3, #4] @ curr->mv[1]
++ tst v8, #1
++ itee ne
++ ldrne v8, [a4, #0] @ neigh->mv[0]
++ moveq v3, v4
++ ldreq v8, [a4, #4] @ neigh->mv[1]
++ teq v1, v3
++ bne 10f
++ ldr lr, =0xFFFCFFFC
++ ssub16 ip, v8, v5
++ ssub16 v5, v5, v8
++ sel v5, v5, ip
++ ands v5, v5, lr
++ @ drop through
++10: it ne
++ movne v5, #1<<30
++11:
++ sub v6, v6, #2
++T mov v7, v7, lsr #2
++ subs a2, a2, #1
++A orr v7, v5, v7, lsr #2
++T orr v7, v5, v7
++ bhi 11b
++
++ ldrd v3, v4, [sp, #16*4]
++ ldr a2, [sp]
++ add ip, sp, #16*4
++ subs a1, a1, #1
++ add a3, a3, v3
++ add a4, a4, v4
++ bhi 1b
++ mov a1, v7, lsr v6
++ pop {a2-a4,v1-v8,pc}
++
++20: teq v8, #3
++ bne 10b
++
++ teq v1, v3
++ it eq
++ teqeq v2, v4
++ bne 40f
++ teq v1, v2
++ bne 30f
++
++ ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
++ ldr lr, =0xFFFCFFFC
++ ssub16 ip, v3, v1
++ ssub16 v5, v1, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
++ bne 25f
++ ssub16 ip, v4, v2
++ ssub16 v5, v2, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
++ beq 11b
++ @ drop through
++25: ssub16 ip, v4, v1
++ ssub16 v5, v1, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
++ bne 10b
++ ssub16 ip, v3, v2
++ ssub16 v5, v2, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
++ b 10b
++
++30: ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
++ ldr lr, =0xFFFCFFFC
++ ssub16 ip, v3, v1
++ ssub16 v5, v1, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
++ bne 10b
++ ssub16 ip, v4, v2
++ ssub16 v5, v2, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
++ b 10b
++
++40: teq v1, v4
++ ite eq
++ teqeq v2, v3
++ bne 10b
++
++ ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
++ ldr lr, =0xFFFCFFFC
++ b 25b
++
++90:
++ mov v5, #1<<30
++ b 11b
++endfunc
++
++
++#endif
++
++
++@ =============================================================================
++@
++@ 10 bit
++
++function hevc_loop_filter_luma_body_10
++ m_filter_luma 10, q11, q15
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1
++ hevc_loop_filter_luma_start
++ b .Lh_loop_luma_common_10 @ continue in the body shared with ff_hevc_rpi_h_loop_filter_luma2_neon_10
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1
++ cmp r3, #0 @ r3 (tc2 in the C prototype) == 0: return at once
++ it eq
++ bxeq lr
++ push {r4-r10,lr} @ 32 bytes
++ ldr r10, [sp, #32] @ first stack argument (no_f in the C prototype)
++.Lh_loop_luma_common_10:
++ m_filter_h_luma_16 10
++endfunc
++
++function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1
++ hevc_loop_filter_luma_start
++ sub r4, r0, #8 @ r4 = r0 - 8 bytes; the shared body reads the left-hand columns through r4
++ b .Lv_loop_luma_common_10 @ continue in the body shared with ff_hevc_rpi_v_loop_filter_luma2_neon_10
++endfunc
++
++function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1
++ cmp r3, #0 @ r3 (tc2 in the C prototype) == 0: return at once
++ it eq
++ bxeq lr
++ push {r4-r10,lr} @ 32 bytes
++ ldr r4, [sp, #36] @ second stack argument (_pix_l in the C prototype)
++ ldr r10, [sp, #32] @ first stack argument (no_f in the C prototype)
++
++.Lv_loop_luma_common_10:
++ m_filter_v_luma_16 10
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1
++ m_filter_h_uv_16 10
++endfunc
++
++function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1
++ m_filter_v_uv2_16 10
++endfunc
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
+@@ -0,0 +1,184 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++/* uses registers q8 - q13 for temp values */
++.macro tr4_luma_shift shift
++ vaddl.s16 q8, d28, d30 // c0 = src0 + src2
++ vaddl.s16 q9, d30, d31 // c1 = src2 + src3
++ vsubl.s16 q10, d28, d31 // c2 = src0 - src3
++ vaddl.s16 q11, d28, d31 // src0 + src3
++
++ vmul.i32 q12, q8, d1[0] // 29 * c0
++ vmul.i32 q13, q10, d2[0] // 55 * c2
++ vmul.i32 q8, q8, d2[0] // 55 * c0
++ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1
++
++ vsubw.s16 q11, q11, d30 // src0 - src2 + src3
++ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1
++ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1
++ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2
++
++ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3)
++ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3
++ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3
++ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3
++
++ vqrshrn.s32 d28, q12, \shift
++ vqrshrn.s32 d29, q13, \shift
++ vqrshrn.s32 d30, q11, \shift
++ vqrshrn.s32 d31, q8, \shift
++.endm
++
++/* uses registers q8 - q11 for temp values */
++.macro tr4_shift shift
++ vmull.s16 q9, d29, d0[0] // 83 * src1
++ vmull.s16 q8, d29, d0[1] // 36 * src1
++ vshll.s16 q14, d28, #6 // 64 * src0
++ vshll.s16 q10, d30, #6 // 64 * src2
++ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0
++ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1
++ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0
++ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1
++ vadd.s32 q14, q11, q9 // e0 + o0
++ vadd.s32 q15, q10, q8 // e1 + o1
++ vsub.s32 q8, q10, q8 // e1 - o1
++ vsub.s32 q9, q11, q9 // e0 - o0
++
++ vqrshrn.s32 d28, q14, \shift
++ vqrshrn.s32 d29, q15, \shift
++ vqrshrn.s32 d30, q8, \shift
++ vqrshrn.s32 d31, q9, \shift
++.endm
++
++.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \
++ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \
++ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \
++ shift, I1, I2, I3
++
++ vmull.s16 q4, \d1, d1[1] // 89 * src1
++ \I1
++ vmull.s16 q5, \d1, d1[0] // 75 * src1
++ \I2
++ vmull.s16 q6, \d1, d1[3] // 50 * src1
++ \I3
++ vmull.s16 q7, \d1, d1[2] // 18 * src1
++ vmlal.s16 q4, \d3, d1[0] // 75 * src3
++ vmlsl.s16 q5, \d3, d1[2] //-18 * src3
++ vmlsl.s16 q6, \d3, d1[1] //-89 * src3
++ vmlsl.s16 q7, \d3, d1[3] //-50 * src3
++
++ // tr4
++ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2)
++ vmull.s16 q2, \d2, d0[1] // 36 * src(1*2)
++
++ vmlal.s16 q4, \d5, d1[3] // 50 * src5
++ vmlsl.s16 q5, \d5, d1[1] //-89 * src5
++ vmlal.s16 q6, \d5, d1[2] // 18 * src5
++ vmlal.s16 q7, \d5, d1[0] // 75 * src5
++
++ vshll.s16 q3, \d0, #6 // 64 * src(0*2)
++ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2)
++ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0
++ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1
++ vadd.i32 \tmp1, q3, \tmp0 // 64 * (src(0*2) + src(2*2)) e0
++ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1
++
++ vmlal.s16 q4, \d7, d1[2] // 18 * src7
++ vmlsl.s16 q5, \d7, d1[3] //-50 * src7
++ vmlal.s16 q6, \d7, d1[0] // 75 * src7
++ vmlsl.s16 q7, \d7, d1[1] //-89 * src7
++
++ vsub.i32 q3, \tmp1, q1 // e0 - o0
++ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0
++ vadd.i32 q1, \tmp0, q2 // e1 + o1
++ vsub.i32 q2, \tmp0, q2 // e1 - o1
++
++ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0]
++ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7]
++ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4]
++ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3]
++ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1]
++ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6]
++ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5]
++ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2]
++ vqrshrn.s32 \d0, \tmp0, #\shift
++ vqrshrn.s32 \d4, \tmp1, #\shift
++ vqrshrn.s32 \d1, q3, #\shift
++ vqrshrn.s32 \d5, q1, #\shift
++ vqrshrn.s32 \d2, q6, #\shift
++ vqrshrn.s32 \d6, q5, #\shift
++ vqrshrn.s32 \d3, q7, #\shift
++ vqrshrn.s32 \d7, q4, #\shift
++.endm
++
++.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3
++ vld1.16 {\d0}, [r0 :64], r3
++ vld1.16 {\d1}, [r2 :64], r3
++ vld1.16 {\d2}, [r0 :64], r3
++ vld1.16 {\d3}, [r2 :64], r3
++ vld1.16 {\d4}, [r0 :64], r3
++ vld1.16 {\d5}, [r2 :64], r3
++ vld1.16 {\d6}, [r0 :64], r3
++ vld1.16 {\d7}, [r2 :64], r3
++
++ tr8_process \
++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
++ \q01, \q23, 7, "\I1", "\I2", "\I3"
++.endm
++
++.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift
++ tr8_process \
++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
++ \q01, \q23, \shift
++
++ vzip.16 \d0, \d4
++ vzip.16 \d1, \d5
++ vzip.16 \d2, \d6
++ vzip.16 \d3, \d7
++ vst4.16 {\d0-\d3}, [r0 :128], r3
++ vst4.16 {\d4-\d7}, [r2 :128], r3
++.endm
++
++#define BIT_DEPTH 8
++#include "rpi_hevc_idct_fn_neon.S"
++
++.text
++
++.align 4
++tr4f:
++.word 0x00240053 // d0[1] = 36, d0[0] = 83 (lanes as used by tr4_shift)
++.word 0x00000000
++tr8f:
++.word 0x0059004b // d1[1] = 89, d1[0] = 75 (lanes as used by tr8_process)
++.word 0x00320012 // d1[3] = 50, d1[2] = 18
++tr16:
++.word 0x005a0057 // 90, d2[0] = 87
++.word 0x00500046 // 80, d2[2] = 70
++.word 0x0039002b // 57, d2[0] = 43
++.word 0x00190009 // 25, d2[2] = 9
++
++#undef BIT_DEPTH
++#define BIT_DEPTH 10
++#include "rpi_hevc_idct_fn_neon.S"
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c
+@@ -0,0 +1,32 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/arm/cpu.h"
++#include "libavcodec/rpi_hevcdsp.h"
++#include "rpi_hevcdsp_arm.h"
++
++av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth)
++{
++ int cpu_flags = av_get_cpu_flags(); /* CPU feature flags, tested with have_neon() below */
++
++ if (have_neon(cpu_flags)) /* hand over to the NEON init only when NEON is reported */
++ ff_hevcdsp_rpi_init_neon(c, bit_depth);
++}
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
+@@ -0,0 +1,467 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "config.h"
++#include "libavutil/attributes.h"
++#include "libavutil/arm/cpu.h"
++#include "libavcodec/rpi_hevcdsp.h"
++#include "rpi_hevcdsp_arm.h"
++#include "libavcodec/avcodec.h"
++#include "libavcodec/bit_depth_template.c"
++
++// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but
++// have been removed from head as we never use them.
++
++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++
++void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++
++void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++ uint8_t * _pix_l);
++void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
++ unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++ uint8_t * src_l,
++ unsigned int no_f);
++
++void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++ uint8_t * _pix_l);
++void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
++ unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++ uint8_t * src_l,
++ unsigned int no_f);
++
++void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs);
++
++void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs);
++
++void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++
++void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++
++
++void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++
++void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++
++
++void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++
++
++void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++
++void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++
++void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++
++void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++
++void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++
++void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++
++void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++
++void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++
++void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++
++
++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ int in_inc0, int in_inc1);
++void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
++
++
++static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 48-wide SAO edge filter built from the 32-wide and 16-wide NEON routines; the width argument is not used */
++ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); /* first 32 columns */
++ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); /* remaining 16 columns; +32 bytes presumably = 32 one-byte samples - confirm */
++}
++static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 48-wide SAO edge filter built from the 32-wide and 16-wide NEON routines; the width argument is not used */
++ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); /* first 32 columns */
++ ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); /* remaining 16 columns; +64 bytes presumably = 32 two-byte samples - confirm */
++}
++
++static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 48-wide SAO band filter built from the 32-wide and 16-wide NEON routines; the width argument is not used */
++ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); /* first 32 columns */
++ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* remaining 16 columns; +32 bytes presumably = 32 one-byte samples - confirm */
++}
++static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 48-wide SAO band filter built from the 32-wide and 16-wide NEON routines; the width argument is not used */
++ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); /* first 32 columns */
++ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* remaining 16 columns; +64 bytes presumably = 32 two-byte samples - confirm */
++}
++
++#if SAO_FILTER_N == 6
++static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 24-wide SAO edge filter built from the 16-wide and 8-wide NEON routines; the width argument is not used */
++ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); /* first 16 columns */
++ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); /* remaining 8 columns; +16 bytes presumably = 16 one-byte samples - confirm */
++}
++static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 24-wide SAO edge filter built from the 16-wide and 8-wide NEON routines; the width argument is not used */
++ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); /* first 16 columns */
++ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); /* remaining 8 columns; +32 bytes presumably = 16 two-byte samples - confirm */
++}
++
++static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 24-wide SAO band filter built from the 16-wide and 8-wide NEON routines; the width argument is not used */
++ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* first 16 columns */
++ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); /* remaining 8 columns; +16 bytes presumably = 16 one-byte samples - confirm */
++}
++static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 24-wide SAO band filter built from the 16-wide and 8-wide NEON routines; the width argument is not used */
++ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* first 16 columns */
++ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); /* remaining 8 columns; +32 bytes presumably = 16 two-byte samples - confirm */
++}
++
++static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height)
++{ /* 24-wide chroma SAO edge filter built from the 16-wide and 8-wide chroma routines; the width argument is not used */
++ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); /* first 16 columns */
++ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); /* remaining 8 columns; +32 bytes presumably skips 16 interleaved U/V byte pairs - confirm */
++}
++static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height)
++{ /* 24-wide chroma SAO edge filter built from the 16-wide and 8-wide chroma routines; the width argument is not used */
++ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); /* first 16 columns */
++ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); /* remaining 8 columns; +64 bytes presumably skips 16 interleaved U/V 16-bit pairs - confirm */
++}
++
++static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height)
++{ /* 24-wide chroma SAO band filter built from the 16-wide and 8-wide chroma routines; the width argument is not used */
++ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); /* first 16 columns */
++ ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); /* remaining 8 columns; +32 bytes presumably skips 16 interleaved U/V byte pairs - confirm */
++}
++static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height)
++{ /* 24-wide chroma SAO band filter built from the 16-wide and 8-wide chroma routines; the width argument is not used */
++ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); /* first 16 columns */
++ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); /* remaining 8 columns; +64 bytes presumably skips 16 interleaved U/V 16-bit pairs - confirm */
++}
++#endif
++
++
++
++#if RPI_HEVC_SAO_BUF_STRIDE != 160
++#error SAO edge src stride not 160 - value used in .S
++#endif
++
++av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
++{ /* Fill the function-pointer slots of c with the NEON routines for bit_depth 8 or 10 */
++ if (bit_depth == 8) { /* 8-bit (_neon_8) routines */
++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8;
++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; /* same routine as the non-_c slot */
++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8;
++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; /* same routine as the non-_c slot */
++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8;
++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8;
++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8; /* only idct[0] and idct[1] are assigned in this function */
++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8;
++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8;
++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8;
++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8;
++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8;
++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8;
++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_8;
++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8;
++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8;
++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8;
++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8;
++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8;
++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8;
++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8;
++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8;
++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8;
++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8;
++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8;
++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8;
++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8;
++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8;
++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8;
++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8;
++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8;
++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8;
++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8;
++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8; /* slots 0..4 hold widths 8, 16, 32, 48, 64 */
++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8;
++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8;
++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8; /* static C wrapper defined earlier in this file (32 + 16) */
++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8;
++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8;
++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8;
++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8;
++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8; /* static C wrapper defined earlier in this file (32 + 16) */
++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8;
++#if SAO_FILTER_N == 6
++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8; /* slot 5 holds the 24-wide wrapper (16 + 8) */
++ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_8;
++#endif
++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8; /* chroma: only slots 0..2 (and 5 below) are assigned here */
++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8;
++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8;
++
++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8;
++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8;
++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8;
++
++#if SAO_FILTER_N == 6
++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8;
++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8;
++#endif
++ }
++ else if (bit_depth == 10) { /* 10-bit (_neon_10) routines, same slot layout as the 8-bit branch */
++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10;
++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10;
++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10;
++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10;
++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10;
++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10;
++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10;
++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10;
++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10;
++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10;
++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10;
++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10;
++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10;
++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10;
++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10;
++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10;
++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10;
++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10;
++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10;
++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10;
++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10;
++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10;
++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10;
++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10;
++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10;
++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10;
++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10;
++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10;
++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10;
++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10;
++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10;
++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10;
++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10;
++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10;
++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10;
++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10;
++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10;
++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10;
++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10;
++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10;
++
++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10;
++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10;
++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10;
++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10;
++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10;
++#if SAO_FILTER_N == 6
++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10;
++ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10;
++#endif
++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10;
++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10;
++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10;
++
++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10;
++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_10;
++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10;
++
++#if SAO_FILTER_N == 6
++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10;
++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10;
++#endif
++ } /* any other bit_depth reaches here with none of the slots above modified */
++
++ assert(offsetof(HEVCRpiMvField, mv) == 0); /* NOTE(review): these offsets are presumably hard-coded in the boundary-strength asm - confirm */
++ assert(offsetof(HEVCRpiMvField, ref_idx) == 8);
++ assert(offsetof(HEVCRpiMvField, pred_flag) == 10);
++ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; /* assigned for every bit_depth */
++ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; /* assigned for every bit_depth */
++}
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
+@@ -0,0 +1,620 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++ .arch_extension mp @ enable PLDW
++
++#define BIT_DEPTH 10
++
++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX /* clamp the signed 16-bit lanes of Q0..Q3 in place to [Q_MIN, Q_MAX] */
++ vmax.s16 \Q0, \Q_MIN /* lanes below Q_MIN become Q_MIN */
++ vmax.s16 \Q1, \Q_MIN
++ vmax.s16 \Q2, \Q_MIN
++ vmax.s16 \Q3, \Q_MIN
++ vmin.s16 \Q0, \Q_MAX /* lanes above Q_MAX become Q_MAX */
++ vmin.s16 \Q1, \Q_MAX
++ vmin.s16 \Q2, \Q_MAX
++ vmin.s16 \Q3, \Q_MAX
++.endm
++
++@ add_residual4x4(
++@ uint16_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1
++ add ip, r0, r2 /* ip = row 1 (r2 is the stride in bytes) */
++ vld1.16 {q10, q11}, [r1] /* all 16 residuals: q10 = rows 0-1, q11 = rows 2-3 */
++ lsl r2, #1 /* r2 = 2 * stride: r0 walks even rows, ip odd rows */
++ vld1.16 {d0}, [r0 :64], r2 /* row 0 */
++ vld1.16 {d1}, [ip :64], r2 /* row 1 */
++ vld1.16 {d2}, [r0 :64] /* row 2 */
++ vld1.16 {d3}, [ip :64] /* row 3 */
++ sub r0, r2 /* rewind r0 to row 0 for the stores */
++ vqadd.s16 q0, q10 /* saturating add of residuals to rows 0-1 */
++ sub ip, r2 /* rewind ip to row 1 */
++ vqadd.s16 q1, q11 /* saturating add of residuals to rows 2-3 */
++ vmov.i16 q8, #0 /* lower clip bound */
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 /* upper clip bound (1023 with BIT_DEPTH 10) */
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {d0}, [r0 :64], r2 /* write rows 0..3 back in the order they were loaded */
++ vst1.16 {d1}, [ip :64], r2
++ vst1.16 {d2}, [r0 :64]
++ vst1.16 {d3}, [ip :64]
++ bx lr
++
++endfunc
++
++@ add_residual4x4_dc(
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
++ add ip, r0, r1 /* ip = row 1 (r1 is the stride in bytes) */
++ vdup.16 q15, r2 /* replicate dc into all eight 16-bit lanes */
++ lsl r1, #1 /* r1 = 2 * stride: r0 walks even rows, ip odd rows */
++ vld1.16 {d0}, [r0 :64], r1 /* row 0 */
++ vld1.16 {d1}, [ip :64], r1 /* row 1 */
++ vld1.16 {d2}, [r0 :64] /* row 2 */
++ vld1.16 {d3}, [ip :64] /* row 3 */
++ sub r0, r1 /* rewind r0 to row 0 for the stores */
++ vqadd.s16 q0, q15 /* saturating add of dc to rows 0-1 */
++ sub ip, r1 /* rewind ip to row 1 */
++ vqadd.s16 q1, q15 /* saturating add of dc to rows 2-3 */
++ vmov.i16 q8, #0 /* lower clip bound */
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 /* upper clip bound (1023 with BIT_DEPTH 10) */
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {d0}, [r0 :64], r1 /* write rows 0..3 back in the order they were loaded */
++ vst1.16 {d1}, [ip :64], r1
++ vst1.16 {d2}, [r0 :64]
++ vst1.16 {d3}, [ip :64]
++ bx lr
++
++endfunc
++
++
++@ add_residual8x8(
++@ uint16_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1
++ mov r3, #8 /* r3 = rows remaining */
++ vmov.i64 q8, #0 /* lower clip bound (all lanes zero) */
++ add ip, r0, r2 /* ip = row 1 (r2 is the stride in bytes) */
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 /* upper clip bound */
++ lsl r2, #1 /* r2 = 2 * stride: r0 walks even rows, ip odd rows */
++1: /* each pass handles four rows */
++ vldm r1!, {q10-q13} /* 32 residuals (four rows); r1 advances 64 bytes */
++ vld1.16 {q0}, [r0 :128], r2 /* row n */
++ vld1.16 {q1}, [ip :128], r2 /* row n+1 */
++ vld1.16 {q2}, [r0 :128] /* row n+2 */
++ vld1.16 {q3}, [ip :128] /* row n+3 */
++ sub r0, r2 /* rewind r0 to row n for the stores */
++ vqadd.s16 q0, q10
++ sub ip, r2 /* rewind ip to row n+1 */
++ vqadd.s16 q1, q11
++ subs r3, #4 /* four rows done; the flags feed the bne below */
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0}, [r0 :128], r2
++ vst1.16 {q1}, [ip :128], r2
++ vst1.16 {q2}, [r0 :128], r2 /* leaves r0 at row n+4 */
++ vst1.16 {q3}, [ip :128], r2 /* leaves ip at row n+5 */
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual4x4_dc_c(
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
++ mov r3, #4 /* r3 = rows for the shared loop */
++ vdup.32 q15, r2 /* replicate the 32-bit dc_uv word into all four 32-bit lanes (presumably a packed 16-bit U/V pair - confirm) */
++ b 9f /* continue at the next label 9, inside the 8x8_dc function that follows */
++endfunc
++
++@ add_residual8x8_dc(
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r2 /* replicate dc into all eight 16-bit lanes */
++ mov r3, #8 /* r3 = rows remaining */
++9: /* shared entry: the 4x4_dc_c stub above branches here with r3 and q15 already set */
++ vmov.i16 q8, #0 /* lower clip bound */
++ add ip, r0, r1 /* ip = row 1 (r1 is the stride in bytes) */
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 /* upper clip bound */
++ lsl r1, #1 /* r1 = 2 * stride: r0 walks even rows, ip odd rows */
++1: /* each pass handles four rows of eight 16-bit lanes */
++ vld1.16 {q0}, [r0 :128], r1 /* row n */
++ vld1.16 {q1}, [ip :128], r1 /* row n+1 */
++ vld1.16 {q2}, [r0 :128] /* row n+2 */
++ vld1.16 {q3}, [ip :128] /* row n+3 */
++ sub r0, r1 /* rewind r0 to row n for the stores */
++ vqadd.s16 q0, q15
++ sub ip, r1 /* rewind ip to row n+1 */
++ vqadd.s16 q1, q15
++ subs r3, #4 /* four rows done; the flags feed the bne below */
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0}, [r0 :128], r1
++ vst1.16 {q1}, [ip :128], r1
++ vst1.16 {q2}, [r0 :128], r1 /* leaves r0 at row n+4 */
++ vst1.16 {q3}, [ip :128], r1 /* leaves ip at row n+5 */
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual16x16(
++@ uint16_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1
++ add ip, r0, r2 /* ip = row 1 (r2 is the stride in bytes) */
++ vmov.i16 q8, #0 /* lower clip bound */
++ lsl r2, #1 /* r2 = 2 * stride: r0 walks even rows, ip odd rows */
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 /* upper clip bound */
++ mov r3, #16 /* r3 = rows remaining */
++1: /* each pass handles two rows of sixteen 16-bit lanes */
++ vldm r1!, {q10-q13} /* 32 residuals (two rows); r1 advances 64 bytes */
++ @ For RPI Sand we could guarantee :256 but not for general
++ @ non-RPI allocation. :128 is as good as we can claim
++ vld1.16 {q0, q1}, [r0 :128] /* even row */
++ subs r3, #2 /* two rows done; the flags feed the bne below */
++ vld1.16 {q2, q3}, [ip :128] /* odd row */
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q11
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0, q1}, [r0 :128], r2 /* store, then step r0 two rows */
++ vst1.16 {q2, q3}, [ip :128], r2 /* store, then step ip two rows */
++ bne 1b
++ bx lr
++endfunc
++
++@ add_residual8x8_dc_c(
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
++ mov r3, #8 @ row count consumed by the shared loop at label 9
++ vdup.32 q15, r2 @ replicate the 32-bit dc_uv word, giving alternating 16-bit lanes (presumably U then V - confirm against caller)
++ b 9f @ tail-jump into the body of the 16x16 dc routine that follows
++endfunc
++
++@ add_residual16x16_dc(
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
++ vdup.i16 q15, r2 @ broadcast dc into all eight 16-bit lanes
++ mov r3, #16 @ rows to process
++9: @ shared entry: the 8x8 dc_c routine above branches here with r3 and q15 preset
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ add ip, r0, r1 @ ip = dst + stride (odd rows)
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ lsl r1, #1 @ r1 = 2 * stride; two rows are handled per pass
++1:
++ @ For RPI Sand we could guarantee :256 but not for general
++ @ non-RPI allocation. :128 is as good as we can claim
++ vld1.16 {q0, q1}, [r0 :128] @ even dst row (16 x 16-bit)
++ subs r3, #2 @ flags are consumed by the bne below
++ vqadd.s16 q0, q15
++ vqadd.s16 q1, q15
++ vld1.16 {q2, q3}, [ip :128] @ odd dst row
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0, q1}, [r0 :128], r1
++ vst1.16 {q2, q3}, [ip :128], r1
++ bne 1b
++ bx lr
++
++endfunc
++
++
++@ add_residual32x32(
++@ uint16_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1
++ push {lr} @ lr is reused below as the prefetch pointer
++ mov r3, #32 @ rows to process
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ add lr, r0, r2 @ lr = next dst row, used only for pldw
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ add ip, r0, #32 @ ip = second 32-byte half of the current row
++1: @ one 64-byte row (32 x 16-bit) per pass
++ vldm r1!, {q10-q13} @ 32 residuals for this row
++ vldm r0, {q0-q3} @ whole dst row, no writeback
++ vqadd.s16 q0, q10
++ pldw [lr] @ prefetch the next row with intent to write
++ vqadd.s16 q1, q11
++ add lr, r2
++ vqadd.s16 q2, q12
++ subs r3, #1 @ flags are consumed by the bne below
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0-q1}, [r0], r2 @ first half of the row, then step to the next row
++ vst1.16 {q2-q3}, [ip], r2 @ second half of the row
++ bne 1b
++ pop {pc}
++
++endfunc
++
++@ add_residual16x16_dc_c(
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
++ mov r3, #16 @ row count consumed by the shared loop at label 9
++ vdup.32 q15, r2 @ replicate the 32-bit dc_uv word, giving alternating 16-bit lanes (presumably U then V - confirm against caller)
++ b 9f @ tail-jump into the body of the 32x32 dc routine that follows
++endfunc
++
++@ add_residual32x32_dc(
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r2 @ broadcast dc into all eight 16-bit lanes
++ mov r3, #32 @ rows to process
++9: @ shared entry: the 16x16 dc_c routine above branches here with r3 and q15 preset
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ add ip, r0, #32 @ ip = second 32-byte half of the current row
++1: @ one 64-byte row per pass
++ vldm r0, {q0-q3} @ whole dst row, no writeback
++ vqadd.s16 q0, q15
++ subs r3, #1 @ flags are consumed by the bne below
++ vqadd.s16 q1, q15
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0-q1}, [r0], r1 @ first half of the row, then step to the next row
++ vst1.16 {q2-q3}, [ip], r1 @ second half of the row
++ bne 1b
++ bx lr
++
++endfunc
++
++@ ============================================================================
++@ U add
++
++@ add_residual4x4_u(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r3 @ dc broadcast; added to the odd (V) samples below
++ add ip, r0, r2 @ ip = dst + stride (odd rows)
++ vld1.16 {q10, q11}, [r1 :256] @ all 16 U residuals: q10 = rows 0-1, q11 = rows 2-3
++ lsl r2, #1 @ r2 = 2 * stride
++ vld2.16 {d0, d2}, [r0 :128], r2 @ row 0 de-interleaved: d0 = even (U) samples, d2 = odd (V) samples
++ vld2.16 {d1, d3}, [ip :128], r2 @ row 1, so q0 = U rows 0-1 and q1 = V rows 0-1
++ vld2.16 {d4, d6}, [r0 :128] @ row 2
++ vld2.16 {d5, d7}, [ip :128] @ row 3, so q2 = U rows 2-3 and q3 = V rows 2-3
++ sub r0, r2 @ rewind r0 to row 0 for the stores
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ sub ip, r2 @ rewind ip to row 1
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++
++ vqadd.s16 q0, q10 @ U += residual
++ vqadd.s16 q1, q15 @ V += dc
++ vqadd.s16 q2, q11
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++
++ vst2.16 {d0, d2}, [r0 :128], r2 @ re-interleave and store row 0
++ vst2.16 {d1, d3}, [ip :128], r2 @ row 1
++ vst2.16 {d4, d6}, [r0 :128] @ row 2
++ vst2.16 {d5, d7}, [ip :128] @ row 3
++ bx lr
++endfunc
++
++@ add_residual8x8_u(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r3 @ dc broadcast; added to the odd (V) samples below
++ mov r3, #8 @ r3 is now the row counter
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ add ip, r0, r2 @ ip = dst + stride (odd rows)
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ lsl r2, #1 @ r2 = 2 * stride; two rows per pass
++1:
++ vld2.16 {q0, q1}, [r0 :256] @ even row de-interleaved: q0 = U samples, q1 = V samples
++ subs r3, #2 @ flags are consumed by the bne below
++ vld2.16 {q2, q3}, [ip :256] @ odd row: q2 = U, q3 = V
++ vld1.16 {q10, q11}, [r1 :256]! @ U residuals for the two rows
++ vqadd.s16 q0, q10 @ U += residual
++ vqadd.s16 q1, q15 @ V += dc
++ vqadd.s16 q2, q11
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst2.16 {q0, q1}, [r0 :256], r2 @ re-interleave and store
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ bx lr
++endfunc
++
++@ add_residual16x16_u(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1
++ push {lr} @ lr is reused below as the prefetch pointer
++ vdup.16 q15, r3 @ dc broadcast; added to the odd (V) samples below
++ mov r3, #16 @ r3 is now the row counter
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ add lr, r0, r2 @ lr = next dst row, used only for pldw
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ add ip, r0, #32 @ ip = second 32-byte half of the current row
++1: @ one row of 16 interleaved pairs per pass
++ vld2.16 {q0, q1}, [r0 :256] @ first half: q0 = U samples, q1 = V samples
++ vld2.16 {q2, q3}, [ip :256] @ second half: q2 = U, q3 = V
++ vld1.16 {q10, q11}, [r1 :256]! @ 16 U residuals for this row
++ vqadd.s16 q0, q10 @ U += residual
++ pldw [lr] @ prefetch the next row with intent to write
++ vqadd.s16 q1, q15 @ V += dc
++ add lr, r2
++ vqadd.s16 q2, q11
++ subs r3, #1 @ flags are consumed by the bne below
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst2.16 {q0, q1}, [r0 :256], r2
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ pop {pc}
++endfunc
++
++@ ============================================================================
++@ V add
++
++@ add_residual4x4_v(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r3 @ dc broadcast; added to the even (U) samples below
++ add ip, r0, r2 @ ip = dst + stride (odd rows)
++ vld1.16 {q10, q11}, [r1 :256] @ all 16 V residuals: q10 = rows 0-1, q11 = rows 2-3
++ lsl r2, #1 @ r2 = 2 * stride
++ vld2.16 {d0, d2}, [r0 :128], r2 @ row 0 de-interleaved: d0 = even (U) samples, d2 = odd (V) samples
++ vld2.16 {d1, d3}, [ip :128], r2 @ row 1, so q0 = U rows 0-1 and q1 = V rows 0-1
++ vld2.16 {d4, d6}, [r0 :128] @ row 2
++ vld2.16 {d5, d7}, [ip :128] @ row 3, so q2 = U rows 2-3 and q3 = V rows 2-3
++ sub r0, r2 @ rewind r0 to row 0 for the stores
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ sub ip, r2 @ rewind ip to row 1
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++
++ vqadd.s16 q0, q15 @ U += dc
++ vqadd.s16 q1, q10 @ V += residual
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q11
++ clip16_4 q0, q1, q2, q3, q8, q9
++
++ vst2.16 {d0, d2}, [r0 :128], r2 @ re-interleave and store row 0
++ vst2.16 {d1, d3}, [ip :128], r2 @ row 1
++ vst2.16 {d4, d6}, [r0 :128] @ row 2
++ vst2.16 {d5, d7}, [ip :128] @ row 3
++ bx lr
++endfunc
++
++@ add_residual8x8_v(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r3 @ dc broadcast; added to the even (U) samples below
++ mov r3, #8 @ r3 is now the row counter
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ add ip, r0, r2 @ ip = dst + stride (odd rows)
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ lsl r2, #1 @ r2 = 2 * stride; two rows per pass
++1:
++ vld2.16 {q0, q1}, [r0 :256] @ even row de-interleaved: q0 = U samples, q1 = V samples
++ subs r3, #2 @ flags are consumed by the bne below
++ vld2.16 {q2, q3}, [ip :256] @ odd row: q2 = U, q3 = V
++ vld1.16 {q10, q11}, [r1 :256]! @ V residuals for the two rows
++ vqadd.s16 q0, q15 @ U += dc
++ vqadd.s16 q1, q10 @ V += residual
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q11
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst2.16 {q0, q1}, [r0 :256], r2 @ re-interleave and store
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ bx lr
++endfunc
++
++@ add_residual16x16_v(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1
++ push {lr} @ lr is reused below as the prefetch pointer
++ vdup.16 q15, r3 @ dc broadcast; added to the even (U) samples below
++ mov r3, #16 @ r3 is now the row counter
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ add lr, r0, r2 @ lr = next dst row, used only for pldw
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ add ip, r0, #32 @ ip = second 32-byte half of the current row
++1: @ one row of 16 interleaved pairs per pass
++ vld2.16 {q0, q1}, [r0 :256] @ first half: q0 = U samples, q1 = V samples
++ vld2.16 {q2, q3}, [ip :256] @ second half: q2 = U, q3 = V
++ vld1.16 {q10, q11}, [r1 :256]! @ 16 V residuals for this row
++ vqadd.s16 q0, q15 @ U += dc
++ pldw [lr] @ prefetch the next row with intent to write
++ vqadd.s16 q1, q10 @ V += residual
++ add lr, r2
++ vqadd.s16 q2, q15
++ subs r3, #1 @ flags are consumed by the bne below
++ vqadd.s16 q3, q11
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst2.16 {q0, q1}, [r0 :256], r2
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ pop {pc}
++endfunc
++
++@ ============================================================================
++@ U & V add
++
++@ add_residual4x4_c(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ add ip, r0, r2 @ ip = dst + stride (odd rows)
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ lsl r2, #1 @ r2 = 2 * stride
++ vldm r1, {q10-q13} @ q10-q11 = first 16 residuals (U), q12-q13 = next 16 (V)
++ vld2.16 {d0, d2}, [r0 :128], r2 @ row 0 de-interleaved: d0 = even (U) samples, d2 = odd (V) samples
++ vld2.16 {d1, d3}, [ip :128], r2 @ row 1
++ vld2.16 {d4, d6}, [r0 :128] @ row 2
++ vld2.16 {d5, d7}, [ip :128] @ row 3
++
++ sub r0, r2 @ rewind r0 to row 0 for the stores
++ vqadd.s16 q0, q10 @ U rows 0-1
++ sub ip, r2 @ rewind ip to row 1
++ vqadd.s16 q1, q12 @ V rows 0-1
++ vqadd.s16 q2, q11 @ U rows 2-3
++ vqadd.s16 q3, q13 @ V rows 2-3
++ clip16_4 q0, q1, q2, q3, q8, q9
++
++ vst2.16 {d0, d2}, [r0 :128], r2 @ re-interleave and store row 0
++ vst2.16 {d1, d3}, [ip :128], r2 @ row 1
++ vst2.16 {d4, d6}, [r0 :128] @ row 2
++ vst2.16 {d5, d7}, [ip :128] @ row 3
++ bx lr
++endfunc
++
++@ add_residual8x8_c(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1
++ push {lr} @ lr is reused below as the row counter
++ add ip, r0, r2 @ ip = dst + stride (odd rows)
++ lsl r2, #1 @ r2 = 2 * stride; two rows per pass
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ add r3, r1, #(8*8*2) @ Offset to V
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ mov lr, #8 @ rows to process
++1:
++ vld1.16 {q10, q11}, [r1 :256]! @ U residuals for two rows
++ subs lr, #2 @ flags are consumed by the bne below
++ vld2.16 {q0, q1}, [r0 :256] @ even row de-interleaved: q0 = U samples, q1 = V samples
++ vld2.16 {q2, q3}, [ip :256] @ odd row: q2 = U, q3 = V
++ vld1.16 {q12, q13}, [r3 :256]! @ V residuals for two rows
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q12
++ vqadd.s16 q2, q11
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst2.16 {q0, q1}, [r0 :256], r2 @ re-interleave and store
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ pop {pc}
++endfunc
++
++@ add_residual16x16_c(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1
++ push {r4, lr} @ r4 becomes the prefetch pointer, lr the row counter
++ vmov.i16 q8, #0 @ q8 = lower clip bound
++ add r3, r1, #(16*16*2) @ Offset to V
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ q9 = upper clip bound
++ add ip, r0, #32 @ ip = second 32-byte half of the current row
++ add r4, r0, r2 @ r4 = next dst row, used only for pldw
++ mov lr, #16 @ rows to process
++1: @ one row of 16 interleaved pairs per pass
++ vld2.16 {q0, q1}, [r0 :256] @ first half: q0 = U samples, q1 = V samples
++ vld2.16 {q2, q3}, [ip :256] @ second half: q2 = U, q3 = V
++ vld1.16 {q10, q11}, [r1 :256]! @ 16 U residuals for this row
++ vld1.16 {q12, q13}, [r3 :256]! @ 16 V residuals for this row
++ vqadd.s16 q0, q10
++ pldw [r4] @ prefetch the next row with intent to write
++ vqadd.s16 q1, q12
++ add r4, r2
++ vqadd.s16 q2, q11
++ subs lr, #1 @ flags are consumed by the bne below
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst2.16 {q0, q1}, [r0 :256], r2
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ pop {r4,pc}
++endfunc
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
+@@ -0,0 +1,741 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++ .arch_extension mp @ enable PLDW
++
++@ General notes:
++@
++@ Residual is generally only guaranteed to be clipped to 16 bits.
++@ This means that we do need to do vmovl, vqadd, vqmovun
++@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away
++@ with this).
++@
++@ There is an exception for the DC case because its transform is guaranteed
++@ to be small enough that overflow cannot occur during the first add.
++
++@ ============================================================================
++@ Y add
++
++function ff_hevc_rpi_add_residual_4x4_neon_8, export=1
++ add ip, r0, r2 @ ip = r0 + r2 (odd rows); r0 = dst, r1 = residuals, r2 = stride as used below
++ vld1.16 {q0, q1}, [r1] @ all 16 residuals: q0 = rows 0-1, q1 = rows 2-3
++ lsl r2, #1 @ r2 = 2 * stride
++ vld1.32 d4[0], [r0], r2 @ row 0 (4 bytes), r0 moves to row 2
++ rsb r3, r2, #0 @ r3 = -2 * stride, used to step the pointers back
++ vld1.32 d4[1], [ip], r2 @ row 1, ip moves to row 3
++ vld1.32 d5[0], [r0], r3 @ row 2, r0 returns to row 0
++ vld1.32 d5[1], [ip], r3 @ row 3, ip returns to row 1
++ vmovl.u8 q8, d4 @ widen pixels to 16 bits
++ vmovl.u8 q9, d5
++ vqadd.s16 q0, q8 @ saturating add of residual and pixel
++ vqadd.s16 q1, q9
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation back to 8 bits
++ vqmovun.s16 d1, q1
++ vst1.32 d0[0], [r0], r2 @ row 0
++ vst1.32 d0[1], [ip], r2 @ row 1
++ vst1.32 d1[0], [r0] @ row 2
++ vst1.32 d1[1], [ip] @ row 3
++ bx lr
++endfunc
++
++function ff_hevc_rpi_add_residual_8x8_neon_8, export=1
++ push {r4, lr} @ r4 and lr become the look-ahead load pointers
++ vld1.16 {q0, q1}, [r1]! @ residuals for rows 0 and 1
++ add ip, r0, r2 @ ip = row 1 (store pointer for odd rows)
++ vld1.8 {d6}, [r0] @ dst row 0
++ add r4, r0, r2, lsl #1 @ r4 = row 2 (load pointer, one row pair ahead of r0)
++ vld1.8 {d7}, [ip] @ dst row 1
++ add lr, ip, r2, lsl #1 @ lr = row 3 (load pointer, one row pair ahead of ip)
++ lsl r2, #1 @ r2 = 2 * stride
++ mov r3, #8-2 @ rows left after the pair handled in this prologue
++ vmovl.u8 q2, d6 @ widen pixels to 16 bits
++ vmovl.u8 q3, d7
++ vqadd.s16 q2, q0 @ saturating add of residual
++ vqadd.s16 q3, q1
++1: @ software pipelined: store the previous pair while loading and adding the next
++ vld1.16 {q0, q1}, [r1]! @ residuals for the next pair
++ subs r3, #2 @ flags are consumed by the bne below
++ vqmovun.s16 d4, q2 @ narrow previous pair to 8 bits with unsigned saturation
++ vqmovun.s16 d5, q3
++ vld1.8 {d6}, [r4], r2 @ next even row
++ vld1.8 {d7}, [lr], r2 @ next odd row
++ vst1.8 {d4}, [r0], r2 @ store previous even row
++ vst1.8 {d5}, [ip], r2 @ store previous odd row
++ vmovl.u8 q2, d6
++ pldw [r4] @ prefetch the following even row with intent to write
++ vmovl.u8 q3, d7
++ vqadd.s16 q2, q0
++ vqadd.s16 q3, q1
++ bne 1b
++
++ vqmovun.s16 d4, q2 @ epilogue: narrow and store the final pair
++ vqmovun.s16 d5, q3
++ vst1.8 {d4}, [r0]
++ vst1.8 {d5}, [ip]
++ pop {r4, pc}
++endfunc
++
++function ff_hevc_rpi_add_residual_16x16_neon_8, export=1
++ vld1.16 {q0, q1}, [r1]! @ 16 residuals for row 0
++ add ip, r0, r2 @ ip = next row (load pointer, one row ahead of r0)
++ vld1.8 {q3}, [r0] @ dst row 0 (16 bytes)
++ mov r3, #16-1 @ rows left after the one handled in this prologue
++ vmovl.u8 q2, d6 @ widen low 8 pixels
++ vmovl.u8 q3, d7 @ widen high 8 pixels (q3 is overwritten after d7 is read)
++ vqadd.s16 q2, q0 @ saturating add of residual
++ vqadd.s16 q3, q1
++1: @ software pipelined: store the previous row while loading and adding the next
++ vld1.16 {q0, q1}, [r1]! @ residuals for the next row
++ subs r3, #1 @ flags are consumed by the bne below
++ vqmovun.s16 d4, q2 @ narrow previous row to 8 bits with unsigned saturation
++ vqmovun.s16 d5, q3
++ vld1.8 {q3}, [ip], r2 @ next dst row
++ vst1.8 {q2}, [r0], r2 @ store previous row
++ vmovl.u8 q2, d6
++ pldw [ip] @ prefetch the following row with intent to write
++ vmovl.u8 q3, d7
++ vqadd.s16 q2, q0
++ vqadd.s16 q3, q1
++ bne 1b
++
++ vqmovun.s16 d4, q2 @ epilogue: narrow and store the final row
++ vqmovun.s16 d5, q3
++ vst1.8 {q2}, [r0]
++ bx lr
++endfunc
++
++function ff_hevc_rpi_add_residual_32x32_neon_8, export=1
++ vldm r1!, {q0-q3} @ 32 residuals for row 0
++ vld1.8 {q8, q9}, [r0] @ dst row 0 (32 bytes)
++ add ip, r0, r2 @ ip = next row (load pointer, one row ahead of r0)
++ vmovl.u8 q10, d16 @ widen pixels 0-7
++ mov r3, #32-1 @ rows left after the one handled in this prologue
++ vmovl.u8 q11, d17 @ pixels 8-15
++ vmovl.u8 q12, d18 @ pixels 16-23
++ vmovl.u8 q13, d19 @ pixels 24-31
++ vqadd.s16 q10, q0 @ saturating add of residual
++ vqadd.s16 q11, q1
++ vqadd.s16 q12, q2
++ vqadd.s16 q13, q3
++1: @ software pipelined: store the previous row while loading and adding the next
++ vldm r1!, {q0-q3} @ residuals for the next row
++ vqmovun.s16 d20, q10 @ narrow previous row to 8 bits with unsigned saturation
++ vqmovun.s16 d21, q11
++ vqmovun.s16 d22, q12
++ vqmovun.s16 d23, q13
++ vld1.8 {q8, q9}, [ip], r2 @ next dst row
++ subs r3, #1 @ flags are consumed by the bne below
++ vst1.8 {q10, q11}, [r0], r2 @ store previous row
++ vmovl.u8 q10, d16
++ pldw [ip] @ prefetch the following row with intent to write
++ vmovl.u8 q11, d17
++ vmovl.u8 q12, d18
++ vmovl.u8 q13, d19
++ vqadd.s16 q10, q0
++ vqadd.s16 q11, q1
++ vqadd.s16 q12, q2
++ vqadd.s16 q13, q3
++ bne 1b
++
++ vqmovun.s16 d20, q10 @ epilogue: narrow and store the final row
++ vqmovun.s16 d21, q11
++ vqmovun.s16 d22, q12
++ vqmovun.s16 d23, q13
++ vst1.8 {q10, q11}, [r0]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_add_residual_4x4_dc_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1
++ add ip, r0, r1 @ ip = dst + stride (odd rows)
++ vdup.16 q15, r2 @ broadcast dc into all eight 16-bit lanes
++ lsl r1, #1 @ r1 = 2 * stride
++ vld1.32 d4[0], [r0], r1 @ row 0 (4 bytes), r0 moves to row 2
++ rsb r3, r1, #0 @ r3 = -2 * stride, used to step the pointers back
++ vld1.32 d4[1], [ip], r1 @ row 1, ip moves to row 3
++ vld1.32 d5[0], [r0], r3 @ row 2, r0 returns to row 0
++ vld1.32 d5[1], [ip], r3 @ row 3, ip returns to row 1
++ vaddw.u8 q0, q15, d4 @ dc + widened pixels (plain add; see the DC exception in the general notes)
++ vaddw.u8 q1, q15, d5
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation back to 8 bits
++ vqmovun.s16 d1, q1
++ vst1.32 d0[0], [r0], r1 @ row 0
++ vst1.32 d0[1], [ip], r1 @ row 1
++ vst1.32 d1[0], [r0] @ row 2
++ vst1.32 d1[1], [ip] @ row 3
++ bx lr
++endfunc
++
++@ ============================================================================
++@ DC Y or C add
++
++@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1
++ mov r3, #4-2 @ rows left after the pair handled in the prologue of the shared code
++ vdup.32 q15, r2 @ replicate the 32-bit dc word, giving alternating 16-bit lanes (presumably U then V - confirm against caller)
++ b 1f @ tail-jump to the first label 1 of the 8x8 dc routine that follows
++endfunc
++
++@ ff_hevc_rpi_add_residual_8x8_dc_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1
++ vdup.16 q15, r2 @ broadcast dc into all eight 16-bit lanes
++ mov r3, #8-2 @ rows left after the pair handled in the prologue
++1: vld1.8 d16, [r0] @ shared entry (4x4 dc_c branches here): dst row 0, 8 bytes
++ add ip, r0, r1 @ ip = row 1 (store pointer for odd rows)
++ push {r4, lr} @ pushed after the shared label so both entry paths save the registers
++ vld1.8 d17, [ip] @ dst row 1
++ add r4, r0, r1, lsl #1 @ r4 = row 2 (load pointer, one row pair ahead)
++ vaddw.u8 q0, q15, d16 @ dc + widened pixels
++ lsl r1, #1 @ r1 = 2 * stride
++ vaddw.u8 q1, q15, d17
++ add lr, ip, r1 @ lr = row 3 (load pointer, one row pair ahead)
++1: @ software pipelined: store the previous pair while loading and adding the next
++ vld1.8 {d16}, [r4], r1 @ next even row
++ vld1.8 {d17}, [lr], r1 @ next odd row
++ subs r3, #2 @ flags are consumed by the bne below
++ vqmovun.s16 d4, q0 @ narrow previous pair to 8 bits with unsigned saturation
++ vqmovun.s16 d5, q1
++ vaddw.u8 q0, q15, d16
++ vaddw.u8 q1, q15, d17
++ vst1.8 {d4}, [r0], r1 @ store previous even row
++ vst1.8 {d5}, [ip], r1 @ store previous odd row
++ bne 1b
++
++ vqmovun.s16 d4, q0 @ epilogue: narrow and store the final pair
++ vqmovun.s16 d5, q1
++ vst1.8 {d4}, [r0]
++ vst1.8 {d5}, [ip]
++ pop {r4, pc}
++endfunc
++
++
++@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1
++ mov r3, #8-1 @ rows left after the one handled in the prologue of the shared code
++ vdup.32 q15, r2 @ replicate the 32-bit dc word, giving alternating 16-bit lanes (presumably U then V - confirm against caller)
++ b 1f @ tail-jump to the first label 1 of the 16x16 dc routine that follows
++endfunc
++
++@ ff_hevc_rpi_add_residual_16x16_dc_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1
++ vdup.16 q15, r2 @ broadcast dc into all eight 16-bit lanes
++ mov r3, #16-1 @ rows left after the one handled in the prologue
++1: vld1.8 {q8}, [r0] @ shared entry (8x8 dc_c branches here): dst row 0, 16 bytes
++ add ip, r0, r1 @ ip = next row (load pointer, one row ahead of r0)
++ vaddw.u8 q0, q15, d16 @ dc + widened low 8 pixels
++ vaddw.u8 q1, q15, d17 @ dc + widened high 8 pixels
++1: @ software pipelined: store the previous row while loading and adding the next
++ vld1.8 {q8}, [ip], r1 @ next dst row
++ subs r3, #1 @ flags are consumed by the bne below
++ vqmovun.s16 d4, q0 @ narrow previous row to 8 bits with unsigned saturation
++ vqmovun.s16 d5, q1
++ vaddw.u8 q0, q15, d16
++ vaddw.u8 q1, q15, d17
++ vst1.8 {q2}, [r0], r1 @ store previous row
++ bne 1b
++
++ vqmovun.s16 d4, q0 @ epilogue: narrow and store the final row
++ vqmovun.s16 d5, q1
++ vst1.8 {q2}, [r0]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1
++ mov r3, #16-1 @ rows left after the one handled in the prologue of the shared code
++ vdup.32 q15, r2 @ replicate the 32-bit dc word, giving alternating 16-bit lanes (presumably U then V - confirm against caller)
++ b 1f @ tail-jump to the first label 1 of the 32x32 dc routine that follows
++endfunc
++
++@ ff_hevc_rpi_add_residual_32x32_dc_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1
++ vdup.16 q15, r2 @ broadcast dc into all eight 16-bit lanes
++ mov r3, #32-1 @ rows left after the one handled in the prologue
++1: vld1.8 {q8, q9}, [r0] @ shared entry (16x16 dc_c branches here): dst row 0, 32 bytes
++ add ip, r0, r1 @ ip = next row (load pointer, one row ahead of r0)
++ vaddw.u8 q0, q15, d16 @ dc + widened pixels 0-7
++ vaddw.u8 q1, q15, d17 @ pixels 8-15
++ vaddw.u8 q2, q15, d18 @ pixels 16-23
++ vaddw.u8 q3, q15, d19 @ pixels 24-31
++1: @ software pipelined: store the previous row while loading and adding the next
++ vqmovun.s16 d20, q0 @ narrow previous row to 8 bits with unsigned saturation
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d22, q2
++ vqmovun.s16 d23, q3
++ vld1.8 {q8, q9}, [ip], r1 @ next dst row
++ subs r3, #1 @ flags are consumed by the bne below
++ vaddw.u8 q0, q15, d16
++ vaddw.u8 q1, q15, d17
++ vaddw.u8 q2, q15, d18
++ vaddw.u8 q3, q15, d19
++ vst1.8 {q10, q11}, [r0], r1 @ store previous row
++ bne 1b
++
++ vqmovun.s16 d20, q0 @ epilogue: narrow and store the final row
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d22, q2
++ vqmovun.s16 d23, q3
++ vst1.8 {q10, q11}, [r0]
++ bx lr
++endfunc
++
++@ ============================================================================
++@ U add
++
++@ add_residual4x4_u(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc_v) [r3]
++
++function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1
++ add ip, r0, r2 @ ip = dst + stride (odd rows)
++ vld1.16 {q0, q1}, [r1] @ all 16 U residuals: q0 = rows 0-1, q1 = rows 2-3
++ lsl r2, #1 @ r2 = 2 * stride
++ vld1.8 {d16}, [r0 :64], r2 @ row 0: 4 interleaved byte pairs
++ vld1.8 {d17}, [ip :64], r2 @ row 1
++ vld1.8 {d18}, [r0 :64] @ row 2
++ sub r0, r2 @ rewind r0 to row 0 for the stores
++ vld1.8 {d19}, [ip :64] @ row 3
++ sub ip, r2 @ rewind ip to row 1
++ vdup.16 q2, r3 @ dc for the odd (V) positions, rows 0-1
++ vdup.16 q3, r3 @ same for rows 2-3
++ vmovl.u8 q10, d16 @ widen pixels to 16 bits
++ vmovl.u8 q11, d17
++ vmovl.u8 q12, d18
++ vmovl.u8 q13, d19
++ vzip.16 q0, q2 @ interleave residual with dc: q0 = row 0, q2 = row 1
++ vzip.16 q1, q3 @ q1 = row 2, q3 = row 3
++ vqadd.s16 q0, q10 @ saturating add per row
++ vqadd.s16 q2, q11
++ vqadd.s16 q1, q12
++ vqadd.s16 q3, q13
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation: d0 = row 0
++ vqmovun.s16 d1, q2 @ d1 = row 1
++ vqmovun.s16 d2, q1 @ d2 = row 2
++ vqmovun.s16 d3, q3 @ d3 = row 3
++ vst1.8 {d0}, [r0 :64], r2
++ vst1.8 {d1}, [ip :64], r2
++ vst1.8 {d2}, [r0 :64]
++ vst1.8 {d3}, [ip :64]
++ bx lr
++endfunc
++
++@ add_residual8x8_u(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc_v) [r3]
++
++function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1
++ vdup.16 q15, r3 @ dc broadcast; added to the odd (V) bytes below
++ add ip, r0, r2 @ ip = row 1 (store pointer for odd rows)
++ push {r4, lr} @ r4 and lr become the look-ahead load pointers
++ vld2.8 {d16, d17}, [r0 :128] @ row 0 de-interleaved: d16 = even (U) bytes, d17 = odd (V) bytes
++ lsl r2, #1 @ r2 = 2 * stride
++ vld2.8 {d18, d19}, [ip :128] @ row 1: d18 = U, d19 = V
++ mov r3, #8-2 @ r3 is now the count of rows left after this prologue pair
++ vld1.16 {q0, q1}, [r1 :256]! @ U residuals for rows 0 and 1
++ add r4, r0, r2 @ r4 = row 2 (load pointer)
++ vmovl.u8 q10, d16 @ widen U of row 0
++ add lr, ip, r2 @ lr = row 3 (load pointer)
++ vmovl.u8 q11, d18 @ widen U of row 1
++ vqadd.s16 q0, q10 @ U += residual (saturating)
++ vaddw.u8 q2, q15, d17 @ V + dc, row 0
++ vqadd.s16 q1, q11
++ vaddw.u8 q3, q15, d19 @ V + dc, row 1
++1: @ software pipelined: store the previous pair while loading and adding the next
++ vqmovun.s16 d20, q0 @ narrow previous U, row A
++ vqmovun.s16 d21, q2 @ narrow previous V, row A
++ vld2.8 {d16, d17}, [r4 :128], r2 @ next even row
++ subs r3, #2 @ flags are consumed by the bne below
++ vqmovun.s16 d22, q1 @ narrow previous U, row B
++ vqmovun.s16 d23, q3 @ narrow previous V, row B
++ vst2.8 {d20, d21}, [r0 :128], r2 @ re-interleave and store previous even row
++ vld2.8 {d18, d19}, [lr :128], r2 @ next odd row
++ vst2.8 {d22, d23}, [ip :128], r2 @ store previous odd row
++ vld1.16 {q0, q1}, [r1 :256]! @ U residuals for the next pair
++ vmovl.u8 q10, d16
++ vmovl.u8 q11, d18
++ vqadd.s16 q0, q10
++ vaddw.u8 q2, q15, d17
++ vqadd.s16 q1, q11
++ vaddw.u8 q3, q15, d19
++ bne 1b
++
++ vqmovun.s16 d20, q0 @ epilogue: narrow and store the final pair
++ vqmovun.s16 d21, q2
++ vqmovun.s16 d22, q1
++ vqmovun.s16 d23, q3
++ vst2.8 {d20, d21}, [r0 :128]
++ vst2.8 {d22, d23}, [ip :128]
++ pop {r4, pc}
++endfunc
++
++@ add_residual16x16_u(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc_v) [r3]
++
++function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1
++ vdup.16 q15, r3 @ dc broadcast; added to the odd (V) bytes below
++ add ip, r0, r2 @ ip = next row (load pointer, one row ahead of r0)
++ vld2.8 {q8, q9}, [r0 :256] @ row 0 de-interleaved: q8 = even (U) bytes, q9 = odd (V) bytes
++ mov r3, #16-1 @ r3 is now the count of rows left after this prologue row
++ vld1.16 {q0, q1}, [r1 :256]! @ 16 U residuals for row 0
++ vmovl.u8 q11, d16 @ widen U low half
++ vmovl.u8 q12, d17 @ widen U high half
++ vqadd.s16 q0, q11 @ U += residual (saturating)
++ vaddw.u8 q11, q15, d18 @ q11 reused: V low half + dc
++ vqadd.s16 q1, q12
++ vaddw.u8 q12, q15, d19 @ q12 reused: V high half + dc
++1: @ software pipelined: store the previous row while loading and adding the next
++ vld2.8 {q8, q9}, [ip :256], r2 @ next dst row
++ subs r3, #1 @ flags are consumed by the bne below
++ vqmovun.s16 d20, q0 @ q10 low = U low
++ vqmovun.s16 d22, q11 @ q11 low = V low (reads q11 before its low half is overwritten)
++ vqmovun.s16 d21, q1 @ q10 high = U high
++ vqmovun.s16 d23, q12 @ q11 high = V high
++ vld1.16 {q0, q1}, [r1 :256]! @ U residuals for the next row
++ vst2.8 {q10, q11}, [r0 :256], r2 @ re-interleave and store previous row
++ vmovl.u8 q11, d16
++ pldw [ip] @ prefetch the following row with intent to write
++ vmovl.u8 q12, d17
++ vqadd.s16 q0, q11
++ vaddw.u8 q11, q15, d18
++ vqadd.s16 q1, q12
++ vaddw.u8 q12, q15, d19
++ bne 1b
++
++ vqmovun.s16 d20, q0 @ epilogue: narrow and store the final row
++ vqmovun.s16 d22, q11
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d23, q12
++ vst2.8 {q10, q11}, [r0 :256]
++ bx lr
++endfunc
++
++@ ============================================================================
++@ V add
++
++@ add_residual4x4_v(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2] (and int dc_u in [r3])
++
++function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1
++ add ip, r0, r2 @ ip = dst + stride (odd rows)
++ vld1.16 {q2, q3}, [r1] @ all 16 V residuals: q2 = rows 0-1, q3 = rows 2-3
++ lsl r2, #1 @ r2 = 2 * stride
++ vld1.8 {d16}, [r0 :64], r2 @ row 0: 4 interleaved byte pairs
++ vld1.8 {d17}, [ip :64], r2 @ row 1
++ vld1.8 {d18}, [r0 :64] @ row 2
++ sub r0, r2 @ rewind r0 to row 0 for the stores
++ vld1.8 {d19}, [ip :64] @ row 3
++ sub ip, r2 @ rewind ip to row 1
++ vdup.16 q0, r3 @ r3 value for the even (U) positions, rows 0-1
++ vdup.16 q1, r3 @ same for rows 2-3
++ vmovl.u8 q10, d16 @ widen pixels to 16 bits
++ vmovl.u8 q11, d17
++ vmovl.u8 q12, d18
++ vmovl.u8 q13, d19
++ vzip.16 q0, q2 @ interleave dc with residual: q0 = row 0, q2 = row 1
++ vzip.16 q1, q3 @ q1 = row 2, q3 = row 3
++ vqadd.s16 q0, q10 @ saturating add per row
++ vqadd.s16 q2, q11
++ vqadd.s16 q1, q12
++ vqadd.s16 q3, q13
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation: d0 = row 0
++ vqmovun.s16 d1, q2 @ d1 = row 1
++ vqmovun.s16 d2, q1 @ d2 = row 2
++ vqmovun.s16 d3, q3 @ d3 = row 3
++ vst1.8 {d0}, [r0 :64], r2
++ vst1.8 {d1}, [ip :64], r2
++ vst1.8 {d2}, [r0 :64]
++ vst1.8 {d3}, [ip :64]
++ bx lr
++endfunc
++
++@ add_residual8x8_v(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2] (and int dc_u in [r3])
++
++function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1
++ vdup.16 q15, r3 @ r3 value broadcast; added to the even (U) bytes below
++ add ip, r0, r2 @ ip = row 1 (store pointer for odd rows)
++ push {r4, lr} @ r4 and lr become the look-ahead load pointers
++ vld2.8 {d16, d17}, [r0 :128] @ row 0 de-interleaved: d16 = even (U) bytes, d17 = odd (V) bytes
++ lsl r2, #1 @ r2 = 2 * stride
++ vld2.8 {d18, d19}, [ip :128] @ row 1: d18 = U, d19 = V
++ mov r3, #8-2 @ r3 is now the count of rows left after this prologue pair
++ vld1.16 {q0, q1}, [r1 :256]! @ V residuals for rows 0 and 1
++ add r4, r0, r2 @ r4 = row 2 (load pointer)
++ vmovl.u8 q10, d17 @ widen V of row 0
++ add lr, ip, r2 @ lr = row 3 (load pointer)
++ vmovl.u8 q11, d19 @ widen V of row 1
++ vqadd.s16 q0, q10 @ V += residual (saturating)
++ vaddw.u8 q2, q15, d16 @ U + dc, row 0
++ vqadd.s16 q1, q11
++ vaddw.u8 q3, q15, d18 @ U + dc, row 1
++1: @ software pipelined: store the previous pair while loading and adding the next
++ vqmovun.s16 d20, q2 @ narrow previous U, row A
++ vqmovun.s16 d21, q0 @ narrow previous V, row A
++ vld2.8 {d16, d17}, [r4 :128], r2 @ next even row
++ subs r3, #2 @ flags are consumed by the bne below
++ vqmovun.s16 d22, q3 @ narrow previous U, row B
++ vqmovun.s16 d23, q1 @ narrow previous V, row B
++ vst2.8 {d20, d21}, [r0 :128], r2 @ re-interleave and store previous even row
++ vld2.8 {d18, d19}, [lr :128], r2 @ next odd row
++ vst2.8 {d22, d23}, [ip :128], r2 @ store previous odd row
++ vld1.16 {q0, q1}, [r1 :256]! @ V residuals for the next pair
++ vmovl.u8 q10, d17
++ vmovl.u8 q11, d19
++ vqadd.s16 q0, q10
++ vaddw.u8 q2, q15, d16
++ vqadd.s16 q1, q11
++ vaddw.u8 q3, q15, d18
++ bne 1b
++
++ vqmovun.s16 d20, q2 @ epilogue: narrow and store the final pair
++ vqmovun.s16 d21, q0
++ vqmovun.s16 d22, q3
++ vqmovun.s16 d23, q1
++ vst2.8 {d20, d21}, [r0 :128]
++ vst2.8 {d22, d23}, [ip :128]
++ pop {r4, pc}
++endfunc
++
++@ add_residual16x16_v(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2] (and int dc_u in [r3])
++
++function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1
++ vdup.16 q15, r3 @ r3 value broadcast; added to the even (U) bytes below
++ add ip, r0, r2 @ ip = next row (load pointer, one row ahead of r0)
++ vld2.8 {q8, q9}, [r0 :256] @ row 0 de-interleaved: q8 = even (U) bytes, q9 = odd (V) bytes
++ mov r3, #16-1 @ r3 is now the count of rows left after this prologue row
++ vld1.16 {q0, q1}, [r1 :256]! @ 16 V residuals for row 0
++ vmovl.u8 q11, d18 @ widen V low half
++ vmovl.u8 q12, d19 @ widen V high half
++ vqadd.s16 q0, q11 @ V += residual (saturating)
++ vaddw.u8 q11, q15, d16 @ q11 reused: U low half + dc
++ vqadd.s16 q1, q12
++ vaddw.u8 q12, q15, d17 @ q12 reused: U high half + dc
++1: @ software pipelined: store the previous row while loading and adding the next
++ vld2.8 {q8, q9}, [ip :256], r2 @ next dst row
++ subs r3, #1 @ flags are consumed by the bne below
++ vqmovun.s16 d20, q11 @ q10 low = U low (must read q11 before d22 is written)
++ vqmovun.s16 d22, q0 @ q11 low = V low
++ vqmovun.s16 d21, q12 @ q10 high = U high
++ vqmovun.s16 d23, q1 @ q11 high = V high
++ vld1.16 {q0, q1}, [r1 :256]! @ V residuals for the next row
++ vst2.8 {q10, q11}, [r0 :256], r2 @ re-interleave and store previous row
++ vmovl.u8 q11, d18
++ pldw [ip] @ prefetch the following row with intent to write
++ vmovl.u8 q12, d19
++ vqadd.s16 q0, q11
++ vaddw.u8 q11, q15, d16
++ vqadd.s16 q1, q12
++ vaddw.u8 q12, q15, d17
++ bne 1b
++
++ vqmovun.s16 d20, q11 @ epilogue: narrow and store the final row
++ vqmovun.s16 d22, q0
++ vqmovun.s16 d21, q12
++ vqmovun.s16 d23, q1
++ vst2.8 {q10, q11}, [r0 :256]
++ bx lr
++endfunc
++
++@ ============================================================================
++@ U & V add
++
++@ add_residual4x4_c(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1
++ add ip, r0, r2 @ ip = dst + stride (odd rows)
++ vld1.16 {q0, q1}, [r1]! @ all of U
++ lsl r2, #1 @ r2 = 2 * stride
++ vld1.8 {d16}, [r0 :64], r2 @ row 0: 4 interleaved byte pairs, r0 moves to row 2
++ rsb r3, r2, #0 @ r3 = -2 * stride, used to step the pointers back
++ vld1.8 {d17}, [ip :64], r2 @ row 1, ip moves to row 3
++ vld1.16 {q2, q3}, [r1] @ all of V
++ vld1.8 {d18}, [r0 :64], r3 @ row 2, r0 returns to row 0
++ vld1.8 {d19}, [ip :64], r3 @ row 3, ip returns to row 1
++ vmovl.u8 q10, d16 @ widen pixels to 16 bits
++ vmovl.u8 q11, d17
++ vmovl.u8 q12, d18
++ vmovl.u8 q13, d19
++ vzip.16 q0, q2 @ interleave U and V residuals: q0 = row 0, q2 = row 1
++ vzip.16 q1, q3 @ q1 = row 2, q3 = row 3
++ vqadd.s16 q0, q10 @ saturating add per row
++ vqadd.s16 q2, q11
++ vqadd.s16 q1, q12
++ vqadd.s16 q3, q13
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation: d0 = row 0
++ vqmovun.s16 d1, q2 @ d1 = row 1
++ vqmovun.s16 d2, q1 @ d2 = row 2
++ vqmovun.s16 d3, q3 @ d3 = row 3
++ vst1.8 {d0}, [r0 :64], r2
++ vst1.8 {d1}, [ip :64], r2
++ vst1.8 {d2}, [r0 :64]
++ vst1.8 {d3}, [ip :64]
++ bx lr
++endfunc
++
++@ add_residual8x8_c(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1
++ vld2.8 {d16, d17}, [r0 :128] @ row 0 de-interleaved: d16 = even (U) bytes, d17 = odd (V) bytes
++ add r3, r1, #(8*8*2) @ Offset to V
++ vld1.16 {q0}, [r1 :128]! @ U residuals for row 0
++ add ip, r0, r2 @ ip = next row (load pointer, one row ahead of r0)
++ vld1.16 {q1}, [r3 :128]! @ V residuals for row 0
++ vmovl.u8 q10, d16 @ widen U
++ push {lr} @ lr is reused below as the row counter
++ vmovl.u8 q8, d17 @ widen V (q8 is overwritten after d17 is read)
++ mov lr, #8-1 @ rows left after the one handled in this prologue
++ vqadd.s16 q10, q0 @ U result accumulates in q10
++ vqadd.s16 q1, q8 @ V result accumulates in q1
++1: @ software pipelined: store the previous row while loading and adding the next
++ vld2.8 {d16, d17}, [ip :128], r2 @ next dst row
++ subs lr, #1 @ flags are consumed by the bne below
++ vld1.16 {q0}, [r1 :128]! @ next U residuals
++ vqmovun.s16 d20, q10 @ narrow previous U
++ vqmovun.s16 d21, q1 @ narrow previous V before q1 is reloaded
++ vld1.16 {q1}, [r3 :128]! @ next V residuals
++ vst2.8 {d20, d21}, [r0 :128], r2 @ re-interleave and store previous row
++ vmovl.u8 q10, d16
++ pldw [ip] @ prefetch the following row with intent to write
++ vmovl.u8 q8, d17
++ vqadd.s16 q10, q0
++ vqadd.s16 q1, q8
++ bne 1b
++
++ vqmovun.s16 d20, q10 @ epilogue: narrow and store the final row
++ vqmovun.s16 d21, q1
++ vst2.8 {d20, d21}, [r0 :128]
++ pop {pc}
++endfunc
++
++@ add_residual16x16_c(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1
++ vld2.8 {q8, q9}, [r0 :256] @ row 0 de-interleaved: q8 = even (U) bytes, q9 = odd (V) bytes
++ add r3, r1, #(16*16*2) @ Offset to V
++ vld1.16 {q0, q1}, [r1 :256]! @ 16 U residuals for row 0
++ add ip, r0, r2 @ ip = next row (load pointer, one row ahead of r0)
++ vld1.16 {q2, q3}, [r3 :256]! @ 16 V residuals for row 0
++ vmovl.u8 q10, d16 @ widen U low half
++ push {lr} @ lr is reused below as the row counter
++ vmovl.u8 q8, d17 @ widen U high half (q8 is overwritten after d17 is read)
++ mov lr, #16-1 @ rows left after the one handled in this prologue
++ vmovl.u8 q11, d18 @ widen V low half
++ vmovl.u8 q9, d19 @ widen V high half (q9 is overwritten after d19 is read)
++ vqadd.s16 q0, q10 @ U low
++ vqadd.s16 q1, q8 @ U high
++ vqadd.s16 q2, q11 @ V low
++ vqadd.s16 q3, q9 @ V high
++1: @ software pipelined: store the previous row while loading and adding the next
++ vld2.8 {q8, q9}, [ip :256], r2 @ next dst row
++ subs lr, #1 @ flags are consumed by the bne below
++ vqmovun.s16 d20, q0 @ q10 low = U low
++ vqmovun.s16 d22, q2 @ q11 low = V low
++ vqmovun.s16 d21, q1 @ q10 high = U high
++ vqmovun.s16 d23, q3 @ q11 high = V high
++ vld1.16 {q0, q1}, [r1 :256]! @ next U residuals
++ vst2.8 {d20-d23}, [r0 :256], r2 @ re-interleave and store previous row
++ vld1.16 {q2, q3}, [r3 :256]! @ next V residuals
++ vmovl.u8 q10, d16
++ pldw [ip] @ prefetch the following row with intent to write
++ vmovl.u8 q8, d17
++ vmovl.u8 q11, d18
++ vmovl.u8 q9, d19
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q8
++ vqadd.s16 q2, q11
++ vqadd.s16 q3, q9
++ bne 1b
++
++ vqmovun.s16 d20, q0 @ epilogue: narrow and store the final row
++ vqmovun.s16 d22, q2
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d23, q3
++ vst2.8 {d20-d23}, [r0 :256]
++ pop {pc}
++endfunc
++
++@ 32x32 chroma never occurs so NIF
++
++@ ============================================================================
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
+@@ -0,0 +1,2245 @@
++/*
++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * 2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.set EDGE_SRC_STRIDE, 160
++
++@ PIC jump tables are fractionally more expensive than absolute in our code
++.set jent_pic, CONFIG_PIC
++
++
++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4
++ vshr.u8 q12, q8, #3 @ table index = pixel >> 3 for the 16 bytes in q8
++ \I1
++ vadd.i8 q8, \Q_K128 @ bias pixels by Q_K128 so the signed saturating add below can be used
++ \I2
++ vshr.u8 q13, q9, #3 @ table index for the 16 bytes in q9
++ \I3
++ vadd.i8 q9, \Q_K128
++ \I4
++ vtbl.8 d24, \XLAT0, d24 @ look up the per-band offset for each index
++ vtbl.8 d25, \XLAT0, d25
++ vtbl.8 d26, \XLAT1, d26
++ vtbl.8 d27, \XLAT1, d27
++
++ vqadd.s8 q8, q12 @ biased pixel + offset with signed saturation
++ vshr.u8 q12, q10, #3 @ same sequence for q10 and q11
++ vadd.i8 q10, \Q_K128
++ vqadd.s8 q9, q13
++ vshr.u8 q13, q11, #3
++ vadd.i8 q11, \Q_K128
++
++ vtbl.8 d24, \XLAT0, d24
++ vtbl.8 d25, \XLAT0, d25
++ vtbl.8 d26, \XLAT1, d26
++ vtbl.8 d27, \XLAT1, d27
++ vqadd.s8 q10, q12
++ vsub.i8 q8, \Q_K128 @ remove the bias again; results stay in q8-q11
++ vqadd.s8 q11, q13
++ vsub.i8 q9, \Q_K128
++ vsub.i8 q10, \Q_K128
++ vsub.i8 q11, \Q_K128
++.endm
++
++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4
++ \L1
++ \L2
++ \L3
++ \L4
++ \L5
++ vadd.i8 q12, q8, \Q_K128 @ q12 = input bytes from q8 biased by Q_K128
++ vshr.u8 q8, #3 @ table index = pixel >> 3
++ vtbl.8 d16, \XLAT0, d16 @ offsets for the low 8 bytes
++ vtbl.8 d17, \XLAT1, d17 @ offsets for the high 8 bytes
++ vqadd.s8 q12, q8 @ biased pixel + offset with signed saturation
++ bmi 2f @ no flag-setting instruction in this macro; flags presumably come from an L slot - confirm at call sites
++1: \L1
++ \L2
++ \L3
++ \L4
++ \L5
++ vsub.i8 q13, q12, \Q_K128 @ q13 = previous result with the bias removed
++ vadd.i8 q12, q8, \Q_K128 @ start the next group
++ vshr.u8 q8, #3
++ \S1
++ \S2
++ \S3
++ \S4
++ vtbl.8 d16, \XLAT0, d16
++ vtbl.8 d17, \XLAT1, d17
++ vqadd.s8 q12, q8
++ bpl 1b
++2: vsub.i8 q13, q12, \Q_K128 @ final result with the bias removed, left in q13 for the S slots
++ \S1
++ \S2
++ \S3
++ \S4
++.endm
++
++
++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
++ vmax.s16 \Q0, \Q_MIN @ raise each signed 16-bit lane to at least Q_MIN
++ vmax.s16 \Q1, \Q_MIN
++ vmax.s16 \Q2, \Q_MIN
++ vmax.s16 \Q3, \Q_MIN
++ vmin.s16 \Q0, \Q_MAX @ then limit each lane to at most Q_MAX
++ vmin.s16 \Q1, \Q_MAX
++ vmin.s16 \Q2, \Q_MAX
++ vmin.s16 \Q3, \Q_MAX
++.endm
++
++@ Clobbers q12, q13
++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2
++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) @ table index = sample >> (bit_depth - 5), narrowed to bytes
++ vshrn.i16 d25, \Q1, #(\bit_depth - 5)
++ vshrn.i16 d26, \Q2, #(\bit_depth - 5)
++ \I1
++ vtbl.8 d24, \XLAT0, d24 @ look up the offset; Q0 and Q2 use XLAT0, Q1 and Q3 use XLAT1
++ vshrn.i16 d27, \Q3, #(\bit_depth - 5)
++ vtbl.8 d25, \XLAT1, d25
++ \I2
++ vtbl.8 d26, \XLAT0, d26
++ vtbl.8 d27, \XLAT1, d27
++ vaddw.s8 \Q0, d24 @ add the sign-extended 8-bit offset to the 16-bit samples
++ vaddw.s8 \Q1, d25
++ vaddw.s8 \Q2, d26
++ vaddw.s8 \Q3, d27
++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX
++.endm
++
++@ Clobbers q10, q11, q12
++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4
++ \L1
++ \L2
++ \L3
++ \L4
++ \L5
++ vshrn.i16 d24, \Q0, #\bit_depth - 5 @ table index = sample >> (bit_depth - 5), narrowed to bytes
++ vshrn.i16 d25, \Q1, #\bit_depth - 5
++ vtbl.8 d24, \XLAT0, d24 @ look up the offsets
++ vtbl.8 d25, \XLAT1, d25
++ vaddw.s8 q10, \Q0, d24 @ q10, q11 = samples + sign-extended offsets
++ vaddw.s8 q11, \Q1, d25
++ bmi 2f @ no flag-setting instruction in this macro; flags presumably come from an L slot - confirm at call sites
++1: \L1
++ \L2
++ \L3
++ \L4
++ \L5
++ vmax.s16 q10, \Q_MIN @ clip the previous result while the next group is prepared
++ vmax.s16 q11, \Q_MIN
++ vshrn.i16 d24, \Q0, #\bit_depth - 5
++ vshrn.i16 d25, \Q1, #\bit_depth - 5
++ vmin.s16 q10, \Q_MAX
++ vmin.s16 q11, \Q_MAX
++ \S1
++ \S2
++ \S3
++ \S4
++ vtbl.8 d24, \XLAT0, d24
++ vtbl.8 d25, \XLAT1, d25
++ vaddw.s8 q10, \Q0, d24
++ vaddw.s8 q11, \Q1, d25
++ bpl 1b
++2: vmax.s16 q10, \Q_MIN @ clip the final result, left in q10 and q11 for the S slots
++ vmax.s16 q11, \Q_MIN
++ vmin.s16 q10, \Q_MAX
++ vmin.s16 q11, \Q_MAX
++ \S1
++ \S2
++ \S3
++ \S4
++.endm
++
++
++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38)
++@ so we are quite safe stuffing it into a byte array
++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma
++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of
++@ precision
++
++@ This, somewhat nasty, bit of code builds the {d0-d3} translation
++@ array via the stack
++@ Given that sao_left_class > 28 can cause wrap we can't just poke
++@ all 4 bytes in at once
++@
++@ It also loads other common regs
++
++@ Beware that the offset read here over-reads by 6 bytes so source must be sized appropriately
++function band_load_y
++ ldr ip, [sp, #16] @ &sao_offset_val[0]
++ ldr r4, [sp, #20] @ sao_left_class
++ vmov.i64 d4, #0 @ d0-d4 form a zeroed 40-byte array
++ vmov.i64 q0, #0
++ pld [r1]
++ vld2.8 {q8}, [ip] @ de-interleave 16 bytes: d16 = even bytes, d17 = odd bytes
++ sub ip, sp, #8*5 @ ip = address the array will occupy after the vpush
++ vmov.i64 q1, #0
++ add r4, ip, r4 @ r4 = array base + sao_left_class
++ vpush {d0-d4} @ Put zero array on stack
++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1]
++ ldr ip, [ip, #8*5 + 28] @ height
++ vst1.32 {d16[0]}, [r4] @ poke 4 offset bytes into the array at the left-class position
++ add r4, r1, r3 @ r4 = r1 + r3
++ vpop {d0-d4} @ Pop modified array
++ sub ip, ip, #1 @ ip = height - 1
++ vorr d0, d0, d4 @ fold array bytes 32-39 back onto bytes 0-7 (the wrap case)
++ bx lr
++endfunc
++
++@ Beware that offset reads here over-read by 6 bytes so source must be sized appropriately
++function band_load_c
++ ldr ip, [sp, #16] @ &sao_offset_val1[0]
++ ldr r4, [sp, #20] @ sao_left_class1
++ vmov.i64 d24, #0 @ d20-d24 form a zeroed 40-byte array
++ vmov.i64 q10, #0
++ pld [r1]
++ vld2.8 {q8}, [ip] @ de-interleave 16 bytes: d16 = even bytes, d17 = odd bytes
++ sub ip, sp, #8*5 @ ip = address the array will occupy after the vpush
++ vmov.i64 q11, #0
++ add r4, ip, r4 @ r4 = array base + sao_left_class1
++ ldr ip, [sp, #24] @ &sao_offset_val2[0]
++ vpush {d20-d24} @ Put zero array on stack
++ vld2.8 {q9}, [ip] @ same de-interleaving load for the second offset array
++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1]
++ ldr ip, [sp, #8*5 + 28] @ sao_left_class2
++ vst1.32 {d16[0]}, [r4] @ poke 4 offset bytes into the array at the class1 position
++ add ip, sp, ip @ ip = array base + sao_left_class2
++ vshr.u64 d18, d18, #8 @ 1st interesting val is [1]
++ vldmia sp, {d0-d3} @ Load modified array
++ vldr d16, [sp, #8*4] @ array bytes 32-39 (the wrap part)
++ add r4, r1, r3 @ r4 = r1 + r3
++ vstmia sp, {d20-d24} @ Put zero array on stack (again)
++ vst1.32 {d18[0]}, [ip] @ poke the second set of 4 offset bytes at the class2 position
++ vorr d0, d0, d16 @ fold the wrap part of the first table onto bytes 0-7
++ vldmia sp, {d4-d7} @ Load modified array
++ vldr d18, [sp, #8*4] @ wrap part of the second table
++ ldr ip, [sp, #8*5 + 36] @ height
++ add sp, sp, #8*5 @ release the 40-byte scratch array
++ vorr d4, d4, d18 @ fold the wrap part of the second table onto bytes 0-7
++ sub ip, ip, #1 @ ip = height - 1
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_64_neon_8 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_band_64_neon_8, export=1 @ 8-bit luma band offset, one 64-byte row per loop pass
++ push {r4-r6, lr}
++ vmov.u8 q15, #128
++ bl band_load_y @ Returns table in d0-d3, ip = height - 1, r4 = src + stride_src
++
++1: vldmia r1, {q8-q11} @ Load one 64-byte source row
++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \
++ "pld [r4]", \
++ "subs ip, #1", \
++ "it ne; addne r4, r3", \
++ "add r1, r3"
++ vstmia r0, {q8-q11} @ Store the processed row
++ add r0, r2
++ bpl 1b @ Loop while the row counter decremented via the macro argument is still >= 0
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_32_neon_8 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_band_32_neon_8, export=1 @ 8-bit luma band offset, two 32-byte rows per loop pass
++ push {r4-r6, lr}
++ add r5, r0, r2 @ r5 = dst + stride_dst (second row of each pair)
++ add r6, r1, r3 @ r6 = src + stride_src (second row of each pair)
++ lsl r2, #1 @ Both strides doubled: every pointer steps two rows
++ lsl r3, #1
++ vmov.u8 q15, #128
++ bl band_load_y @ Returns table in d0-d3, ip = height - 1
++
++1: vld1.8 { q8, q9 }, [r1, :128], r3
++ subs ip, #2 @ Two rows consumed; NEON ops below leave the flags for bpl
++ vld1.8 {q10, q11}, [r6, :128], r3
++
++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15
++
++ vst1.8 { q8, q9 }, [r0, :128], r2
++ vst1.8 {q10, q11}, [r5, :128], r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_16_neon_8 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_band_16_neon_8, export=1 @ 8-bit luma band offset, four 16-byte rows per loop pass
++ push {r4-r6, lr}
++ add r5, r0, r2 @ r5 = dst + stride_dst (odd rows)
++ add r6, r1, r3 @ r6 = src + stride_src (odd rows)
++ lsl r2, #1 @ Both strides doubled: r0/r1 walk even rows, r5/r6 odd rows
++ lsl r3, #1
++ vmov.u8 q15, #128
++ bl band_load_y @ Returns table in d0-d3, ip = height - 1
++
++1: vld1.8 { q8}, [r1, :128], r3
++ subs ip, #4 @ Four rows consumed; NEON ops below leave the flags for bpl
++ vld1.8 { q9}, [r6, :128], r3
++ vld1.8 {q10}, [r1, :128], r3
++ vld1.8 {q11}, [r6, :128], r3
++
++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15
++
++ vst1.8 { q8}, [r0, :128], r2
++ vst1.8 { q9}, [r5, :128], r2
++ vst1.8 {q10}, [r0, :128], r2
++ vst1.8 {q11}, [r5, :128], r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_8_neon_8 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_band_8_neon_8, export=1 @ 8-bit luma band offset for width 8, with a separate path for width < 8
++ ldr ip, [sp, #8] @ width
++ push {r4-r6, lr}
++ vmov.u8 q15, #128
++ cmp ip, #8 @ Flags are consumed by the blt below; nothing in between sets flags
++ bl band_load_y @ Returns table in d0-d3, ip = height - 1
++ add r5, r0, r2 @ r5 = dst + stride_dst (odd rows)
++ add r6, r1, r3 @ r6 = src + stride_src (odd rows)
++ lsl r2, #1
++ lsl r3, #1
++ blt 4f @ width < 8: use the 4-byte-per-row path
++
++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
++ "vld1.8 {d16}, [r1, :64], r3", \
++ "subs ip, #2", \
++ "vld1.8 {d17}, [r6, :64], r3", \
++ "", \
++ "", \
++ "vst1.8 {d26}, [r0, :64], r2", \
++ "vst1.8 {d27}, [r5, :64], r2"
++ pop {r4-r6, pc} @ NOTE(review): the row loop is presumably inside sao_band_16b_8 - confirm
++4:
++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
++ "vld1.32 {d16[0]}, [r1, :32], r3", \
++ "subs ip, #4", \
++ "vld1.32 {d16[1]}, [r6, :32], r3", \
++ "vld1.32 {d17[0]}, [r1, :32], r3", \
++ "vld1.32 {d17[1]}, [r6, :32], r3", \
++ "vst1.32 {d26[0]}, [r0, :32], r2", \
++ "vst1.32 {d26[1]}, [r5, :32], r2", \
++ "vst1.32 {d27[0]}, [r0, :32], r2", \
++ "vst1.32 {d27[1]}, [r5, :32], r2"
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_c_32_neon_8(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++
++function ff_hevc_rpi_sao_band_c_32_neon_8, export=1 @ 8-bit interleaved chroma band offset, one 64-byte row per loop pass
++ push {r4-r6, lr}
++ add r5, r0, #32 @ r5 = second 32-byte half of the dst row
++ add r6, r1, #32 @ r6 = second 32-byte half of the src row
++ vmov.u8 q15, #128
++ bl band_load_c @ Returns tables in d0-d3 / d4-d7, ip = height - 1, r4 = src + stride
++
++1: vld2.8 { q8, q9 }, [r1, :128], r3 @ De-interleave: even bytes -> q8, odd bytes -> q9
++ subs ip, #1
++ vld2.8 {q10, q11}, [r6, :128], r3
++
++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \
++ "pld [r4]", \
++ "it ne; addne r4, r3"
++
++ vst2.8 { q8, q9 }, [r0, :128], r2 @ Re-interleave on store
++ vst2.8 {q10, q11}, [r5, :128], r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_c_16_neon_8(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++
++function ff_hevc_rpi_sao_band_c_16_neon_8, export=1 @ 8-bit interleaved chroma band offset, two 32-byte rows per loop pass
++ push {r4-r6, lr}
++ add r5, r0, r2 @ r5 = dst + stride (second row of each pair)
++ add r6, r1, r3 @ r6 = src + stride (second row of each pair)
++ lsl r2, #1 @ Both strides doubled: every pointer steps two rows
++ lsl r3, #1
++ vmov.u8 q15, #128
++ bl band_load_c @ Returns tables in d0-d3 / d4-d7, ip = height - 1
++
++1: vld2.8 { q8, q9 }, [r1, :128], r3 @ De-interleave: even bytes -> q8, odd bytes -> q9
++ subs ip, #2
++ vld2.8 {q10, q11}, [r6, :128], r3
++
++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15
++
++ vst2.8 { q8, q9 }, [r0, :128], r2 @ Re-interleave on store
++ vst2.8 {q10, q11}, [r5, :128], r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_c_8_neon_8(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++
++function ff_hevc_rpi_sao_band_c_8_neon_8, export=1 @ 8-bit interleaved chroma band offset for width 8, with a separate path for width < 8
++ ldr ip, [sp, #16] @ width
++ push {r4-r6, lr}
++ vmov.u8 q15, #128
++ cmp ip, #8 @ Flags are consumed by the blt below; band_load_c sets no flags
++ bl band_load_c @ Returns tables in d0-d3 / d4-d7, ip = height - 1
++ blt 4f @ width < 8: 8 bytes per row, two rows per pass
++
++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
++ "vld2.8 {d16-d17}, [r1, :128], r3", \
++ "subs ip, #1", \
++ "", \
++ "", \
++ "", \
++ "vst2.8 {d26-d27}, [r0, :128], r2"
++ pop {r4-r6, pc} @ NOTE(review): the row loop is presumably inside sao_band_16b_8 - confirm
++4:
++ add r5, r0, r2 @ r5 = dst + stride (odd rows)
++ add r6, r1, r3 @ r6 = src + stride (odd rows)
++ lsl r2, #1
++ lsl r3, #1
++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
++ "vld1.8 {d16}, [r1, :64], r3", \
++ "subs ip, #2", \
++ "vld1.8 {d17}, [r6, :64], r3", \
++ "vuzp.8 d16, d17", \
++ "", \
++ "vzip.8 d26, d27", \
++ "vst1.8 {d26}, [r0, :64], r2", \
++ "vst1.8 {d27}, [r5, :64], r2"
++ pop {r4-r6, pc}
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_64_neon_10 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++.macro band_64_16 bit_depth @ >8-bit luma band offset body: one 128-byte row (64 pels) per loop pass
++ push {r4-r6, lr}
++ vmov.i16 q3, #(1 << \bit_depth) - 1 @ q3 = upper clamp; band_load_y does not touch d6/d7
++ bl band_load_y @ Returns table in d0-d3, ip = height - 1; clobbers d4
++ vmov.i64 q2, #0 @ q2 = lower clamp; must be set after band_load_y, which pops table bytes 32-39 into d4
++ vpush {q4-q7} @ d8-d15 are callee-saved and are used as pel registers below
++
++1: vldm r1, {q4-q11} @ Load one 128-byte source row
++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
++ "subs ip, #1", \
++ "add r1, r3"
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth
++ vstm r0, {q4-q11} @ Store the processed row
++ add r0, r2
++ bpl 1b
++
++ vpop {q4-q7}
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_64_neon_10, export=1 @ 10-bit instantiation of band_64_16
++ band_64_16 10
++endfunc
++
++@ ff_hevc_rpi_sao_band_32_neon_10 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++.macro band_32_16 bit_depth @ >8-bit luma band offset body: one 64-byte row (32 pels) per loop pass
++ push {r4-r6, lr}
++ vmov.i16 q3, #(1 << \bit_depth) - 1 @ q3 = upper clamp; band_load_y does not touch d6/d7
++ bl band_load_y @ Returns table in d0-d3, ip = height - 1; clobbers d4
++ vmov.i64 q2, #0 @ q2 = lower clamp; must be set after band_load_y, which pops table bytes 32-39 into d4
++
++1: vldm r1, {q8-q11} @ Load one 64-byte source row
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
++ "subs ip, #1", \
++ "add r1, r3"
++ vstm r0, {q8-q11} @ Store the processed row
++ add r0, r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_32_neon_10, export=1 @ 10-bit instantiation of band_32_16
++ band_32_16 10
++endfunc
++
++@ ff_hevc_rpi_sao_band_16_neon_10 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++.macro band_16_16 bit_depth @ >8-bit luma band offset body: two 32-byte rows (16 pels each) per loop pass
++ push {r4-r6, lr}
++ add r5, r0, r2 @ r5 = dst + stride (second row of each pair)
++ add r6, r1, r3 @ r6 = src + stride (second row of each pair)
++ lsl r2, #1 @ Both strides doubled: every pointer steps two rows
++ lsl r3, #1
++ vmov.i64 q14, #0 @ q14 = 0, q15 = (1 << bit_depth) - 1: clamp constants handed to the macro
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_y @ Returns table in d0-d3, ip (r12) = height - 1
++
++1: vld1.16 { q8, q9 }, [r1, :128], r3
++ subs r12, #2 @ r12 is ip: two rows consumed
++ vld1.16 {q10, q11}, [r6, :128], r3
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth
++ vst1.16 { q8, q9 }, [r0, :128], r2
++ vst1.16 {q10, q11}, [r5, :128], r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_16_neon_10, export=1 @ 10-bit instantiation of band_16_16
++ band_16_16 10
++endfunc
++
++@ ff_hevc_rpi_sao_band_8_neon_10 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++.macro band_8_16 bit_depth @ >8-bit luma band offset body for width 8, with a separate path for width < 8
++ ldr ip, [sp, #8] @ width
++ push {r4-r6, lr}
++ vmov.i64 q14, #0 @ q14 = 0, q15 = (1 << bit_depth) - 1: clamp constants handed to the macro
++ cmp ip, #8 @ Flags are consumed by the blt below; nothing in between sets flags
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_y @ Returns table in d0-d3, ip = height - 1
++ add r5, r0, r2 @ r5 = dst + stride (odd rows)
++ add r6, r1, r3 @ r6 = src + stride (odd rows)
++ lsl r2, #1
++ lsl r3, #1
++ blt 4f @ width < 8: 8 bytes per row, four rows per pass
++
++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
++ "vld1.16 {q8}, [r1, :128], r3", \
++ "subs ip, #2", \
++ "vld1.16 {q9}, [r6, :128], r3", \
++ "", \
++ "", \
++ "vst1.16 {q10}, [r0, :128], r2", \
++ "vst1.16 {q11}, [r5, :128], r2"
++ pop {r4-r6, pc} @ NOTE(review): the row loop is presumably inside sao_band_32b_16 - confirm
++4:
++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
++ "vld1.16 {d16}, [r1, :64], r3", \
++ "subs ip, #4", \
++ "vld1.16 {d17}, [r6, :64], r3", \
++ "vld1.16 {d18}, [r1, :64], r3", \
++ "vld1.16 {d19}, [r6, :64], r3", \
++ "vst1.16 {d20}, [r0, :64], r2", \
++ "vst1.16 {d21}, [r5, :64], r2", \
++ "vst1.16 {d22}, [r0, :64], r2", \
++ "vst1.16 {d23}, [r5, :64], r2"
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_8_neon_10, export=1 @ 10-bit instantiation of band_8_16
++ band_8_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_32_neon_10(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++
++.macro band_c_32_16 bit_depth @ >8-bit interleaved chroma band offset body: one 128-byte row per loop pass
++ push {r4-r6, lr}
++ add r5, r0, #32 @ r5/r6 run 32 bytes ahead of r0/r1
++ add r6, r1, #32
++ sub r2, #64 @ Strides reduced by 64 because each pointer first advances by 64 (lr) within a row
++ sub r3, #64
++ vmov.i64 q14, #0 @ q14 = 0, q15 = (1 << bit_depth) - 1: clamp constants handed to the macro
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_c @ Returns tables in d0-d3 / d4-d7, ip = height - 1, r4 = r1 + r3
++ mov lr, #64 @ lr is free after the call; used as the in-row post-increment
++ vpush {q4-q7} @ d8-d15 are callee-saved and are used as pel registers below
++
++1: vld2.16 { q4, q5 }, [r1, :128], lr @ Row bytes 0-31, de-interleaved into q4 / q5
++ subs ip, #1
++ vld2.16 { q6, q7 }, [r6, :128], lr @ Row bytes 32-63
++ vld2.16 { q8, q9 }, [r1, :128], r3 @ Row bytes 64-95, then step to the next row
++ vld2.16 {q10, q11}, [r6, :128], r3 @ Row bytes 96-127, then step to the next row
++
++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++ "pld [r4]", \
++ "it ne; addne r4, r3"
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
++
++ vst2.16 { q4, q5 }, [r0, :128], lr
++ vst2.16 { q6, q7 }, [r5, :128], lr
++ vst2.16 { q8, q9 }, [r0, :128], r2
++ vst2.16 {q10, q11}, [r5, :128], r2
++
++ bpl 1b
++
++ vpop {q4-q7}
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_32_neon_10, export=1 @ 10-bit instantiation of band_c_32_16
++ band_c_32_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_16_neon_10(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++
++.macro band_c_16_16 bit_depth @ >8-bit interleaved chroma band offset body: one 64-byte row per loop pass
++ push {r4-r6, lr}
++ add r5, r0, #32 @ r5 = second 32-byte half of the dst row
++ add r6, r1, #32 @ r6 = second 32-byte half of the src row
++ vmov.i64 q14, #0 @ q14 = 0, q15 = (1 << bit_depth) - 1: clamp constants handed to the macro
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_c @ Returns tables in d0-d3 / d4-d7, ip = height - 1
++
++1: vld2.16 { q8, q9 }, [r1, :128], r3 @ Row bytes 0-31, de-interleaved into q8 / q9
++ subs ip, #1
++ vld2.16 {q10, q11}, [r6, :128], r3 @ Row bytes 32-63
++
++ @ Only q8-q11 carry pels here; q4-q7 (callee-saved d8-d15) are never loaded, stored or saved, so they must not be processed
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
++
++ vst2.16 { q8, q9 }, [r0, :128], r2 @ Re-interleave on store
++ vst2.16 {q10, q11}, [r5, :128], r2
++
++ bpl 1b
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_16_neon_10, export=1 @ 10-bit instantiation of band_c_16_16
++ band_c_16_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_8_neon_10(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++
++.macro band_c_8_16 bit_depth @ >8-bit interleaved chroma band offset body for width 8, with a separate path for width < 8
++ ldr ip, [sp, #16] @ width
++ push {r4-r6, lr}
++ vmov.i64 q14, #0 @ q14 = 0, q15 = (1 << bit_depth) - 1: clamp constants handed to the macro
++ cmp ip, #8 @ Flags are consumed by the blt below; band_load_c sets no flags
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_c @ Returns tables in d0-d3 / d4-d7, ip = height - 1
++ blt 4f @ width < 8: 16 bytes per row, two rows per pass
++
++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++ "vld2.16 {q8,q9}, [r1, :128], r3", \
++ "subs ip, #1", \
++ "", \
++ "", \
++ "", \
++ "vst2.16 {q10,q11}, [r0, :128], r2"
++ pop {r4-r6, pc} @ NOTE(review): the row loop is presumably inside sao_band_32b_16 - confirm
++4:
++ add r5, r0, r2 @ r5 = dst + stride (odd rows)
++ add r6, r1, r3 @ r6 = src + stride (odd rows)
++ lsl r2, #1
++ lsl r3, #1
++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++ "vld2.16 {d16,d18}, [r1, :128], r3", \
++ "subs ip, #2", \
++ "vld2.16 {d17,d19}, [r6, :128], r3", \
++ "", \
++ "", \
++ "vst2.16 {d20,d22}, [r0, :128], r2", \
++ "vst2.16 {d21,d23}, [r5, :128], r2"
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_8_neon_10, export=1 @ 10-bit instantiation of band_c_8_16
++ band_c_8_16 10
++endfunc
++
++
++@ =============================================================================
++@ SAO EDGE
++
++@ r0 destination address
++@ r2 stride to post-increment r0 with
++@ [r5] translate values
++@
++@ a <- c <- b
++@ a in q0 - q3
++@ c in q4 - q7
++@ b in q8 - q11
++@
++@ q12-15 used as temp
++@
++@ Can be used for both Y & C as we unzip/zip the deltas and
++@ transform "u/v" separately via d26/d27. For Y d26=d27
++
++function edge_64b_body_8 @ 8-bit SAO edge core for 64 bytes: in a=q0-q3, c=q4-q7, b=q8-q11, tables at [r5]; out q0-q3
++
++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0
++ vcgt.u8 q13, q5, q1
++ vcgt.u8 q14, q6, q2
++ vcgt.u8 q15, q7, q3
++
++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0
++ vcgt.u8 q1, q5
++ vcgt.u8 q2, q6
++ vcgt.u8 q3, q7
++
++ vsub.s8 q0, q12 @ a = sign(c-a)
++ vsub.s8 q1, q13
++ vsub.s8 q2, q14
++ vsub.s8 q3, q15
++
++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0
++ vcgt.u8 q13, q5, q9
++ vcgt.u8 q14, q6, q10
++ vcgt.u8 q15, q7, q11
++
++ vsub.s8 q0, q12 @ Subtracting the -1 mask adds 1 where c > b
++ vsub.s8 q1, q13
++ vsub.s8 q2, q14
++ vsub.s8 q3, q15
++
++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0
++ vcgt.u8 q13, q9, q5
++ vcgt.u8 q14, q10, q6
++ vcgt.u8 q15, q11, q7
++
++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b)
++ vadd.s8 q1, q13
++ vmov.u8 q12, #2 @ q12 is free again: reuse it for the +2 index bias
++ vadd.s8 q2, q14
++ vadd.s8 q3, q15
++
++ vadd.s8 q0, q12
++ vadd.s8 q1, q12
++
++ vld1.8 {d26, d27}, [r5] @ Two 8-byte translate tables: d26 for even bytes, d27 for odd bytes
++
++ vadd.s8 q2, q12
++ vuzp.8 q0, q1 @ Even bytes -> q0, odd bytes -> q1
++ vmov.u8 q15, #128
++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b)
++
++ vtbl.8 d0, {d26}, d0
++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add
++
++ vtbl.8 d1, {d26}, d1
++ vadd.s8 q14, q5, q15
++
++ vtbl.8 d2, {d27}, d2
++ vuzp.8 q2, q3
++
++ vtbl.8 d3, {d27}, d3
++
++ vtbl.8 d4, {d26}, d4
++ vzip.8 q0, q1 @ Restore the original byte order of the offsets
++
++ vtbl.8 d5, {d26}, d5
++ vqadd.s8 q0, q12 @ Saturating add of offset to the biased pel
++ vqadd.s8 q1, q14
++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add
++
++ vtbl.8 d6, {d27}, d6
++ vtbl.8 d7, {d27}, d7
++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add
++ vzip.8 q2, q3
++
++ vsub.s8 q0, q15 @ Undo the 128 bias
++ vqadd.s8 q2, q12
++ vqadd.s8 q3, q14
++ vsub.s8 q1, q15
++ vsub.s8 q2, q15
++ vsub.s8 q3, q15
++
++ bx lr
++endfunc
++
++@ r0 destination address
++@ r2 stride to post-increment r0 with
++@ r4 upper clip value
++@ [r5] translate values
++@
++@ a <- c <- b
++@ a in q0 - q3
++@ c in q4 - q7
++@ b in q8 - q11
++@
++@ q12-15 used as temp
++@
++@ Can be used for both Y & C as we unzip/zip the deltas and
++@ transform "u/v" separately via d26/d27. For Y d26=d27
++
++function edge_64b_body_16 @ >8-bit SAO edge core for 32 pels: in a=q0-q3, c=q4-q7, b=q8-q11, r4 = max, tables at [r5]; out q0-q3
++
++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0
++ vcgt.u16 q13, q5, q1
++ vcgt.u16 q14, q6, q2
++ vcgt.u16 q15, q7, q3
++
++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0
++ vcgt.u16 q1, q1, q5
++ vcgt.u16 q2, q2, q6
++ vcgt.u16 q3, q3, q7
++
++ vsub.s16 q0, q0, q12 // a = sign(c-a)
++ vsub.s16 q1, q1, q13
++ vsub.s16 q2, q2, q14
++ vsub.s16 q3, q3, q15
++
++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0
++ vcgt.u16 q13, q5, q9
++ vcgt.u16 q14, q6, q10
++ vcgt.u16 q15, q7, q11
++
++ vsub.s16 q0, q0, q12 @ Subtracting the -1 mask adds 1 where c > b
++ vsub.s16 q1, q1, q13
++ vsub.s16 q2, q2, q14
++ vsub.s16 q3, q3, q15
++
++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0
++ vcgt.u16 q13, q9, q5
++ vcgt.u16 q14, q10, q6
++ vcgt.u16 q15, q11, q7
++
++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b)
++ vadd.s16 q1, q1, q13
++ vadd.s16 q2, q2, q14
++ vadd.s16 q3, q3, q15
++
++ vmov.u8 q12, #2 @ +2 index bias, applied after narrowing to bytes
++
++ vmovn.s16 d0, q0 @ Narrow the 32 sign sums to bytes in d0-d3
++ vmovn.s16 d1, q1
++ vmovn.s16 d2, q2
++ vmovn.s16 d3, q3
++
++ vldr d26, [r5] @ Translate table for even pels
++
++ vuzp.8 q0, q1 @ Even pels -> q0, odd pels -> q1
++
++ vldr d27, [r5, #8] @ Translate table for odd pels
++
++ vadd.s8 q0, q0, q12
++ vadd.s8 q1, q1, q12
++
++ vmov.i64 q12, #0 @ q12 = lower clip value
++
++ vtbl.8 d0, {d26}, d0
++ vtbl.8 d1, {d26}, d1
++ vtbl.8 d2, {d27}, d2
++ vtbl.8 d3, {d27}, d3
++
++ vdup.i16 q13, r4 @ q13 = upper clip value; the tables in d26/d27 are no longer needed
++
++ vzip.8 q0, q1 @ Restore the original pel order of the offsets
++
++ @ Avoid overwrite whilst widening
++ vaddw.s8 q2, q6, d2
++ vaddw.s8 q3, q7, d3
++ vaddw.s8 q1, q5, d1
++ vaddw.s8 q0, q4, d0
++
++ @ now clip
++ clip16_4 q2, q3, q1, q0, q12, q13
++
++ bx lr
++endfunc
++
++
++@ a <- c <- b
++@ a in q0
++@ c in q1
++@ b in q2
++@ Temp q3, q9, q10
++@
++@ d16, d17 (q8) xlat U, V
++@ q14.u8 #2
++@ q15.u8 #128
++
++function edge_16b_body_8 @ 8-bit SAO edge core for 16 bytes: in a=q0, c=q1, b=q2; out q0
++ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0
++ vadd.u8 q9, q14, q9 @ q9 accumulates 2 + sign(c-a) + sign(c-b)
++ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0
++ vsub.u8 q9, q9, q0
++ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0
++ vadd.u8 q9, q9, q0
++ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0
++ vsub.u8 q0, q9, q0 @ q0 = table index 0..4
++
++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add
++
++ vuzp.8 d0, d1 @ Even bytes -> d0, odd bytes -> d1
++
++ vtbl.8 d0, {d16}, d0
++ vtbl.8 d1, {d17}, d1
++
++ vzip.8 d0, d1 @ Restore the original byte order of the offsets
++ vqadd.s8 q0, q3
++ vsub.s8 q0, q15 @ Undo the 128 bias
++
++ bx lr
++endfunc
++
++@ a <- c <- b
++@ a in q0
++@ c in q1
++@ b in q2
++@ Temp q9
++@
++@ q12, #0
++@ d16, d17 xlat U, V
++@ q14.u16 #2
++@ q15.u16 max
++function edge_16b_body_16 @ >8-bit SAO edge core for 8 pels: in a=q0, c=q1, b=q2; out q0 clipped to [q12, q15]
++ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0
++ vadd.u16 q9, q14, q9 @ q9 accumulates 2 + sign(c-a) + sign(c-b)
++ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0
++ vsub.u16 q9, q9, q0
++ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0
++ vadd.u16 q9, q9, q0
++ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0
++ vsub.u16 q0, q9, q0 @ q0 = table index 0..4 in each 16-bit lane
++
++ vmovn.s16 d0, q0 @ Narrow the 8 indices to bytes
++ @ d1 will have random contents that we transform but
++ @ that doesn't matter as we then discard them
++ vuzp.8 d0, d1
++
++ vtbl.8 d0, {d16}, d0
++ vtbl.8 d1, {d17}, d1
++
++ vzip.8 d0, d1
++
++ vaddw.s8 q0, q1, d0 @ Widen the signed byte offsets and add them to c
++
++ @ now clip
++ vmax.s16 q0, q12
++ vmin.s16 q0, q15
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_sao_edge_[c_]xx_neon(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only
++@ int eo, [sp, #sp_base + 0]
++@ int width, [sp, #sp_base + 4]
++@ int height) [sp, #sp_base + 8]
++
++@ Jumps via jump_tab with
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ EDGE_SRC_STRIDE [r3]
++@ (1 << \bit_depth) - 1 [r4]
++@ * xlat_table [r5] // setup_64b only
++@ int height [r12]
++@
++@ 0 [q12] // > 8 bit
++@ 2 [q14]
++@ 128 [q15] // = 8 bit
++@ r4 [q15] // > 8 bit
++
++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0 @ Common entry code: builds tables and constants, dispatches on eo via jump_tab, then returns to the caller
++
++@ Build translate registers
++@ As translate values can only be 0-4 we don't care about junk in the rest
++@ of the register
++.if \is_chroma
++ ldr ip, [sp, #0] @ ip = _sao_offset_val_v, read before the push moves sp
++ push {r4-r6, lr} @ 16 bytes
++ vld1.8 {d16[2]}, [r3] @ Low byte of offset_val[0] goes to table slot 2
++ add r3, r3, #2
++ vld1.8 {d17[2]}, [ip]
++ add ip, ip, #2
++ vld1.8 {d16[0]}, [r3] @ offset_val[1] -> slot 0
++ add r3, r3, #2
++ vld1.8 {d17[0]}, [ip]
++ add ip, ip, #2
++ vld1.8 {d16[1]}, [r3] @ offset_val[2] -> slot 1
++ add r3, r3, #2
++ vld1.8 {d17[1]}, [ip]
++ add ip, ip, #2
++ vld1.8 {d16[3]}, [r3] @ offset_val[3] -> slot 3
++ add r3, r3, #2
++ vld1.8 {d17[3]}, [ip]
++ add ip, ip, #2
++ vld1.8 {d16[4]}, [r3] @ offset_val[4] -> slot 4
++ vld1.8 {d17[4]}, [ip]
++ movw r3, EDGE_SRC_STRIDE @ r3 is reused as the source stride from here on
++.set sp_base, 20
++.else
++ add ip, r3, #4 @ ip -> offset_val[2]
++ vld1.8 {d16[1]}, [r3] @ offset_val[0]; ends up in slot 2 after the vzip below
++ add r3, r3, #2
++ vld1.8 {d17[0]}, [ip] @ offset_val[2]; ends up in slot 1
++ add ip, ip, #2
++ vld1.8 {d16[0]}, [r3] @ offset_val[1]; ends up in slot 0
++ add r3, r3, #6
++ vld1.8 {d17[1]}, [ip] @ offset_val[3]; ends up in slot 3
++ vld1.8 {d16[2]}, [r3] @ offset_val[4]; ends up in slot 4
++ movw r3, EDGE_SRC_STRIDE @ r3 is reused as the source stride from here on
++ push {r4-r6, lr} @ 16 bytes
++ vzip.8 d16, d17 @ Interleave into the final slot order
++ vmov d17, d16 @ Luma: both tables are identical
++.set sp_base, 16
++.endif
++
++@ If setup_64b we need the xlat table on the stack
++.if \setup_64b
++ sub r5, sp, #16 @ Address where q8 will land when q4-q8 are pushed below
++.endif
++
++@ Get jump address
++@ We have a special case for width 4 as the calling code doesn't detect it
++@ If we may have w4 then we add a 2nd jump table after the 1st
++.if \check_w4
++ ldr r12, [sp, #sp_base + 4] @ width
++ adr r6, \jump_tab
++ ldr lr, [sp, #sp_base + 0] @ e0
++ cmp r12, #8
++ it lt
++ addlt r6, #16 @ Skip the four 4-byte entries of the first table
++.else
++ ldr lr, [sp, #sp_base + 0] @ e0
++ adr r6, \jump_tab
++.endif
++
++ ldr r12, [sp, #sp_base + 8] @ height
++
++.if \bit_depth > 8
++ movw r4, (1 << \bit_depth) - 1
++.endif
++.if \setup_16b
++.if \bit_depth > 8
++ vmov.i64 q12, #0
++ vdup.16 q15, r4
++ vmov.u16 q14, #2
++.else
++ vmov.u8 q15, #128
++ vmov.u8 q14, #2
++.endif
++.endif
++
++@ If setup_64b we need q4-q7 saved.
++.if \setup_64b
++ vpush {q4-q8} @ 80 bytes, q8 pushed first
++.set sp_base, sp_base + 80
++.endif
++
++ ldr r6, [r6, lr, lsl #2] @ r6 = jump table entry selected by eo
++
++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
++.if \do2
++ push {r0, r1, r6, r12} @ Keep dst, src, entry and height for the second pass
++.if jent_pic
++ bl 98f
++.else
++ blx r6
++.endif
++ pop {r0, r1, r6, r12}
++
++ add r0, #64 @ Second pass starts 64 bytes further along each row
++ add r1, #64
++.endif
++
++.if jent_pic
++ bl 98f
++.else
++ blx r6
++.endif
++
++@ Tidy up & return
++.if \setup_64b
++ vpop {q4-q8} @ spurious but harmless load of q8
++.endif
++ pop {r4-r6, pc}
++
++.if jent_pic && !\xjump
++@ Magic label - used as 98b in jent macro
++98:
++ add pc, r6 @ With jent_pic the table entry in r6 is added to pc rather than called directly
++.endif
++.endm
++
++
++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab @ edge_xxb_init with the register constants for the 16-byte bodies (setup_16b=1)
++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1
++.endm
++
++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0 @ edge_xxb_init with q4-q7 saved and the table on the stack (setup_64b=1)
++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump
++.endm
++
++
++.macro edge_64b_e0, body_fn, pb @ Horizontal (left/right neighbour) edge loop, 64 bytes per row; pb = neighbour distance in bytes
++ sub r1, #8 @ Start 8 bytes early so the left neighbours are part of the load
++ mov r6, lr @ lr is clobbered by the bl below; return through r6
++1: vldm r1, {d7-d16} @ 80 bytes: 8 before the row (d7), the row (q4-q7), 8 after (d16)
++ // load a
++ vext.8 q0, q3, q4, #(16 - \pb)
++ add r1, r3
++ vext.8 q1, q4, q5, #(16 - \pb)
++ subs r12, #1 @ Row counter; flags must survive until the bgt below
++ vext.8 q2, q5, q6, #(16 - \pb)
++ vext.8 q3, q6, q7, #(16 - \pb)
++ pld [r1]
++ // load b
++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite
++ pld [r1, #64]
++ vext.8 q8, q4, q5, #\pb
++ vext.8 q9, q5, q6, #\pb
++ vext.8 q10, q6, q7, #\pb
++ bl \body_fn
++ vstm r0, {q0-q3}
++ add r0, r0, r2
++ bgt 1b
++ bx r6
++.endm
++
++.macro edge_32bx2_e0, body_fn, pb @ Horizontal edge loop, two 32-byte rows per pass; pb = neighbour distance in bytes
++ add r6, r1, r3 @ r6 = second source row
++ push {r7,lr}
++ sub r1, #8 @ First-row pointer starts 8 bytes early for the left neighbours
++ add r7, r0, r2 @ r7 = second destination row
++ lsl r2, #1 @ Destination pointers step two rows at a time
++1: vldmia r1, {d7-d12} @ Row 0: 8 bytes before (d7), 32 bytes of c (q4-q5), 8 after (d12)
++ // load a
++ vext.8 q0, q3, q4, #16 - \pb
++ add r1, r1, r3, lsl #1
++ vext.8 q1, q4, q5, #16 - \pb
++ subs r12, #2 @ Two rows consumed; flags must survive until the bgt below
++ // load b
++ vext.8 q8, q4, q5, #\pb
++ vext.8 q9, q5, q6, #\pb
++ vldr d25, [r6, #-8] @ Row 1: 8 bytes before the row
++ vldmia r6, {d12-d15} @ Row 1: 32 bytes of c into q6-q7
++ vldr d26, [r6, #32] @ Row 1: 8 bytes after the row
++ // load a
++ vext.8 q2, q12, q6, #16 - \pb
++ add r6, r6, r3, lsl #1
++ vext.8 q3, q6, q7, #16 - \pb
++ // load b
++ vext.8 q10, q6, q7, #\pb
++ vext.8 q11, q7, q13, #\pb
++ bl \body_fn
++ vst1.8 {q0-q1}, [r0, :256], r2
++ vst1.8 {q2-q3}, [r7, :256], r2
++ bgt 1b
++ pop {r7,pc}
++.endm
++
++.macro edge_16b_e0, body_fn, pb @ Horizontal edge loop, one 16-byte row per pass; pb = neighbour distance in bytes
++ sub r1, #8 @ Start 8 bytes early so the left neighbours are part of the load
++ mov r6, lr @ lr is clobbered by the bl below; return through r6
++1: vldmia r1, {d1-d4} @ 8 bytes before (d1), the row c (q1), 8 bytes after (d4)
++ add r1, r3
++ subs r12, #1 @ Row counter; flags must survive until the bgt below
++ vext.8 q0, q0, q1, #16 - \pb @ a = row shifted by -pb
++ vext.8 q2, q1, q2, #\pb @ b = row shifted by +pb
++
++ bl \body_fn
++ vst1.8 {q0}, [r0, :128], r2
++ bgt 1b
++ bx r6
++.endm
++
++.macro edge_8bx2_e0, body_fn, pb @ Horizontal edge loop, two 8-byte rows per pass; pb = neighbour distance in bytes
++ add r6, r1, r3 @ r6 = second source row
++ push {r7,lr}
++ sub r1, #8 @ First-row pointer starts 8 bytes early for the left neighbours
++ add r7, r0, r2 @ r7 = second destination row
++ lsl r2, #1 @ Destination pointers step two rows at a time
++1: vldmia r1, {d1-d2} @ Row 0: 8 bytes before (d1) and c (d2)
++ vldmia r6, {d3-d4} @ Row 1: c (d3) and 8 bytes after (d4)
++ vldr d6, [r1, #16] @ Row 0: 8 bytes after the row
++ subs r12, #2 @ Two rows consumed; flags must survive until the bgt below
++ vldr d7, [r6, #-8] @ Row 1: 8 bytes before the row
++ add r1, r1, r3, lsl #1
++ vext.8 d0, d1, d2, #8 - \pb @ a for row 0
++ add r6, r6, r3, lsl #1
++ vext.8 d5, d3, d4, #\pb @ b for row 1, taken before d4 is overwritten
++ vext.8 d4, d2, d6, #\pb @ b for row 0
++ vext.8 d1, d7, d3, #8 - \pb @ a for row 1
++
++ bl \body_fn
++ vst1.8 {d0}, [r0, :64], r2
++ vst1.8 {d1}, [r7, :64], r2
++ bgt 1b
++ pop {r7,pc}
++.endm
++
++.macro edge_4bx4_e0, body_fn, pb @ Horizontal edge loop, four 4-byte rows per pass; pb = neighbour distance in bytes
++ add r6, r1, r3 @ r6 walks odd source rows, r1 even ones
++ push {r7,lr}
++ add r7, r0, r2 @ r7 walks odd destination rows, r0 even ones
++ lsl r2, #1
++
++ tst r1, #4 @ Choose the loop matching the 8-byte alignment of the source
++ bne 2f
++1: // r1 (and assumed r6) are 64-bit aligned
++ vldr d2, [r1]
++ vldr d0, [r1, #-8]
++ add r1, r1, r3, lsl #1
++ vldr d20, [r6]
++ subs r12, #4 @ Four rows consumed; flags must survive until the bgt below
++ vldr d18, [r6, #-8]
++ add r6, r6, r3, lsl #1
++ vldr d3, [r1]
++ vshr.u64 d4, d2, #\pb * 8 @ b = row shifted down by pb bytes
++ vldr d1, [r1, #-8]
++ add r1, r1, r3, lsl #1
++ vldr d21, [r6]
++ vext.8 d0, d0, d2, #8 - \pb @ a = last pb bytes of the previous 8 followed by the row
++ vldr d19, [r6,#-8]
++ add r6, r6, r3, lsl #1
++ vshr.u64 d22, d20, #\pb * 8
++ vext.8 d18, d18, d20, #8 - \pb
++ vshr.u64 d5, d3, #\pb * 8
++ vext.8 d1, d1, d3, #8 - \pb
++ vshr.u64 d23, d21, #\pb * 8
++ vext.8 d19, d19, d21, #8 - \pb
++ vsli.64 q1, q10, #32 @ Pack: even row in the low 32 bits, odd row in the high 32 bits
++ vsli.64 q2, q11, #32
++ vsli.64 q0, q9, #32
++
++ bl \body_fn
++ vst1.32 {d0[0]}, [r0, :32], r2
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vst1.32 {d1[0]}, [r0, :32], r2
++ vst1.32 {d1[1]}, [r7, :32], r2
++ bgt 1b
++ pop {r7,pc}
++
++2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned
++ vldr d20, [r1, #-4] @ Row pels sit in the high 32 bits of this aligned load
++ vldr d22, [r1, #4]
++ add r1, r1, r3, lsl #1
++ vldr d2, [r6, #-4]
++ subs r12, #4 @ Four rows consumed; flags must survive until the bgt below
++ vldr d4, [r6, #4]
++ add r6, r6, r3, lsl #1
++ vldr d21, [r1, #-4]
++ vshl.i64 d18, d20, #\pb * 8 @ a ends up in the high 32 bits
++ vldr d23, [r1, #4]
++ add r1, r1, r3, lsl #1
++ vldr d3, [r6, #-4]
++ vext.8 d22, d20, d22, #\pb @ b ends up in the high 32 bits
++ vldr d5, [r6, #4]
++ add r6, r6, r3, lsl #1
++ vshl.i64 d0, d2, #\pb * 8
++ vext.8 d4, d2, d4, #\pb
++ vshl.i64 d19, d21, #\pb * 8
++ vext.8 d23, d21, d23, #\pb
++ vshl.i64 d1, d3, #\pb * 8
++ vext.8 d5, d3, d5, #\pb
++ vsri.64 q1, q10, #32 @ Pack: even row into the low 32 bits, odd row stays in the high 32 bits
++ vsri.64 q0, q9, #32
++ vsri.64 q2, q11, #32
++
++ bl \body_fn
++ vst1.32 {d0[0]}, [r0, :32], r2
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vst1.32 {d1[0]}, [r0, :32], r2
++ vst1.32 {d1[1]}, [r7, :32], r2
++ bgt 2b
++ pop {r7,pc}
++.endm
++
++
++.macro edge_64b_e1, body_fn @ Vertical (above/below neighbour) edge loop, one 64-byte row per pass
++ sub r1, r3 @ Start one row above the first row to process
++ push {lr}
++ add r6, r1, #32 @ r6 = second 32-byte half of each row
++ // load a
++ vld1.8 {q0-q1}, [r1, :256], r3
++ vld1.8 {q2-q3}, [r6, :256], r3
++ // load c
++ vld1.8 {q4-q5}, [r1, :256], r3
++ vld1.8 {q6-q7}, [r6, :256], r3
++1: // load b
++ vld1.8 {q8-q9}, [r1, :256], r3
++ subs r12, #1 @ Row counter; flags are tested by the le conditions after the body
++ vld1.8 {q10-q11}, [r6, :256], r3
++ bl \body_fn
++ vstm r0, {q0-q3}
++ // copy c to a
++ vmov.64 q0, q4
++ pld [r1, r3]
++ vmov.64 q1, q5
++ it le
++ pople {lr} @ Last row done: recover the return address
++ vmov.64 q2, q6
++ it le
++ bxle lr
++ vmov.64 q3, q7
++ add r0, r0, r2
++ // copy b to c
++ vmov.64 q4, q8
++ vmov.64 q5, q9
++ vmov.64 q6, q10
++ vmov.64 q7, q11
++ b 1b
++.endm
++
++.macro edge_32bx2_e1, body_fn @ Vertical edge loop, two 32-byte rows per pass
++ sub r6, r1, r3 @ r6 = row above the first row
++ vld1.8 {q2-q3}, [r1, :256], r3 @ First row; it is also the a input for the second row
++ vld1.8 {q0-q1}, [r6, :256] @ Row above = a input for the first row
++ mov r6, lr @ lr is clobbered by the bl below; return through r6
++
++1: @ Given the data duplication here we could obviously do better than
++ @ using the generic body_fn but it almost certainly isn't worth it
++ vld1.8 {q8-q9}, [r1, :256], r3 @ b for the first row of the pair
++ subs r12, #2 @ Two rows consumed; flags are tested by the bxle below
++ vmov q4, q2 @ c for the first row
++ vmov q5, q3
++ vld1.8 {q10-q11}, [r1, :256], r3 @ b for the second row of the pair
++ vmov q6, q8 @ c for the second row
++ vmov q7, q9
++
++ bl \body_fn
++
++ vst1.8 {q0-q1}, [r0, :256], r2
++ // copy b to a
++ vmov q0, q8
++ vmov q1, q9
++ vst1.8 {q2-q3}, [r0, :256], r2
++ vmov q2, q10
++ it le
++ bxle r6
++ vmov q3, q11
++ b 1b
++.endm
++
++.macro edge_16b_e1, body_fn @ Vertical edge loop, one 16-byte row per pass
++ sub r6, r1, r3 @ r6 = row above the first row
++ // load c
++ vld1.8 {q1}, [r1, :128], r3
++ // load a
++ vld1.8 {q0}, [r6, :128]
++ mov r6, lr @ lr is clobbered by the bl below; return through r6
++1: // load b
++ vld1.8 {q2}, [r1, :128], r3
++ bl \body_fn
++ vst1.8 {q0}, [r0, :128], r2
++ subs r12, #1 @ Row counter, tested by the bxle below
++ // copy c to a
++ vmov.64 q0, q1
++ it le
++ bxle r6
++ // copy b to c
++ vmov.64 q1, q2
++ b 1b
++.endm
++
++.macro edge_8bx2_e1, body_fn @ Vertical edge loop, two 8-byte rows per pass
++ sub r6, r1, r3 @ r6 = row above the first row
++ lsl r3, #1 @ Source pointers step two rows at a time
++ push {r7, lr}
++ vld1.8 {d1}, [r1, :64], r3 @ First row; it is also the a input for the second row
++ vld1.8 {d0}, [r6, :64], r3 @ Row above = a input for the first row
++ add r7, r0, r2 @ r7 = second destination row
++ lsl r2, #1
++1: @ Given the data duplication here we could obviously do better than
++ @ using the generic body_fn but it almost certainly isn't worth it
++ vld1.8 {d4}, [r6, :64], r3 @ b for the first row of the pair
++ vmov d2, d1 @ c for the first row
++ vld1.8 {d5}, [r1, :64], r3 @ b for the second row of the pair
++ subs r12, #2 @ Two rows consumed; flags must survive until the bgt below
++ vmov d3, d4 @ c for the second row
++
++ bl \body_fn
++
++ vst1.8 {d0}, [r0, :64], r2
++ vst1.8 {d1}, [r7, :64], r2
++
++ // copy b to a
++ vmov q0, q2
++ bgt 1b
++ pop {r7, pc}
++.endm
++
++.macro edge_4bx4_e1, body_fn @ Vertical edge loop, four 4-byte rows per pass
++ sub r6, r1, r3 @ r6 = row above; it then walks odd rows while r1 walks even rows
++ lsl r3, #1
++ push {r7, lr}
++ vld1.32 {d0[1]}, [r1, :32], r3 @ d0 = { row -1, row 0 }
++ add r7, r0, r2 @ r7 walks odd destination rows, r0 even ones
++ vld1.32 {d0[0]}, [r6, :32], r3
++ lsl r2, #1
++ vld1.32 {d4[1]}, [r1, :32], r3 @ d4 = { row 1, row 2 }
++ vld1.32 {d4[0]}, [r6, :32], r3
++ vld1.32 {d5[1]}, [r1, :32], r3 @ d5 = { row 3, row 4 }
++ vld1.32 {d5[0]}, [r6, :32], r3
++ vmov d1, d4 @ a = q0 = rows -1..2
++ vext.32 d2, d0, d4, #1 @ c = q1 = rows 0..3
++ subs r12, #4 @ Four rows consumed; flags are tested by the ble after the body
++ vmov d22, d5 @ Keep rows 3-4 for the next pass
++ vext.32 d3, d4, d5, #1
++ b 2f
++
++1: vst1.32 {d0[0]}, [r0, :32], r2 @ Stores of the previous pass are interleaved with the setup of the next
++ vext.32 d2, d22, d4, #1
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vmov d0, d22
++ vst1.32 {d1[0]}, [r0, :32], r2
++ vext.32 d3, d4, d5, #1
++ vst1.32 {d1[1]}, [r7, :32], r2
++ vmov d1, d4
++ vmov d22, d5
++2: @ Given the data duplication here we could probably do better than
++ @ using the generic body_fn but it almost certainly isn't worth it
++ bl \body_fn
++ ble 3f
++ vld1.32 {d4[0]}, [r6, :32], r3 @ Load the next four rows into q2 (b)
++ subs r12, #4
++ vld1.32 {d4[1]}, [r1, :32], r3
++ vld1.32 {d5[0]}, [r6, :32], r3
++ vld1.32 {d5[1]}, [r1, :32], r3
++ b 1b
++
++3: vst1.32 {d0[0]}, [r0, :32], r2 @ Final pass: store the last four rows and return
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vst1.32 {d1[0]}, [r0, :32]
++ vst1.32 {d1[1]}, [r7, :32]
++ pop {r7, pc}
++.endm
++
++.macro edge_64b_e2, body_fn, pb @ Diagonal edge loop (a = row above at -pb, b = row below at +pb), one 64-byte row per pass
++ push {lr}
++ sub r6, r1, r3 @ r6 = row above
++ // load c and a
++ vld1.8 {q4-q5}, [r1, :128]
++ vldr d25, [r6, #-8] @ 8 bytes before the row above
++ vldmia r6, {d16-d23}
++ vext.8 q0, q12, q8, #16 - \pb
++ add r6, r1, #32
++ vext.8 q1, q8, q9, #16 - \pb
++ add r1, r1, r3 @ r1 = row below
++ vext.8 q2, q9, q10, #16 - \pb
++ vld1.8 {q6-q7}, [r6, :128]
++ sub r6, r1, r3 @ r6 = current row, the source of the next a
++ vext.8 q3, q10, q11, #16 - \pb
++
++1: // load b
++ vldmia r1, {d16-d24} @ 72 bytes: the row below plus 8 bytes after it
++ vext.8 q8, q8, q9, #\pb
++ pld [r1, r3]
++ vext.8 q9, q9, q10, #\pb
++ subs r12, #1 @ Row counter; flags are tested by the le conditions after the body
++ vext.8 q10, q10, q11, #\pb
++ vext.8 q11, q11, q12, #\pb
++ bl \body_fn
++ // next a is mostly available in c
++ vldr d25, [r6, #-8]
++ vstmia r0, {q0-q3}
++ vext.8 q3, q6, q7, #16 - \pb
++ it le
++ pople {lr} @ Last row done: recover the return address
++ vext.8 q2, q5, q6, #16 - \pb
++ it le
++ bxle lr
++ vext.8 q1, q4, q5, #16 - \pb
++ add r6, r6, r3
++ vext.8 q0, q12, q4, #16 - \pb
++ add r0, r0, r2
++ // next c is mostly available in b
++ vldr d8, [r1] @ The first 8 bytes cannot be rebuilt from the shifted b, so reload them
++ vext.8 d9, d16, d17, #8 - \pb
++ vext.8 q5, q8, q9, #16 - \pb
++ add r1, r1, r3
++ vext.8 q6, q9, q10, #16 - \pb
++ pld [r6, #-8]
++ vext.8 q7, q10, q11, #16 - \pb
++ b 1b
++.endm
++
++.macro edge_32bx2_e2, body_fn, pb @ Diagonal edge loop (a = row above at -pb, b = row below at +pb), two 32-byte rows per pass
++ sub r6, r1, r3 @ r6 = row above the first row
++ push {r7, lr}
++ add r7, r0, r2 @ r7 = second destination row
++ lsl r2, #1 @ Destination pointers step two rows at a time
++ // load a and first 32b of c
++ vld1.8 {q4-q5}, [r1, :256]
++ vldr d25, [r6, #-8]
++ vld1.8 {q13-q14}, [r6, :256]
++ vldr d31, [r1, #-8]
++ add r6, r6, r3, lsl #1 @ r6 = second row of the pair
++ vext.8 q0, q12, q13, #16 - \pb
++ add r1, r1, r3, lsl #1 @ r1 = row below the pair
++ vext.8 q1, q13, q14, #16 - \pb
++ vext.8 q2, q15, q4, #16 - \pb
++ vext.8 q3, q4, q5, #16 - \pb
++1:
++ // load second 32b of c and second 32b of b
++ vldmia r6, {d12-d16}
++ vldmia r1, {d20-d24}
++ // first 32b of b is mostly available in second 32b of c
++ vext.8 q9, q7, q8, #\pb
++ subs r12, #2 @ Two rows consumed; flags must survive until the ble below
++ vext.8 q8, q6, q7, #\pb
++ vext.8 q10, q10, q11, #\pb
++ vext.8 q11, q11, q12, #\pb
++
++ bl \body_fn
++
++ vst1.8 {q0-q1}, [r0, :256], r2
++ vst1.8 {q2-q3}, [r7, :256], r2
++ ble 2f
++
++ vldr d25, [r6, #-8]
++ add r6, r6, r3, lsl #1
++ vldr d8, [r1] @ The first 8 bytes cannot be rebuilt from the shifted b, so reload them
++ vext.8 d9, d20, d21, #8 - \pb
++ vldr d31, [r1, #-8]
++ add r1, r1, r3, lsl #1
++ // first 32b of a is mostly available in second 32b of c
++ vext.8 q1, q6, q7, #16 - \pb
++ vext.8 q0, q12, q6, #16 - \pb
++ // first 32b of c is mostly available in second 32b of b
++ vext.8 q5, q10, q11, #16 - \pb
++ // second 32b of a is mostly available in first 32b of c
++ vext.8 q2, q15, q4, #16 - \pb
++ vext.8 q3, q4, q5, #16 - \pb
++ b 1b
++
++2: pop {r7, pc}
++.endm
++
++.macro edge_16b_e2, body_fn, pb // e2, 16 bytes x 1 row per pass: a = previous row shifted by -pb bytes, b = next row shifted by +pb bytes
++ push {lr} // lr is overwritten by the bl below
++ sub r6, r1, r3 // r6 = row before the one at r1 (r3 is used as the src row step; set up outside this macro)
++ vld1.8 {q1}, [r1, :128], r3 // q1 = current row (c); r1 moves to the next row
++ vldr d19, [r6, #-8] // 8 bytes left of the previous row
++ vld1.8 {q10}, [r6, :128], r3 // q10 = previous row; r6 moves to the current row
++
++1: vldmia r1, {d4-d6} // next row: 16 bytes plus 8 to its right
++ vext.8 q0, q9, q10, #16 - \pb // q0 = a
++ subs r12, #1 // one row per pass; flags are tested by the ble after the call, so body_fn is relied on to keep them
++ vext.8 q2, q2, q3, #\pb // q2 = b
++ bl \body_fn
++ vst1.8 {q0}, [r0, :128], r2 // result is taken from q0
++ ble 2f // row count exhausted
++ vmov q10, q1 // old c becomes the previous row
++ vldr d2, [r1] // low half of new c from memory
++ add r1, r1, r3
++ vldr d19, [r6, #-8]
++ add r6, r6, r3
++ vext.8 d3, d4, d5, #8 - \pb // high half of new c recovered from b
++ b 1b
++
++2: pop {pc} // return to the caller
++.endm
++
++.macro edge_8bx2_e2, body_fn, pb // e2, 8 bytes x 2 rows per pass: a = previous row shifted by -pb bytes, b = next row shifted by +pb bytes
++ sub r6, r1, r3 // r6 = row before the one at r1 (r3 is used as the src row step; set up outside this macro)
++ push {r7, lr} // lr is overwritten by the bl below
++ add r7, r0, r2 // r7 = second output row (r0 = first)
++ lsl r2, #1 // output step now spans the two rows written per pass
++ vldr d18, [r6, #-8] // 8 bytes left of the previous row
++ vldr d19, [r6] // previous row
++ add r6, r6, r3, lsl #1 // r6 now at the second row of the pair
++ vldr d20, [r1, #-8] // 8 bytes left of the first row
++ vldr d2, [r1] // first row of the pair
++ add r1, r1, r3, lsl #1 // r1 now at the row after the pair
++ vldmia r6, {d3-d4} // second row of the pair plus 8 bytes to its right
++ vld1.8 {d21-d22}, [r1, :128] // row after the pair plus 8 bytes to its right
++
++1: vext.8 d0, d18, d19, #8 - \pb // a for first row
++ vext.8 d4, d3, d4, #\pb // b for first row
++ vext.8 d1, d20, d2, #8 - \pb // a for second row
++ subs r12, #2 // two rows per pass; flags are tested by the ble after the call, so body_fn is relied on to keep them
++ vext.8 d5, d21, d22, #\pb // b for second row
++
++ bl \body_fn
++
++ vst1.8 {d0}, [r0, :64], r2 // results are taken from d0 and d1
++ vst1.8 {d1}, [r7, :64], r2
++ ble 2f // row count exhausted
++
++ vldr d18, [r6, #-8]
++ add r6, r6, r3, lsl #1
++ vldr d20, [r1, #-8]
++ vmov d19, d3 // last row of this pair is the previous row for the next pair
++ vldr d2, [r1]
++ add r1, r1, r3, lsl #1
++ vldmia r6, {d3-d4}
++ vld1.8 {d21-d22}, [r1, :128]
++ b 1b
++
++2: pop {r7, pc} // restore r7 and return to the caller
++.endm
++
++.macro edge_4bx4_e2, body_fn, pb // e2, 4 bytes x 4 rows per pass; also reused for e3 with a negated pb
++ sub r6, r1, r3 // r6 = row before the one at r1 (r3 is used as the src row step; set up outside this macro)
++ push {r7-r9, lr} // r9 is not touched in this macro body
++ add r8, r1, r3 // r8 = row after the one at r1
++ sub r6, r6, #\pb // a pointer: previous row, pb bytes to the left
++ add r8, r8, #\pb // b pointer: next row, pb bytes to the right
++ add r7, r0, r2 // r7 = second output row (r0 = first)
++ lsl r2, #1 // each output pointer steps two rows per store
++
++1: vld1.32 {d0[0]}, [r6], r3 // gather four rows: q0 from r6 (a), q1 from r1 (c), q2 from r8 (b)
++ subs r12, #4 // four rows per pass; flags are tested by the bgt after the call, so body_fn is relied on to keep them
++ vld1.32 {d2[0]}, [r1], r3
++ vld1.32 {d4[0]}, [r8], r3
++ vld1.32 {d0[1]}, [r6], r3
++ vld1.32 {d2[1]}, [r1], r3
++ vld1.32 {d4[1]}, [r8], r3
++ vld1.32 {d1[0]}, [r6], r3
++ vld1.32 {d3[0]}, [r1], r3
++ vld1.32 {d5[0]}, [r8], r3
++ vld1.32 {d1[1]}, [r6], r3
++ vld1.32 {d3[1]}, [r1], r3
++ vld1.32 {d5[1]}, [r8], r3
++
++ bl \body_fn
++
++ vst1.32 {d0[0]}, [r0, :32], r2 // rows 0 and 2 go out via r0, rows 1 and 3 via r7
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vst1.32 {d1[0]}, [r0, :32], r2
++ vst1.32 {d1[1]}, [r7, :32], r2
++ bgt 1b // loop while rows remain
++
++ pop {r7-r9,pc} // restore registers and return to the caller
++.endm
++
++.macro edge_64b_e3, body_fn, pb // e3, 64 bytes x 1 row per pass: a = previous row shifted by +pb bytes, b = next row shifted by -pb bytes
++ push {lr} // lr is overwritten by the bl below
++ sub r6, r1, r3 // r6 = row before the one at r1 (r3 is used as the src row step; set up outside this macro)
++ // load c and a
++ vld1.8 {q4-q5}, [r1, :128] // c bytes 0-31
++ vldmia r6, {d16-d24} // previous row: 64 bytes plus 8 to its right
++ vext.8 q0, q8, q9, #\pb
++ add r6, r1, #32
++ vext.8 q1, q9, q10, #\pb
++ add r1, r1, r3 // r1 = next row
++ vext.8 q2, q10, q11, #\pb
++ vld1.8 {q6-q7}, [r6, :128] // c bytes 32-63
++ sub r6, r1, r3 // r6 = current row again
++ vext.8 q3, q11, q12, #\pb
++
++1: // load b
++ vldr d17, [r1, #-8] // 8 bytes left of the next row
++ vldmia r1, {d18-d25}
++ vext.8 q8, q8, q9, #16 - \pb
++ pld [r1, r3]
++ vext.8 q9, q9, q10, #16 - \pb
++ subs r12, #1 // one row per pass; flags are tested by the le-conditional return after the call, so body_fn is relied on to keep them
++ vext.8 q10, q10, q11, #16 - \pb
++ vext.8 q11, q11, q12, #16 - \pb
++ bl \body_fn
++ // next a is mostly available in c
++ vldr d24, [r6, #64] // 8 bytes right of the current row
++ vstmia r0, {q0-q3} // results are taken from q0-q3
++ vext.8 q0, q4, q5, #\pb
++ it le
++ pople {lr} // row count exhausted: recover the return address ...
++ vext.8 q1, q5, q6, #\pb
++ it le
++ bxle lr // ... and return to the caller
++ vext.8 q2, q6, q7, #\pb
++ add r6, r6, r3
++ vext.8 q3, q7, q12, #\pb
++ add r0, r0, r2
++ // next c is mostly available in b
++ vext.8 d14, d22, d23, #\pb
++ vldr d15, [r1, #56] // last 8 bytes of the new c come from memory
++ vext.8 q4, q8, q9, #\pb
++ add r1, r1, r3
++ vext.8 q5, q9, q10, #\pb
++ vext.8 q6, q10, q11, #\pb
++ b 1b
++.endm
++
++.macro edge_32bx2_e3, body_fn, pb // e3, 32 bytes x 2 rows per pass: a = previous row shifted by +pb bytes, b = next row shifted by -pb bytes
++ sub r6, r1, r3 // r6 = row before the one at r1 (r3 is used as the src row step; set up outside this macro)
++ push {r7, lr} // lr is overwritten by the bl below
++ add r7, r0, r2 // r7 = second output row (r0 = first)
++ lsl r2, #1 // output step now spans the two rows written per pass
++ // load a and first 32b of c
++ vldmia r1, {d8-d12} // 40 bytes: 32 of the row plus 8 to its right
++ vldmia r6, {d24-d28}
++ vext.8 q2, q4, q5, #\pb
++ add r6, r6, r3, lsl #1 // r6 advances two rows
++ vext.8 q3, q5, q6, #\pb
++ add r1, r1, r3, lsl #1 // r1 advances two rows
++ vext.8 q0, q12, q13, #\pb
++ vext.8 q1, q13, q14, #\pb
++1:
++ // load second 32b of c and second 32b of b
++ vldr d25, [r6, #-8] // 8 bytes left of the row: source of the bytes shifted in by vext
++ subs r12, #2 // two rows per pass; flags are tested by the ble after the call, so body_fn is relied on to keep them
++ vldmia r6, {d12-d15}
++ vldr d27, [r1, #-8]
++ vldmia r1, {d20-d23}
++ // first 32b of b is mostly available in second 32b of c
++ vext.8 q8, q12, q6, #16 - \pb
++ vext.8 q9, q6, q7, #16 - \pb
++ vext.8 q11, q10, q11, #16 - \pb
++ vext.8 q10, q13, q10, #16 - \pb
++
++ bl \body_fn
++
++ vst1.8 {q0-q1}, [r0, :256], r2 // results are taken from q0-q3
++ vst1.8 {q2-q3}, [r7, :256], r2
++ ble 2f // row count exhausted
++
++ vldr d24, [r6, #32] // 8 bytes right of the row
++ add r6, r6, r3, lsl #1
++ vldr d11, [r1, #24]
++ vext.8 d10, d22, d23, #\pb
++ vldr d30, [r1, #32]
++ add r1, r1, r3, lsl #1
++ // first 32b of a is mostly available in second 32b of c
++ vext.8 q0, q6, q7, #\pb
++ vext.8 q1, q7, q12, #\pb
++ // first 32b of c is mostly available in second 32b of b
++ vext.8 q4, q10, q11, #\pb
++ // second 32b of a is mostly available in first 32b of c
++ vext.8 q3, q5, q15, #\pb
++ vext.8 q2, q4, q5, #\pb
++ b 1b
++
++2: pop {r7, pc} // restore r7 and return to the caller
++.endm
++
++.macro edge_16b_e3, body_fn, pb // e3, 16 bytes x 1 row per pass: a = previous row shifted by +pb bytes, b = next row shifted by -pb bytes
++ push {lr} // lr is overwritten by the bl below
++ sub r6, r1, r3 // r6 = row before the one at r1 (r3 is used as the src row step; set up outside this macro)
++ vld1.8 {q1}, [r1, :128], r3 // q1 = current row (c); r1 moves to the next row
++ vldmia r6, {d18-d20} // previous row: 16 bytes plus 8 to its right
++ add r6, r6, r3 // r6 = current row
++
++1: vldr d5, [r1, #-8] // 8 bytes left of the next row
++ vld1.8 {q3}, [r1, :128] // next row
++ subs r12, #1 // one row per pass; flags are tested by the ble after the call, so body_fn is relied on to keep them
++ vext.8 q0, q9, q10, #\pb // q0 = a
++ vext.8 q2, q2, q3, #16 - \pb // q2 = b
++ bl \body_fn
++ vst1.8 {q0}, [r0, :128], r2 // result is taken from q0
++ ble 2f // row count exhausted
++ vmov q9, q1 // old c becomes the previous row
++ vldr d3, [r1, #8] // high half of new c from memory
++ add r1, r1, r3
++ vldr d20, [r6, #16] // 8 bytes right of the old c row
++ add r6, r6, r3
++ vext.8 d2, d4, d5, #\pb // low half of new c recovered from b
++ b 1b
++
++2: pop {pc} // return to the caller
++.endm
++
++.macro edge_8bx2_e3, body_fn, pb // e3, 8 bytes x 2 rows per pass: a = previous row shifted by +pb bytes, b = next row shifted by -pb bytes
++ sub r6, r1, r3 // r6 = row before the one at r1 (r3 is used as the src row step; set up outside this macro)
++ push {r7, lr} // lr is overwritten by the bl below
++ add r7, r0, r2 // r7 = second output row (r0 = first)
++ lsl r2, #1 // output step now spans the two rows written per pass
++ vld1.8 {d18-d19}, [r6] // previous row plus 8 bytes to its right
++ add r6, r6, r3, lsl #1 // r6 now at the second row of the pair
++ vldr d20, [r1, #8] // 8 bytes right of the first row
++ vldr d2, [r1] // first row of the pair
++ add r1, r1, r3, lsl #1 // r1 now at the row after the pair
++ vldr d4, [r6, #-8] // 8 bytes left of the second row
++ vldr d3, [r6] // second row of the pair
++ vldr d21, [r1, #-8] // 8 bytes left of the row after the pair
++ vldr d22, [r1] // row after the pair
++
++1: vext.8 d0, d18, d19, #\pb // a for first row
++ vext.8 d4, d4, d3, #8 - \pb // b for first row
++ vext.8 d1, d2, d20, #\pb // a for second row
++ subs r12, #2 // two rows per pass; flags are tested by the ble after the call, so body_fn is relied on to keep them
++ vext.8 d5, d21, d22, #8 - \pb // b for second row
++
++ bl \body_fn
++
++ vst1.8 {d0}, [r0, :64], r2 // results are taken from d0 and d1
++ vst1.8 {d1}, [r7, :64], r2
++ ble 2f // row count exhausted
++
++ vldr d19, [r6, #8]
++ add r6, r6, r3, lsl #1
++ vldr d20, [r1, #8]
++ vmov d18, d3 // last row of this pair is the previous row for the next pair
++ vldr d2, [r1]
++ add r1, r1, r3, lsl #1
++ vldr d4, [r6, #-8]
++ vldr d3, [r6]
++ vldr d21, [r1, #-8]
++ vldr d22, [r1]
++ b 1b
++
++2: pop {r7, pc} // restore r7 and return to the caller
++.endm
++
++.macro edge_4bx4_e3, body_fn, pb
++ @ e3 is the same as e2 but with the X offset reversed: passing -pb makes e2 read the previous row at +pb and the next row at -pb
++ edge_4bx4_e2 \body_fn, (-\pb)
++.endm
++
++@ Jump table entry - if in Thumb mode the bottom bit must be set (see the T line of the non-PIC form)
++@ ? There is probably a real asm directive to do this but I haven't found it
++.macro jent lab
++.if jent_pic
++@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is
++@ simpler and clearer in the code to stick with .word
++T .word (0 + \lab) - (4 + 98b) @ PIC form: offset of lab from local label 98 (defined outside this macro) plus 4
++A .word (0 + \lab) - (8 + 98b) @ same, with a bias of 8 instead of 4
++.else
++T .word 1 + \lab @ absolute address with bit 0 set
++A .word \lab @ plain absolute address
++.endif
++.endm
++
++.macro edge_64b_bodies, body_fn, pb // emits a 4-entry jump table followed by the four 64-byte bodies it points at
++ jent 0f // entry 0 -> e0 body
++ jent 10f // entry 1 -> e1 body
++ jent 20f // entry 2 -> e2 body
++ jent 30f // entry 3 -> e3 body
++
++0: edge_64b_e0 \body_fn, \pb
++10: edge_64b_e1 \body_fn
++20: edge_64b_e2 \body_fn, \pb
++30: edge_64b_e3 \body_fn, \pb
++.endm
++
++.macro edge_32bx2_bodies, body_fn, pb // emits a 4-entry jump table followed by the four 32-byte x 2-row bodies it points at
++ jent 0f // entry 0 -> e0 body
++ jent 10f // entry 1 -> e1 body
++ jent 20f // entry 2 -> e2 body
++ jent 30f // entry 3 -> e3 body
++
++0: edge_32bx2_e0 \body_fn, \pb
++10: edge_32bx2_e1 \body_fn
++20: edge_32bx2_e2 \body_fn, \pb
++30: edge_32bx2_e3 \body_fn, \pb
++.endm
++
++.macro edge_16b_bodies, body_fn, pb // emits a 4-entry jump table followed by the four 16-byte bodies it points at
++ jent 0f // entry 0 -> e0 body
++ jent 10f // entry 1 -> e1 body
++ jent 20f // entry 2 -> e2 body
++ jent 30f // entry 3 -> e3 body
++
++0: edge_16b_e0 \body_fn, \pb
++10: edge_16b_e1 \body_fn
++20: edge_16b_e2 \body_fn, \pb
++30: edge_16b_e3 \body_fn, \pb
++.endm
++
++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb // 8-entry jump table plus bodies: entries 0-3 are 32-byte x 2-row, entries 4-7 are 16-byte
++ jent 0f // entries 0-3: e0..e3 of the 32bx2 set
++ jent 10f
++ jent 20f
++ jent 30f
++ jent 5f // entries 4-7: e0..e3 of the 16b set
++ jent 15f
++ jent 25f
++ jent 35f
++
++0: edge_32bx2_e0 \body_fn_64b, \pb
++10: edge_32bx2_e1 \body_fn_64b
++20: edge_32bx2_e2 \body_fn_64b, \pb
++30: edge_32bx2_e3 \body_fn_64b, \pb
++5: edge_16b_e0 \body_fn_16b, \pb
++15: edge_16b_e1 \body_fn_16b
++25: edge_16b_e2 \body_fn_16b, \pb
++35: edge_16b_e3 \body_fn_16b, \pb
++.endm
++
++.macro edge_16b_8bx2_bodies, body_fn, pb // 8-entry jump table plus bodies: entries 0-3 are 16-byte, entries 4-7 are 8-byte x 2-row
++ jent 0f // entries 0-3: e0..e3 of the 16b set
++ jent 10f
++ jent 20f
++ jent 30f
++ jent 5f // entries 4-7: e0..e3 of the 8bx2 set
++ jent 15f
++ jent 25f
++ jent 35f
++
++0: edge_16b_e0 \body_fn, \pb
++10: edge_16b_e1 \body_fn
++20: edge_16b_e2 \body_fn, \pb
++30: edge_16b_e3 \body_fn, \pb
++5: edge_8bx2_e0 \body_fn, \pb
++15: edge_8bx2_e1 \body_fn
++25: edge_8bx2_e2 \body_fn, \pb
++35: edge_8bx2_e3 \body_fn, \pb
++.endm
++
++.macro edge_8bx2_4bx4_bodies, body_fn, pb // 8-entry jump table plus bodies: entries 0-3 are 8-byte x 2-row, entries 4-7 are 4-byte x 4-row
++ jent 0f // entries 0-3: e0..e3 of the 8bx2 set
++ jent 10f
++ jent 20f
++ jent 30f
++ jent 5f // entries 4-7: e0..e3 of the 4bx4 set
++ jent 15f
++ jent 25f
++ jent 35f
++
++0: edge_8bx2_e0 \body_fn, \pb
++10: edge_8bx2_e1 \body_fn
++20: edge_8bx2_e2 \body_fn, \pb
++30: edge_8bx2_e3 \body_fn, \pb
++5: edge_4bx4_e0 \body_fn, \pb
++15: edge_4bx4_e1 \body_fn
++25: edge_4bx4_e2 \body_fn, \pb
++35: edge_4bx4_e3 \body_fn, \pb
++.endm
++
++@ void ff_hevc_rpi_sao_edge_8_neon_8(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_8_neon_8, export=1
++ edge_16b_init 8, 0, 1, 99f
++99: @ jump table + bodies: entries 0-3 handle 8 bytes x 2 rows, entries 4-7 handle 4 bytes x 4 rows; horizontal neighbour offset pb = 1 byte
++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_16_neon_8(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_16_neon_8, export=1
++ edge_16b_init 8, 0, 0, 99f
++99: @ 4-entry jump table + bodies, 16 bytes per row; horizontal neighbour offset pb = 1 byte
++ edge_16b_bodies edge_16b_body_8, 1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_32_neon_8(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_32_neon_8, export=1
++ edge_64b_init 8, 0, 0, 99f
++99: @ 4-entry jump table + bodies, 32 bytes x 2 rows per pass; horizontal neighbour offset pb = 1 byte
++ edge_32bx2_bodies edge_64b_body_8, 1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_64_neon_8(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_64_neon_8, export=1
++ edge_64b_init 8, 0, 0, 99f
++99: @ 4-entry jump table + bodies, 64 bytes per row; horizontal neighbour offset pb = 1 byte
++ edge_64b_bodies edge_64b_body_8, 1
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_8_neon_8(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1
++ edge_16b_init 8, 1, 1, 99f
++99: @ jump table + bodies: entries 0-3 handle 16 bytes per row, entries 4-7 handle 8 bytes x 2 rows; horizontal neighbour offset pb = 2 bytes
++ edge_16b_8bx2_bodies edge_16b_body_8, 2
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_16_neon_8(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1
++ edge_64b_init 8, 1, 0, 99f
++99: @ 4-entry jump table + bodies, 32 bytes x 2 rows per pass; horizontal neighbour offset pb = 2 bytes
++ edge_32bx2_bodies edge_64b_body_8, 2
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_32_neon_8(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1
++ edge_64b_init 8, 1, 0, 99f
++99: @ 4-entry jump table + bodies, 64 bytes per row; horizontal neighbour offset pb = 2 bytes
++ edge_64b_bodies edge_64b_body_8, 2
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_8_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_8_neon_10, export=1
++ edge_16b_init 10, 0, 1, 99f
++99: @ jump table + bodies: entries 0-3 handle 16 bytes per row, entries 4-7 handle 8 bytes x 2 rows; horizontal neighbour offset pb = 2 bytes
++ edge_16b_8bx2_bodies edge_16b_body_16, 2
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_16_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_16_neon_10, export=1
++ edge_64b_init 10, 0, 0, 99f
++99: @ 4-entry jump table + bodies, 32 bytes x 2 rows per pass; horizontal neighbour offset pb = 2 bytes
++ edge_32bx2_bodies edge_64b_body_16, 2
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_64_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++@ We simply split the 64 case into 2 vertical stripes
++@ and call the fns for w32
++@
++@ Calling code will always have src != dst so we don't have to worry
++@ about edge effects
++
++function ff_hevc_rpi_sao_edge_64_neon_10, export=1
++ edge_64b_init 10, 0, 1, 99f, xjump=1 @ no table is emitted here: 99f resolves to the 99: label inside the next function, ff_hevc_rpi_sao_edge_32_neon_10
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_32_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_32_neon_10, export=1
++ edge_64b_init 10, 0, 0, 99f
++99: @ 4-entry jump table + bodies, 64 bytes per row; horizontal neighbour offset pb = 2 bytes. Also the 99f target of ff_hevc_rpi_sao_edge_64_neon_10
++ edge_64b_bodies edge_64b_body_16, 2
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_8_neon_10(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1
++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1
++99: @ jump table + bodies: entries 0-3 handle 32 bytes x 2 rows (64b body fn), entries 4-7 handle 16 bytes per row (16b body fn); horizontal neighbour offset pb = 4 bytes
++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_32_neon_10(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1
++ edge_64b_init 10, 1, 1, 99f, xjump=1 @ no table is emitted here: 99f resolves to the 99: label inside the next function, ff_hevc_rpi_sao_edge_c_16_neon_10
++endfunc
++
++
++@ ff_hevc_rpi_sao_edge_c_16_neon_10(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1
++ edge_64b_init 10, 1, 0, 99f
++99: @ 4-entry jump table + bodies, 64 bytes per row; horizontal neighbour offset pb = 4 bytes. Also the 99f target of ff_hevc_rpi_sao_edge_c_32_neon_10
++ edge_64b_bodies edge_64b_body_16, 4
++endfunc
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_arm.h
+@@ -0,0 +1,28 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVCPRED_ARM_H
++#define AVCODEC_ARM_HEVCPRED_ARM_H
++
++#include "libavcodec/rpi_hevcpred.h"
++
++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth);
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth);
++
++#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c
+@@ -0,0 +1,35 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/arm/cpu.h"
++
++#include "libavcodec/rpi_hevcpred.h"
++#include "rpi_hevcpred_arm.h"
++
++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) // ARM entry point: hands c to the NEON initialiser when NEON is available
++{
++ int cpu_flags = av_get_cpu_flags(); // runtime CPU feature flags
++
++ if (have_neon(cpu_flags)) // without NEON, c is left exactly as the caller passed it
++ ff_hevc_rpi_pred_init_neon(c, bit_depth);
++}
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c
+@@ -0,0 +1,210 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcpred_arm.h"
++
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32;
++
++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) // installs the NEON intra-prediction functions for bit_depth 8 or 10 into c
++{
++ switch (bit_depth)
++ {
++ case 8: // 8-bit samples
++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8;
++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; // note: intra_filter[2] is not assigned in the 8-bit case
++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8
++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; // 8-bit chroma reuses the _16 filters throughout
++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16;
++
++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; // luma tables: index 0..3 = block size 4, 8, 16, 32
++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8;
++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8;
++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8;
++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8; // chroma (_c) tables: index 0..2 = block size 4, 8, 16
++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8;
++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8;
++
++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8;
++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8;
++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8;
++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8;
++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8;
++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8;
++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8;
++
++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8;
++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8;
++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8;
++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8;
++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8;
++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8;
++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8;
++
++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8;
++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8;
++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8;
++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8;
++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8;
++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8;
++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8;
++
++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8;
++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8;
++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8;
++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8;
++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8;
++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8;
++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8;
++ break;
++ case 10: // 10-bit samples: luma filters are the _16 variants, chroma filters the _32 variants
++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16;
++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16;
++ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16;
++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32;
++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32;
++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32;
++
++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; // same table layout as the 8-bit case, _10 functions
++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10;
++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10;
++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10;
++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10;
++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10;
++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10;
++
++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10;
++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10;
++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10;
++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10;
++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10;
++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10;
++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10;
++
++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10;
++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10;
++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10;
++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10;
++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10;
++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10;
++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10;
++
++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10;
++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10;
++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10;
++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10;
++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10;
++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10;
++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10;
++
++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10;
++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10;
++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10;
++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10;
++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10;
++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10;
++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10;
++ break;
++ default: // any other bit depth: c is left unchanged
++ break;
++ }
++}
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
+@@ -0,0 +1,2984 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++/*
++ * General angular pred
++ *
++ * Horizontal (10) & Vertical (26) cases have their own file
++ * and are not dealt with properly here (luma filtering is missing)
++ *
++ * The inv_angle calculations are annoying - if it wasn't for the +128
++ * rounding step then the result would simply be the loop counter :-(
++ */
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.text
++
++@ Horizontal Patch functions
++@ These need a transpose before store so exist as smaller patches
++@ Patches can be called repeatedly without any intermediate setup
++@ to generate a horizontal block
++@
++@ It is almost certainly the case that larger patch fns can be built
++@ and they would be a little faster, but we would still need the small
++@ fns and code size (or at least instruction cache size) is an issue
++@ given how much code we already have here
++
++@ Generate 8x8 luma 8-bit patch
++@
++@ r3 Out stride
++@ r4 Angle add
++@ r7 Inv angle (_up only)
++@
++@ In/Out (updated)
++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
++@ r2 Left ptr - updated
++@ r10 Inv angle accumulator (_up only)
++@ r12 32 - angle frac (_down) or angle frac (_up)
++@ d0 Older reference samples
++@ d1=r8+r9 Newer reference samples
++@ d2 32 - angle frac
++@ d3 Angle frac
++@ q2 Partially computed next result (_up only)
++@
++@ Temps
++@ r5 Loop counter
++@ r6 (holds the initial angle frac on entry to _down)
++@ r7 (_down only)
++@ r11 (_up only)
++@ q2, q8-q11
++
++patch_h_down_8x8_8:
++ ldrd r8, r9, [r2] @ Left
++ rsb r12, r6, #32 @ r12 = 32 - r6 (r6 = initial angle frac from caller)
++ vmov d0, r8, r9 @ d0 = first 8 reference bytes
++ vdup.8 d3, r6
++ lsr r8, #8
++ vdup.8 d2, r12
++ orr r8, r8, r9, lsl #24 @ r9:r8 window moved on by 8 bits
++ ldr r9, [r2, #5]! @ upper word of the moved window; r2 += 5
++ vmov d1, r8, r9 @ d1 = reference window one byte further on than d0
++ // drop through...
++patch_h_down_8x8_8_continue:
++ mov r5, #8 @ 8 result vectors per patch
++1:
++ subs r12, r4 @ step by the angle; MI => went negative, window must move
++ vmull.u8 q2, d0, d2 @ q2 = d0 * d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3 @ q2 += d1 * d3
++ rsb r6, r12, #32
++ vext.8 q8, q8, q9, #8 @ this and the following vext/vmov move d17-d23 down to d16-d22
++ itt mi
++ lsrmi r7, r8, #8
++ vmovmi d0, r8, r9 @ old d1 window becomes new d0
++ vdup.8 d2, r12
++ vext.8 q9, q9, q10, #8
++ it mi
++ orrmi r8, r7, r9, lsl #24
++ vext.8 q10, q10, q11, #8
++ it mi
++ ldrmi r9, [r2, #1]! @ reload upper word one byte further on; r2 += 1
++ vmov d22, d23
++ vrshrn.u16 d23, q2, #5 @ newest result: rounding shift right by 5, narrowed to 8 bit
++ it mi
++ vmovmi d1, r8, r9
++ subs r5, #1
++ vdup.8 d3, r6
++ bne 1b
++ // drop through...
++store_tran_8x8_8:
++ vzip.8 d16, d17 @ vzip.8/.16/.32 sequence transposes the 8x8 bytes held in d16-d23
++ add r6, r0, r3 @ r6 = row 1
++ vzip.8 d18, d19
++ lsl r3, #1 @ r3 = 2 * stride while storing
++ vzip.8 d20, d21
++ add r5, r0, r3 @ r5 = row 2
++ vzip.8 d22, d23
++ vzip.16 q8, q9
++ vzip.16 q10, q11
++ vzip.32 q8, q10
++ vzip.32 q9, q11
++ vst1.8 {d16}, [r0]! @ row 0; r0 += 8 so it points at the next patch along
++ vst1.8 {d17}, [r6], r3 @ row 1
++ vst1.8 {d20}, [r5], r3 @ row 2
++ vst1.8 {d21}, [r6], r3 @ row 3
++ vst1.8 {d18}, [r5], r3 @ row 4
++ vst1.8 {d19}, [r6], r3 @ row 5
++ vst1.8 {d22}, [r5] @ row 6
++ asr r3, #1 @ restore r3 to the single stride
++ vst1.8 {d23}, [r6] @ row 7
++
++ bx lr
++
++patch_h_up_8x8_8:
++ ldrd r8, r9, [r2] @ first 8 reference bytes at r2
++ rsb r6, r4, #32
++ vmov d0, r8, r9
++ vdup.8 d3, r4
++ lsr r11, r8, #24
++ vdup.8 d2, r6
++ ldr r8, [r2, #-1]! @ word starting one byte earlier; r2 -= 1
++ orr r9, r11, r9, lsl #8
++ vmov d1, r8, r9 @ d1 = reference window one byte before d0
++ mov r12, r4 @ frac starts at one angle step
++ vmull.u8 q2, d0, d2 @ first result is part-computed before entering the loop
++ vmlal.u8 q2, d1, d3
++patch_h_up_8x8_8_continue:
++ mov r5, #8 @ 8 result vectors per patch
++1:
++ add r12, r4
++ mov r11, #0 @ default index 0 for the load through r1
++ cmp r12, #33 @ CS => frac has passed 32, window must move
++ it cs
++ addcs r10, r7 @ step the inv angle accumulator
++ vext.8 q8, q8, q9, #8 @ this and the following vext/vmov move d17-d23 down to d16-d22
++ itt cs
++ subcs r12, #32
++ tstcs r10, #1<<31 @ EQ => accumulator non-negative. NOTE(review): later cs/hi tests need C to stay set over this tst - confirm
++ rsb r6, r12, #32
++ it eq
++ asreq r11, r10, #8 @ index = accumulator >> 8
++ it cs
++ vmovcs d0, r8, r9 @ old d1 window becomes new d0
++ vdup.8 d2, r6
++ it cs
++ lsrcs r6, r8, #24
++ vext.8 q9, q9, q10, #8
++ itt cs
++ orrcs r9, r6, r9, lsl #8
++ ldrbcs r11, [r1, r11] @ new sample from r1 at the computed index
++ vdup.8 d3, r12
++ vext.8 q10, q10, q11, #8
++ it hi
++ ldrbhi r11, [r2, #-1]! @ HI (Z clear after the tst): take the byte before r2 instead; r2 -= 1
++ vmov d22, d23
++ vrshrn.u16 d23, q2, #5 @ newest result: rounding shift right by 5, narrowed to 8 bit
++ itt cs
++ orrcs r8, r11, r8, lsl #8 @ new sample enters the bottom byte of the window
++ vmovcs d1, r8, r9
++ vmull.u8 q2, d0, d2 @ start the next result
++ subs r5, #1
++ vmlal.u8 q2, d1, d3
++ bne 1b
++
++ b store_tran_8x8_8
++
++
++.macro ADRT reg, val
++@ Load the address val into reg. adr in T32 has enough range but not in A32, so the A line uses adrl
++A adrl \reg, \val
++T adr \reg, \val
++.endm
++
++@ ff_hevc_rpi_pred_angular_4_neon_8 - angular prediction writing 4 rows of 4 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_4_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ ldrsb r6, [r4, r12] @ r6 = angle for this mode (initial frac)
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle for this mode (per-step add)
++ bge 26f @ modes 26..34
++ cmp r12, #18
++ bge 18f @ modes 18..25
++ cmp r12, #10
++ bge 10f @ modes 10..17
++
++@ Down of Horizontal - works down left
++ ldr lr, [r2], #1 @ Left
++ rsb r12, r6, #32
++ vmov s0, lr
++ vdup.8 d3, r6
++ ldr lr, [r2], #1 @ Left, one byte further on
++ vdup.8 d2, r12
++ vmov s2, lr
++ subs r12, r4 @ MI => frac wrapped, move the window on
++ vmull.u8 q2, d0, d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3
++ rsb r6, r12, #32
++ itt mi
++ vmovmi s0, lr
++ ldrmi lr, [r2], #1
++ vdup.8 d2, r12
++ it mi
++ vmovmi s2, lr
++ vdup.8 d3, r6
++ mov r5, #2
++1:
++ vrshrn.u16 d20, q2, #5 @ result of the previous multiply pair
++ subs r12, r4
++ vmull.u8 q2, d0, d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3
++ rsb r6, r12, #32
++ vext.64 q8, q8, q9, #1 @ with the next vext: move d18-d20 down to d17-d19
++ it mi
++ vmovmi s0, lr
++ vext.64 q9, q9, q10, #1
++ it mi
++ ldrmi lr, [r2], #1
++ vdup.8 d2, r12
++ it mi
++ vmovmi s2, lr
++ subs r5, #1
++ vdup.8 d3, r6
++ bne 1b
++
++ vrshrn.u16 d20, q2, #5
++ vmull.u8 q2, d0, d2
++ add r12, r0, r3 @ r12 = row 1
++ vmlal.u8 q2, d1, d3
++ lsl r3, #1 @ r3 = 2 * stride for the stores below
++ vext.64 q8, q8, q9, #1
++ vext.64 q9, q9, q10, #1
++ vrshrn.u16 d20, q2, #5
++
++98:
++ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3 @ row 0: one byte from each result vector
++ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3 @ row 1
++ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0] @ row 2
++ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12] @ row 3
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ rsb r12, r6, #32
++ ldr lr, [r2] @ Left
++ ldrb r2, [r2, #-1] @ Top-left
++ vmov s0, lr
++ vdup.8 d2, r12
++ vdup.8 d3, r6
++ orr lr, r2, lr, lsl #8 @ window moved back one sample, top-left in the bottom byte
++ vmov s2, lr
++ sub r8, r7, #128 @ r8 = inv angle accumulator
++ mov r5, #3
++2:
++ vmull.u8 q2, d0, d2
++ subs r12, r4 @ MI => frac wrapped, fetch a sample from top via r8 >> 8
++ vmlal.u8 q2, d1, d3
++T it mi
++ addmi r12, #32
++T asr r6, r8, #8
++T it mi
++T ldrbmi r2, [r1, r6]
++A ldrbmi r2, [r1, r8, asr #8]
++ rsb r6, r12, #32
++ vdup.8 d2, r12
++ ittt mi
++ vmovmi s0, lr
++ orrmi lr, r2, lr, lsl #8
++ vmovmi s2, lr
++ vrshrn.u16 d20, q2, #5
++ vdup.8 d3, r6
++ it mi
++ addmi r8, r7 @ step the inv angle accumulator
++ subs r5, #1
++ vext.64 q8, q8, q9, #1
++ vext.64 q9, q9, q10, #1
++ bne 2b
++
++ vmull.u8 q2, d0, d2
++ add r12, r0, r3 @ r12 = row 1
++ vmlal.u8 q2, d1, d3
++ lsl r3, #1 @ r3 = 2 * stride for the shared store code
++ vrshrn.u16 d20, q2, #5
++ b 98b
++
++@ Left of vertical - works down left
++18:
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ rsb r12, r6, #32
++ ldr lr, [r1] @ Top
++ ldrb r1, [r2, #-1] @ Top-left
++ vmov s0, lr
++ vdup.8 d2, r12
++ vdup.8 d3, r6
++ orr lr, r1, lr, lsl #8 @ window moved back one sample, top-left in the bottom byte
++ vmov s2, lr
++ sub r8, r7, #128 @ r8 = inv angle accumulator
++ mov r5, #3 @ 3 rows in the loop, last row after it
++2:
++ vmull.u8 q2, d0, d2
++ subs r12, r4 @ MI => frac wrapped, fetch a sample from left via r8 >> 8
++ vmlal.u8 q2, d1, d3
++T it mi
++ addmi r12, #32
++T asr r6, r8, #8
++T it mi
++T ldrbmi r1, [r2, r6]
++A ldrbmi r1, [r2, r8, asr #8]
++ rsb r6, r12, #32
++ vdup.8 d2, r12
++ ittt mi
++ vmovmi s0, lr
++ orrmi lr, r1, lr, lsl #8
++ vmovmi s2, lr
++ vrshrn.u16 d4, q2, #5
++ vdup.8 d3, r6
++ it mi
++ addmi r8, r7 @ step the inv angle accumulator
++ subs r5, #1
++ vst1.32 {d4[0]}, [r0], r3 @ store one 4-byte row
++ bne 2b
++
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d4, q2, #5
++ vst1.32 {d4[0]}, [r0] @ last row
++
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ ldr lr, [r1], #1 @ Top
++ rsb r12, r6, #32
++ vmov s0, lr
++ vdup.8 d3, r6
++ ldr lr, [r1], #1 @ Top, one byte further on
++ vdup.8 d2, r12
++ vmov s2, lr
++ subs r12, r4 @ MI => frac wrapped, move the window on
++ vmull.u8 q2, d0, d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3
++ rsb r6, r12, #32
++ itt mi
++ vmovmi s0, lr
++ ldrmi lr, [r1], #1
++ vdup.8 d2, r12
++ it mi
++ vmovmi s2, lr
++ vdup.8 d3, r6
++ mov r5, #2
++1:
++ vrshrn.u16 d6, q2, #5 @ result of the previous multiply pair
++ subs r12, r4
++ vmull.u8 q2, d0, d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3
++ rsb r6, r12, #32
++ vst1.32 {d6[0]}, [r0], r3 @ store one 4-byte row
++ itt mi
++ vmovmi s0, lr
++ ldrmi lr, [r1], #1
++ vdup.8 d2, r12
++ it mi
++ vmovmi s2, lr
++ subs r5, #1
++ vdup.8 d3, r6
++ bne 1b
++
++ vrshrn.u16 d6, q2, #5
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vst1.32 {d6[0]}, [r0], r3
++ vrshrn.u16 d6, q2, #5
++ vst1.32 {d6[0]}, [r0] @ last row
++
++ pop {r4-r8, pc}
++
++endfunc
++
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_8 - angular prediction writing 8 rows of 8 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_8_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ ldrsb r6, [r4, r12] @ r6 = angle for this mode (initial frac)
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle for this mode (per-step add)
++ bge 26f @ modes 26..34
++ cmp r12, #18
++ bge 18f @ modes 18..25
++ cmp r12, #10
++ bge 10f @ modes 10..17
++
++@ Down of Horizontal - works down left
++ bl patch_h_down_8x8_8 @ a single 8x8 patch is the whole block
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ mov r10, #-128 @ initial inv angle accumulator
++ bl patch_h_up_8x8_8
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ ldrb lr, [r2, #-1] @ Top-left
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ vmov d0, r8, r9
++ lsl r9, r9, #8
++ vdup.8 d2, r12
++ orr r9, r9, r8, lsr #24
++ orr r8, lr, r8, lsl #8 @ r9:r8 = window moved back one sample, top-left in the bottom byte
++ vmov d1, r8, r9
++ sub r1, r7, #128 @ r1 reused as the inv angle accumulator
++ mov r5, #7 @ 7 rows in the loop, last row after it
++1:
++ vdup.8 d3, r6
++ vmull.u8 q2, d0, d2
++ subs r12, r12, r4 @ MI => frac wrapped, window must move
++ vmlal.u8 q2, d1, d3
++ ittt mi
++ addmi lr, r2, r1, asr #8 @ address in left of the next sample
++ addmi r12, r12, #32
++ vmovmi d0, r8, r9 @ old d1 window becomes new d0
++ rsb r6, r12, #32
++ itt mi
++ lslmi r9, r9, #8
++ ldrbmi lr, [lr]
++ vdup.8 d2, r12
++ vrshrn.u16 d4, q2, #5
++ itttt mi
++ orrmi r9, r9, r8, lsr #24
++ orrmi r8, lr, r8, lsl #8 @ new sample enters the bottom byte
++ vmovmi d1, r8, r9
++ addmi r1, r1, r7 @ step the inv angle accumulator
++ subs r5, r5, #1
++ vst1.8 {d4}, [r0], r3 @ store one 8-byte row
++ bne 1b
++
++ vdup.8 d3, r6
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d4, q2, #5
++ vst1.8 {d4}, [r0] @ last row
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ vmov d0, r8, r9
++ vdup.8 d3, r6
++ mov r5, #7 @ 7 rows in the loop, last row after it
++ lsr r8, #8
++ vdup.8 d2, r12
++ orr r8, r8, r9, lsl #24 @ r9:r8 window moved on by 8 bits
++ ldr r9, [r1, #5]! @ upper word of the moved window; r1 += 5
++ vmov d1, r8, r9
++1:
++ vmull.u8 q2, d0, d2
++ subs r12, r4 @ MI => frac wrapped, window must move
++ vmlal.u8 q2, d1, d3
++ it mi
++ addmi r12, #32
++ rsb r6, r12, #32
++ itt mi
++ vmovmi d0, r8, r9 @ old d1 window becomes new d0
++ lsrmi r8, #8
++ vdup.8 d2, r12
++ itt mi
++ orrmi r8, r8, r9, lsl #24
++ ldrmi r9, [r1, #1]! @ reload upper word one byte further on; r1 += 1
++ vrshrn.u16 d6, q2, #5
++ it mi
++ vmovmi d1, r8, r9
++ vdup.8 d3, r6
++ subs r5, #1
++ vst1.8 {d6}, [r0], r3 @ store one 8-byte row
++ bne 1b
++
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d6, q2, #5
++ vst1.8 {d6}, [r0] @ last row
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_8 - angular prediction writing 16 rows of 16 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_16_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ ldrsb r6, [r4, r12] @ r6 = angle for this mode (initial frac)
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle for this mode (per-step add)
++ bge 26f @ modes 26..34
++ cmp r12, #18
++ bge 18f @ modes 18..25
++ cmp r12, #10
++ bge 10f @ modes 10..17
++
++@ Down of Horizontal - works down left
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_8x8_8 @ rows 0-7, left patch
++ bl patch_h_down_8x8_8_continue @ rows 0-7, right patch
++
++ add r2, r1, #8 @ restore r2, but 8 rows further down left
++ sub r0, #16 @ back to the left edge (2 patches of 8 bytes)
++ mov r6, r4 @ restart with frac = angle
++ add r0, r0, r3, lsl #3 @ down 8 rows
++
++ bl patch_h_down_8x8_8 @ rows 8-15, left patch
++ bl patch_h_down_8x8_8_continue @ rows 8-15, right patch
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ mov r10, #-128 @ initial inv angle accumulator
++
++ push {r2}
++ bl patch_h_up_8x8_8 @ rows 0-7, left patch
++ bl patch_h_up_8x8_8_continue @ rows 0-7, right patch
++ pop {r2}
++
++ sub r0, #16 @ back to the left edge (2 patches of 8 bytes)
++ mov r10, #-128
++ add r2, #8 @ 8 rows further down left
++ add r0, r0, r3, lsl #3 @ down 8 rows
++ sub r10, r10, r7, lsl #3 @ accumulator restarts at -128 - 8 * inv angle
++
++ bl patch_h_up_8x8_8 @ rows 8-15, left patch
++ bl patch_h_up_8x8_8_continue @ rows 8-15, right patch
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.8 {q9}, [r1] @ q9 = 16 top bytes
++ sub r1, r2, #1 @ r1 = address of top-left
++ rsb r12, r6, #32
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ vdup.8 d6, r6
++ vext.8 q8, q9, q9, #15 @ q8 = q9 moved up one byte ...
++ sub r8, r7, #128 @ r8 = inv angle accumulator
++ vld1.8 {d16[0]}, [r1] @ ... with top-left in byte 0
++ vdup.8 d7, r12
++ mov r5, #15 @ 15 rows in the loops, last row after them
++1: @ loop using q8/q9 as the reference pair; prepares q10/q11 as the next pair
++ vmull.u8 q0, d18, d7
++ subs r12, r4 @ CC (borrow) => frac wrapped, reference pair must move
++ vmlal.u8 q0, d16, d6
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d19, d7
++ it cc
++ addcc r1, r2, r8, asr #8 @ address in left of the next sample
++ vmlal.u8 q1, d17, d6
++ rsb r6, r12, #32
++ vext.8 q10, q8, q8, #15
++ sub r5, #1
++ vld1.8 {d20[0]}, [r1]
++ it cc
++ addcc r8, r7 @ step the inv angle accumulator
++ vmov q11, q8
++ teq r5, #0 @ sets Z when done. NOTE(review): branches below need C from the subs to survive this teq - confirm
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3 @ store one 16-byte row
++ bhi 1b @ not done, no wrap: same pair again
++ beq 4f @ done
++2: @ loop using q10/q11 as the reference pair; prepares q8/q9 as the next pair
++ vmull.u8 q0, d22, d7
++ subs r12, r4
++ vmlal.u8 q0, d20, d6
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d23, d7
++ it cc
++ addcc r1, r2, r8, asr #8
++ vmlal.u8 q1, d21, d6
++ rsb r6, r12, #32
++ vext.8 q8, q10, q10, #15
++ sub r5, #1
++ vld1.8 {d16[0]}, [r1]
++ it cc
++ addcc r8, r7
++ vmov q9, q10
++ teq r5, #0
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3
++ bhi 2b @ not done, no wrap: same pair again
++ bne 1b @ not done, wrapped: switch to the q8/q9 pair
++ bcc 5f @ done and wrapped: last row from q8/q9
++3: @ last row from the q10/q11 pair
++ vmull.u8 q0, d22, d7
++ vmlal.u8 q0, d20, d6
++ vmull.u8 q1, d23, d7
++ vmlal.u8 q1, d21, d6
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ loop 1 ended on a wrap: last row from q10/q11
++5: @ last row from the q8/q9 pair
++ vmull.u8 q0, d18, d7
++ vmlal.u8 q0, d16, d6
++ vmull.u8 q1, d19, d7
++ vmlal.u8 q1, d17, d6
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q9}, [r1]! @ q9 = 16 top bytes; r1 += 16
++ rsb r12, r6, #32
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vext.8 q8, q9, q9, #1 @ q8 = q9 moved down one byte ...
++ vld1.8 {d17[7]}, [r1]! @ ... with the next top byte in the last lane; r1 += 1
++ mov r5, #15 @ 15 rows in the loops, last row after them
++1: @ loop using q8/q9 as the reference pair; prepares q10/q11 as the next pair
++ vmull.u8 q0, d16, d6
++ subs r12, r4 @ CC (borrow) => frac wrapped, reference pair must move
++ vmlal.u8 q0, d18, d7
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d17, d6
++ rsb r6, r12, #32
++ vmlal.u8 q1, d19, d7
++ sub r5, #1
++ vext.8 q10, q8, q8, #1
++ teq r5, #0 @ sets Z when done. NOTE(review): branches below need C from the subs to survive this teq - confirm
++ vld1.8 {d21[7]}, [r1]
++ it cc
++ addcc r1, #1 @ only consume the top byte when the pair moves
++ vmov q11, q8
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3 @ store one 16-byte row
++ bhi 1b @ not done, no wrap: same pair again
++ beq 4f @ done
++2: @ loop using q10/q11 as the reference pair; prepares q8/q9 as the next pair
++ vmull.u8 q0, d20, d6
++ subs r12, r4
++ vmlal.u8 q0, d22, d7
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d21, d6
++ rsb r6, r12, #32
++ vmlal.u8 q1, d23, d7
++ sub r5, #1
++ vext.8 q8, q10, q10, #1
++ teq r5, #0
++ vld1.8 {d17[7]}, [r1]
++ it cc
++ addcc r1, #1
++ vmov q9, q10
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3
++ bhi 2b @ not done, no wrap: same pair again
++ bne 1b @ not done, wrapped: switch to the q8/q9 pair
++ bcc 5f @ done and wrapped: last row from q8/q9
++3: @ last row from the q10/q11 pair
++ vmull.u8 q0, d20, d6
++ vmlal.u8 q0, d22, d7
++ vmull.u8 q1, d21, d6
++ vmlal.u8 q1, d23, d7
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ loop 1 ended on a wrap: last row from q10/q11
++5: @ last row from the q8/q9 pair
++ vmull.u8 q0, d16, d6
++ vmlal.u8 q0, d18, d7
++ vmull.u8 q1, d17, d6
++ vmlal.u8 q1, d19, d7
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_8 - angular prediction writing 32 rows of 32 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_32_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ ldrsb r6, [r4, r12] @ r6 = angle for this mode (initial frac)
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle for this mode (per-step add)
++ bge 26f @ modes 26..34
++ cmp r12, #18
++ bge 18f @ modes 18..25
++ cmp r12, #10
++ bge 10f @ modes 10..17
++
++@ Down of Horizontal - works down left
++ mov r10, #4 @ 4 bands of 8 rows
++ mov r1, r2 @ r1 keeps the left pointer for the current band
++1:
++ bl patch_h_down_8x8_8 @ 4 patches of 8 bytes across the band
++ bl patch_h_down_8x8_8_continue
++ bl patch_h_down_8x8_8_continue
++ bl patch_h_down_8x8_8_continue
++
++ add r2, r1, #8 @ restore r2, but 8 rows further down left
++ add r1, r1, #8
++ mov r6, r4 @ restart with frac = angle
++ sub r0, #32 @ back to the left edge
++ subs r10, #1
++ add r0, r0, r3, lsl #3 @ down 8 rows
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ mov r10, #-128 @ initial inv angle accumulator
++ vmov.i8 d6, #1<<2 @ band counter kept in d6: 4, 2, 1, 0 => 4 passes
++1:
++ push {r2,r10}
++ bl patch_h_up_8x8_8 @ 4 patches of 8 bytes across the band
++ bl patch_h_up_8x8_8_continue
++ bl patch_h_up_8x8_8_continue
++ bl patch_h_up_8x8_8_continue
++ pop {r2,r10}
++
++ vmov r8, s12 @ read the counter before it is shifted
++ sub r0, #32 @ back to the left edge
++ add r2, #8 @ 8 rows further down left
++ add r0, r0, r3, lsl #3 @ down 8 rows
++ sub r10, r10, r7, lsl #3 @ accumulator start for the next band: 8 * inv angle lower
++ vshr.u8 d6, #1
++ teq r8, #0
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.8 {q0-q1}, [r1] @ q0-q1 = 32 top bytes
++ sub r9, r2, #1 @ r9 = address of top-left
++ rsb r12, r6, #32
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ mov r8, #-128 @ r8 = inv angle accumulator
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ mov r5, #32 @ 32 rows
++1: @ move the reference on by one sample
++ vld1.8 {d17[7]}, [r9] @ sample to bring in, placed in the top byte of q8
++ add r8, r7 @ step the inv angle accumulator
++ vmov q2, q0 @ q2-q3 = previous reference
++ vmov q3, q1
++ add r9, r2, r8, asr #8 @ address in left of the following sample
++ vext.8 q1, q0, q1, #15
++ vext.8 q0, q8, q0, #15 @ q0-q1 = reference moved up one byte, new sample in byte 0
++2: @ one row per pass
++ vmull.u8 q10, d4, d19
++ subs r12, r4 @ CC (borrow) => frac wrapped, reference must move
++ vmlal.u8 q10, d0, d18
++ it cc
++ addcc r12, #32
++ vmull.u8 q11, d5, d19
++ rsb r6, r12, #32
++ vmlal.u8 q11, d1, d18
++ sub r5, #1
++ vmull.u8 q12, d6, d19
++ teq r5, #0 @ sets Z when done. NOTE(review): branches below need C from the subs to survive this teq - confirm
++ vmlal.u8 q12, d2, d18
++ vmull.u8 q13, d7, d19
++ vmlal.u8 q13, d3, d18
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vrshrn.u16 d22, q12, #5
++ vrshrn.u16 d23, q13, #5
++ vst1.8 {q10-q11}, [r0], r3 @ store one 32-byte row
++ bhi 2b @ not done, no wrap
++ bne 1b @ not done, wrapped
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32
++ vld1.8 {q0-q1}, [r1]! @ q0-q1 = 32 top bytes; r1 += 32
++ rsb r12, r6, #32
++ vld1.8 {d16[0]}, [r5] @ byte after those 32, held in byte 0 of q8
++ mov r5, #32 @ 32 rows
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++1: @ move the reference on by one sample
++ vmov q2, q0 @ q2-q3 = previous reference
++ add r1, #1
++ vmov q3, q1
++ vext.8 q0, q0, q1, #1
++ vext.8 q1, q1, q8, #1 @ q0-q1 = reference moved down one byte, q8 byte 0 in the last lane
++2: @ one row per pass
++ vmull.u8 q10, d0, d18
++ subs r12, r4 @ CC (borrow) => frac wrapped, reference must move
++ vmlal.u8 q10, d4, d19
++ it cc
++ addcc r12, #32
++ vmull.u8 q11, d1, d18
++ rsb r6, r12, #32
++ vmlal.u8 q11, d5, d19
++ sub r5, #1
++ vmull.u8 q12, d2, d18
++ teq r5, #0 @ sets Z when done. NOTE(review): branches below need C from the subs to survive this teq - confirm
++ vmlal.u8 q12, d6, d19
++ vmull.u8 q13, d3, d18
++ vmlal.u8 q13, d7, d19
++ vld1.8 {d16[0]}, [r1] @ next byte to bring in on the following move
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vrshrn.u16 d22, q12, #5
++ vrshrn.u16 d23, q13, #5
++ vst1.8 {q10-q11}, [r0], r3 @ store one 32-byte row
++ bhi 2b @ not done, no wrap
++ bne 1b @ not done, wrapped
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ Chroma 8 bit 4x4 patch fns
++ .text
++
++patch_h_down_c_4x4_8:
++ ldrd r8, r9, [r2] @ Left
++ rsb r12, r6, #32 @ r12 = 32 - r6 (r6 = initial angle frac from caller)
++ vmov d0, r8, r9 @ d0 = first 8 reference bytes (4 halfword elements)
++ vdup.8 d3, r6
++ lsr r8, #16
++ vdup.8 d2, r12
++ orr r8, r8, r9, lsl #16 @ r9:r8 window moved on by 16 bits (one element)
++ ldr r9, [r2, #6]! @ upper word of the moved window; r2 += 6
++ vmov d1, r8, r9 @ d1 = reference window one element further on than d0
++ // drop through...
++patch_h_down_c_4x4_8_continue:
++ mov r5, #4 @ 4 result vectors per patch
++1:
++ subs r12, r4 @ step by the angle; MI => went negative, window must move
++ vmull.u8 q2, d0, d2 @ q2 = d0 * d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3 @ q2 += d1 * d3
++ rsb r6, r12, #32
++ vext.8 q8, q8, q9, #8 @ with the vmov below: move d17-d19 down to d16-d18
++ it mi
++ lsrmi r7, r8, #16
++ vmov d18, d19
++ it mi
++ vmovmi d0, r8, r9 @ old d1 window becomes new d0
++ vdup.8 d2, r12
++ it mi
++ orrmi r8, r7, r9, lsl #16
++ vrshrn.u16 d19, q2, #5 @ newest result: rounding shift right by 5, narrowed to 8 bit
++ itt mi
++ ldrmi r9, [r2, #2]! @ reload upper word one element further on; r2 += 2
++ vmovmi d1, r8, r9
++ subs r5, #1
++ vdup.8 d3, r6
++ bne 1b
++ // drop through...
++store_tran_c_4x4_8:
++ vzip.16 d16, d17 @ vzip.16/.32 sequence transposes the 4x4 halfword elements in d16-d19
++ add r6, r0, r3 @ r6 = row 1
++ vzip.16 d18, d19
++ lsl r3, #1 @ r3 = 2 * stride while storing
++ vzip.32 q8, q9
++ add r5, r0, r3 @ r5 = row 2
++ vst1.16 {d16}, [r0]! @ row 0; r0 += 8 so it points at the next patch along
++ vst1.16 {d17}, [r6], r3 @ row 1
++ vst1.16 {d18}, [r5] @ row 2
++ asr r3, #1 @ restore r3 to the single stride
++ vst1.16 {d19}, [r6] @ row 3
++
++ bx lr
++
++patch_h_up_c_4x4_8:
++ ldrd r8, r9, [r2] @ first 8 reference bytes at r2 (4 halfword elements)
++ rsb r6, r4, #32
++ vmov d0, r8, r9
++ vdup.8 d3, r4
++ lsr r11, r8, #16
++ vdup.8 d2, r6
++ ldr r8, [r2, #-2]! @ word starting one element earlier; r2 -= 2
++ orr r9, r11, r9, lsl #16
++ vmov d1, r8, r9 @ d1 = reference window one element before d0
++ mov r12, r4 @ frac starts at one angle step
++ vmull.u8 q2, d0, d2 @ first result is part-computed before entering the loop
++ vmlal.u8 q2, d1, d3
++patch_h_up_c_4x4_8_continue:
++ mov r5, #4 @ 4 result vectors per patch
++1:
++ add r12, r4
++ cmp r12, #33 @ CS => frac has passed 32, window must move
++ it cs
++ addcs r10, r7 @ step the inv angle accumulator
++ mov r11, #0 @ default offset 0 for the load through r1
++ itt cs
++ subcs r12, #32
++ tstcs r10, #1<<31 @ EQ => accumulator non-negative. NOTE(review): later cs/hi tests need C to stay set over this tst - confirm
++ rsb r6, r12, #32
++ it eq
++ asreq r11, r10, #7 @ with the bic below: byte offset = 2 * (accumulator >> 8)
++ it cs
++ vmovcs d0, r8, r9 @ old d1 window becomes new d0
++ it eq
++ biceq r11, #1
++ vdup.8 d2, r6
++ it cs
++ lsrcs r6, r8, #16
++ vdup.8 d3, r12
++ vext.8 q8, q8, q9, #8 @ with the vmov below: move d17-d19 down to d16-d18
++ itt cs
++ orrcs r9, r6, r9, lsl #16
++ ldrhcs r11, [r1, r11] @ new element from r1 at the computed offset
++ vmov d18, d19
++ it hi
++ ldrhhi r11, [r2, #-2]! @ HI (Z clear after the tst): take the element before r2 instead; r2 -= 2
++ vrshrn.u16 d19, q2, #5 @ newest result: rounding shift right by 5, narrowed to 8 bit
++ itt cs
++ orrcs r8, r11, r8, lsl #16 @ new element enters the bottom halfword of the window
++ vmovcs d1, r8, r9
++ vmull.u8 q2, d0, d2 @ start the next result
++ subs r5, #1
++ vmlal.u8 q2, d1, d3
++ bne 1b
++
++ b store_tran_c_4x4_8
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_8 - chroma angular prediction writing 4 rows of 8 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ lsl r3, #1 @ stride argument is doubled to get the byte step between rows
++ ldrsb r6, [r4, r12] @ r6 = angle for this mode (initial frac)
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle for this mode (per-step add)
++ bge 26f @ modes 26..34
++ cmp r12, #18
++ bge 18f @ modes 18..25
++ cmp r12, #10
++ bge 10f @ modes 10..17
++
++@ Down of Horizontal - works down left
++ bl patch_h_down_c_4x4_8 @ a single 4x4 patch is the whole block
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ mov r10, #-128 @ initial inv angle accumulator
++ bl patch_h_up_c_4x4_8
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ ldrh lr, [r2, #-2] @ Top-left
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ vmov d0, r8, r9
++ lsl r9, r9, #16
++ vdup.8 d2, r12
++ orr r9, r9, r8, lsr #16
++ orr r8, lr, r8, lsl #16 @ r9:r8 = window moved back one element, top-left in the bottom halfword
++ vmov d1, r8, r9
++ sub r1, r7, #128 @ r1 reused as the inv angle accumulator
++ mov r5, #3 @ 3 rows in the loop, last row after it
++1:
++ vdup.8 d3, r6
++ vmull.u8 q2, d0, d2
++ subs r12, r12, r4 @ MI => frac wrapped, window must move
++ vmlal.u8 q2, d1, d3
++ itttt mi
++ addmi lr, r2, r1, asr #7 @ with the bic: address = left + 2 * (accumulator >> 8)
++ bicmi lr, #1
++ addmi r12, r12, #32
++ vmovmi d0, r8, r9 @ old d1 window becomes new d0
++ rsb r6, r12, #32
++ itt mi
++ lslmi r9, r9, #16
++ ldrhmi lr, [lr]
++ vdup.8 d2, r12
++ vrshrn.u16 d4, q2, #5
++ itttt mi
++ orrmi r9, r9, r8, lsr #16
++ orrmi r8, lr, r8, lsl #16 @ new element enters the bottom halfword
++ vmovmi d1, r8, r9
++ addmi r1, r1, r7 @ step the inv angle accumulator
++ subs r5, r5, #1
++ vst1.16 {d4}, [r0], r3 @ store one 8-byte row
++ bne 1b
++
++ vdup.8 d3, r6
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d4, q2, #5
++ vst1.16 {d4}, [r0] @ last row
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ vmov d0, r8, r9
++ vdup.8 d3, r6
++ mov r5, #3 @ 3 rows in the loop, last row after it
++ lsr r8, #16
++ vdup.8 d2, r12
++ orr r8, r8, r9, lsl #16 @ r9:r8 window moved on by 16 bits (one element)
++ ldr r9, [r1, #6]! @ upper word of the moved window; r1 += 6
++ vmov d1, r8, r9
++1:
++ vmull.u8 q2, d0, d2
++ subs r12, r4 @ MI => frac wrapped, window must move
++ vmlal.u8 q2, d1, d3
++ it mi
++ addmi r12, #32
++ rsb r6, r12, #32
++ itt mi
++ vmovmi d0, r8, r9 @ old d1 window becomes new d0
++ lsrmi r8, #16
++ vdup.8 d2, r12
++ itt mi
++ orrmi r8, r8, r9, lsl #16
++ ldrmi r9, [r1, #2]! @ reload upper word one element further on; r1 += 2
++ vrshrn.u16 d6, q2, #5
++ it mi
++ vmovmi d1, r8, r9
++ vdup.8 d3, r6
++ subs r5, #1
++ vst1.16 {d6}, [r0], r3 @ store one 8-byte row
++ bne 1b
++
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d6, q2, #5
++ vst1.16 {d6}, [r0] @ last row
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_8 - chroma angular prediction writing 8 rows of 16 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ lsl r3, #1 @ stride argument is doubled to get the byte step between rows
++ ldrsb r6, [r4, r12] @ r6 = angle for this mode (initial frac)
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle for this mode (per-step add)
++ bge 26f @ modes 26..34
++ cmp r12, #18
++ bge 18f @ modes 18..25
++ cmp r12, #10
++ bge 10f @ modes 10..17
++
++@ Down of Horizontal - works down left
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_c_4x4_8 @ rows 0-3, left patch
++ bl patch_h_down_c_4x4_8_continue @ rows 0-3, right patch
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ sub r0, #16 @ back to the left edge (2 patches of 8 bytes)
++ mov r6, r4 @ restart with frac = angle
++ add r0, r0, r3, lsl #2 @ down 4 rows
++
++ bl patch_h_down_c_4x4_8 @ rows 4-7, left patch
++ bl patch_h_down_c_4x4_8_continue @ rows 4-7, right patch
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ mov r10, #-128 @ initial inv angle accumulator
++
++ push {r2}
++ bl patch_h_up_c_4x4_8 @ rows 0-3, left patch
++ bl patch_h_up_c_4x4_8_continue @ rows 0-3, right patch
++ pop {r2}
++
++ sub r0, #16 @ back to the left edge (2 patches of 8 bytes)
++ mov r10, #-128
++ add r2, #8 @ 4 elements (8 bytes) further down left
++ add r0, r0, r3, lsl #2 @ down 4 rows
++ sub r10, r10, r7, lsl #2 @ accumulator restarts at -128 - 4 * inv angle
++
++ bl patch_h_up_c_4x4_8 @ rows 4-7, left patch
++ bl patch_h_up_c_4x4_8_continue @ rows 4-7, right patch
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.8 {q9}, [r1] @ q9 = 16 top bytes (8 halfword elements)
++ sub r1, r2, #2 @ r1 = address of top-left
++ rsb r12, r6, #32
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ vdup.8 d6, r6
++ vext.8 q8, q9, q9, #14 @ q8 = q9 moved up one element ...
++ sub r8, r7, #128 @ r8 = inv angle accumulator
++ vld1.16 {d16[0]}, [r1] @ ... with top-left in element 0
++ vdup.8 d7, r12
++ mov r5, #7 @ 7 rows in the loops, last row after them
++1: @ loop using q8/q9 as the reference pair; prepares q10/q11 as the next pair
++ subs r12, r4 @ CC (borrow) => frac wrapped, reference pair must move
++ vmull.u8 q0, d18, d7
++ it cc
++ asrcc r1, r8, #8
++ vmlal.u8 q0, d16, d6
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d19, d7
++ it cc
++ addcc r1, r2, r1, lsl #1 @ address = left + 2 * (accumulator >> 8)
++ vmlal.u8 q1, d17, d6
++ rsb r6, r12, #32
++ vext.8 q10, q8, q8, #14
++ sub r5, #1
++ vld1.16 {d20[0]}, [r1]
++ it cc
++ addcc r8, r7 @ step the inv angle accumulator
++ vmov q11, q8
++ teq r5, #0 @ sets Z when done. NOTE(review): branches below need C from the subs to survive this teq - confirm
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3 @ store one 16-byte row
++ bhi 1b @ not done, no wrap: same pair again
++ beq 4f @ done
++2: @ loop using q10/q11 as the reference pair; prepares q8/q9 as the next pair
++ subs r12, r4
++ vmull.u8 q0, d22, d7
++ it cc
++ asrcc r1, r8, #8
++ vmlal.u8 q0, d20, d6
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d23, d7
++ it cc
++ addcc r1, r2, r1, lsl #1
++ vmlal.u8 q1, d21, d6
++ rsb r6, r12, #32
++ vext.8 q8, q10, q10, #14
++ sub r5, #1
++ vld1.16 {d16[0]}, [r1]
++ it cc
++ addcc r8, r7
++ vmov q9, q10
++ teq r5, #0
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3
++ bhi 2b @ not done, no wrap: same pair again
++ bne 1b @ not done, wrapped: switch to the q8/q9 pair
++ bcc 5f @ done and wrapped: last row from q8/q9
++3: @ last row from the q10/q11 pair
++ vmull.u8 q0, d22, d7
++ vmlal.u8 q0, d20, d6
++ vmull.u8 q1, d23, d7
++ vmlal.u8 q1, d21, d6
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ loop 1 ended on a wrap: last row from q10/q11
++5: @ last row from the q8/q9 pair
++ vmull.u8 q0, d18, d7
++ vmlal.u8 q0, d16, d6
++ vmull.u8 q1, d19, d7
++ vmlal.u8 q1, d17, d6
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q9}, [r1]! @ q9 = 16 top bytes (8 halfword elements); r1 += 16
++ rsb r12, r6, #32
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vext.8 q8, q9, q9, #2 @ q8 = q9 moved down one element ...
++ vld1.16 {d17[3]}, [r1]! @ ... with the next top element in the last lane; r1 += 2
++ mov r5, #7 @ 7 rows in the loops, last row after them
++1: @ loop using q8/q9 as the reference pair; prepares q10/q11 as the next pair
++ vmull.u8 q0, d16, d6
++ subs r12, r4 @ CC (borrow) => frac wrapped, reference pair must move
++ vmlal.u8 q0, d18, d7
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d17, d6
++ rsb r6, r12, #32
++ vmlal.u8 q1, d19, d7
++ sub r5, #1
++ vext.8 q10, q8, q8, #2
++ teq r5, #0 @ sets Z when done. NOTE(review): branches below need C from the subs to survive this teq - confirm
++ vld1.16 {d21[3]}, [r1]
++ it cc
++ addcc r1, #2 @ only consume the top element when the pair moves
++ vmov q11, q8
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3 @ store one 16-byte row
++ bhi 1b @ not done, no wrap: same pair again
++ beq 4f @ done
++2: @ loop using q10/q11 as the reference pair; prepares q8/q9 as the next pair
++ vmull.u8 q0, d20, d6
++ subs r12, r4
++ vmlal.u8 q0, d22, d7
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d21, d6
++ rsb r6, r12, #32
++ vmlal.u8 q1, d23, d7
++ sub r5, #1
++ vext.8 q8, q10, q10, #2
++ teq r5, #0
++ vld1.16 {d17[3]}, [r1]
++ it cc
++ addcc r1, #2
++ vmov q9, q10
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3
++ bhi 2b @ not done, no wrap: same pair again
++ bne 1b @ not done, wrapped: switch to the q8/q9 pair
++ bcc 5f @ done and wrapped: last row from q8/q9
++3: @ last row from the q10/q11 pair
++ vmull.u8 q0, d20, d6
++ vmlal.u8 q0, d22, d7
++ vmull.u8 q1, d21, d6
++ vmlal.u8 q1, d23, d7
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ loop 1 ended on a wrap: last row from q10/q11
++5: @ last row from the q8/q9 pair
++ vmull.u8 q0, d16, d6
++ vmlal.u8 q0, d18, d7
++ vmull.u8 q1, d17, d6
++ vmlal.u8 q1, d19, d7
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_8 - chroma angular prediction writing 16 rows of 32 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ lsl r3, #1 @ stride argument is doubled to get the byte step between rows
++ ldrsb r6, [r4, r12] @ r6 = angle for this mode (initial frac)
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle for this mode (per-step add)
++ bge 26f @ modes 26..34
++ cmp r12, #18
++ bge 18f @ modes 18..25
++ cmp r12, #10
++ bge 10f @ modes 10..17
++
++@ Down of Horizontal - works down left
++ mov r10, #4 @ 4 bands of 4 rows
++ mov r1, r2 @ r1 keeps the left pointer for the current band
++1:
++ bl patch_h_down_c_4x4_8 @ 4 patches of 8 bytes across the band
++ bl patch_h_down_c_4x4_8_continue
++ bl patch_h_down_c_4x4_8_continue
++ bl patch_h_down_c_4x4_8_continue
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ add r1, r1, #4*2
++ mov r6, r4 @ restart with frac = angle
++ sub r0, #32 @ back to the left edge
++ subs r10, #1
++ add r0, r0, r3, lsl #2 @ down 4 rows
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ mov r10, #-128 @ initial inv angle accumulator
++ vmov.i8 d6, #1<<2 @ band counter kept in d6: 4, 2, 1, 0 => 4 passes
++1:
++ push {r2, r10}
++ bl patch_h_up_c_4x4_8 @ 4 patches of 8 bytes across the band
++ bl patch_h_up_c_4x4_8_continue
++ bl patch_h_up_c_4x4_8_continue
++ bl patch_h_up_c_4x4_8_continue
++ pop {r2, r10}
++
++ vmov r8, s12 @ read the counter before it is shifted
++ sub r0, #32 @ back to the left edge
++ add r2, #8 @ 4 elements (8 bytes) further down left
++ add r0, r0, r3, lsl #2 @ down 4 rows
++ sub r10, r10, r7, lsl #2 @ accumulator start for the next band: 4 * inv angle lower
++ vshr.u8 d6, #1
++ teq r8, #0
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.8 {q0-q1}, [r1] @ q0-q1 = 32 top bytes (16 halfword elements)
++ sub r9, r2, #2 @ r9 = address of top-left
++ rsb r12, r6, #32
++ ldrh r7, [r7] @ r7 = inv angle for this mode
++ mov r8, #-128 @ r8 = inv angle accumulator
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ mov r5, #16 @ 16 rows
++1: @ move the reference on by one element
++ vld1.16 {d17[3]}, [r9] @ element to bring in, placed in the top halfword of q8
++ add r8, r7 @ step the inv angle accumulator
++ vmov q2, q0 @ q2-q3 = previous reference
++ vmov q3, q1
++ asr r9, r8, #8
++ vext.8 q1, q0, q1, #14
++ add r9, r2, r9, lsl #1 @ address = left + 2 * (accumulator >> 8)
++ vext.8 q0, q8, q0, #14 @ q0-q1 = reference moved up one element, new element first
++2: @ one row per pass
++ vmull.u8 q10, d4, d19
++ subs r12, r4 @ CC (borrow) => frac wrapped, reference must move
++ vmlal.u8 q10, d0, d18
++ it cc
++ addcc r12, #32
++ vmull.u8 q11, d5, d19
++ rsb r6, r12, #32
++ vmlal.u8 q11, d1, d18
++ sub r5, #1
++ vmull.u8 q12, d6, d19
++ teq r5, #0 @ sets Z when done. NOTE(review): branches below need C from the subs to survive this teq - confirm
++ vmlal.u8 q12, d2, d18
++ vmull.u8 q13, d7, d19
++ vmlal.u8 q13, d3, d18
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vrshrn.u16 d22, q12, #5
++ vrshrn.u16 d23, q13, #5
++ vst1.8 {q10-q11}, [r0], r3 @ store one 32-byte row
++ bhi 2b @ not done, no wrap
++ bne 1b @ not done, wrapped
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32
++ vld1.8 {q0-q1}, [r1]! @ q0-q1 = 32 top bytes (16 halfword elements); r1 += 32
++ rsb r12, r6, #32
++ vld1.16 {d16[0]}, [r5] @ element after those 32 bytes, held in element 0 of q8
++ mov r5, #16 @ 16 rows
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++1: @ move the reference on by one element
++ vmov q2, q0 @ q2-q3 = previous reference
++ add r1, #2
++ vmov q3, q1
++ vext.8 q0, q0, q1, #2
++ vext.8 q1, q1, q8, #2 @ q0-q1 = reference moved down one element, q8 element 0 in the last lane
++2: @ one row per pass
++ vmull.u8 q10, d0, d18
++ subs r12, r4 @ CC (borrow) => frac wrapped, reference must move
++ vmlal.u8 q10, d4, d19
++ it cc
++ addcc r12, #32
++ vmull.u8 q11, d1, d18
++ rsb r6, r12, #32
++ vmlal.u8 q11, d5, d19
++ sub r5, #1
++ vmull.u8 q12, d2, d18
++ teq r5, #0 @ sets Z when done. NOTE(review): branches below need C from the subs to survive this teq - confirm
++ vmlal.u8 q12, d6, d19
++ vmull.u8 q13, d3, d18
++ vmlal.u8 q13, d7, d19
++ vld1.16 {d16[0]}, [r1] @ next element to bring in on the following move
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vrshrn.u16 d22, q12, #5
++ vrshrn.u16 d23, q13, #5
++ vst1.8 {q10-q11}, [r0], r3 @ store one 32-byte row
++ bhi 2b @ not done, no wrap
++ bne 1b @ not done, wrapped
++
++ pop {r4-r11, pc}
++
++endfunc
++
++@------------------------------------------------------------------------------
++@ Data
++
++ .text
++ .balign 64
++angle_2: @ one signed byte per mode, first entry is mode 2 (code indexes angle_2 - 2 by mode)
++ .byte 32 @ mode 2
++ .byte 26, 21, 17, 13, 9, 5, 2, 0 @ modes 3-10
++ @ Sign inverted from standards table
++ .byte 2, 5, 9, 13, 17, 21, 26, 32 @ modes 11-18
++ .byte 26, 21, 17, 13, 9, 5, 2, 0 @ modes 19-26
++ @ Standard sign
++ .byte 2, 5, 9, 13, 17, 21, 26, 32 @ modes 27-34
++
++ .balign 2
++
++ @ Sign inverted from standards table
++inv_angle: @ one halfword per mode, first entry is mode 11 (code indexes inv_angle - 11*2 by mode * 2)
++ .short 4096, 1638, 910, 630, 482, 390, 315 @ modes 11-17
++ .short 256 @ mode 18
++ .short 315, 390, 482, 630, 910, 1638, 4096 @ modes 19-25
++
++@------------------------------------------------------------------------------
++@
++@ 10 bit fns
++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code
++@ but runs out of register width for 12+ bit
++
++ .text
++ .balign 64
++
++patch_h_down_4x4_10:
++ ldrd r8, r9, [r2] @ Left
++ rsb r12, r6, #32 @ r12 = 32 - r6 (r6 = initial angle frac from caller)
++ vmov d0, r8, r9 @ d0 = first 4 halfword reference samples
++ vdup.16 d3, r6
++ lsr r8, #16
++ vdup.16 d2, r12
++ orr r8, r8, r9, lsl #16 @ r9:r8 window moved on by 16 bits (one sample)
++ ldr r9, [r2, #6]! @ upper word of the moved window; r2 += 6
++ vmov d1, r8, r9 @ d1 = reference window one sample further on than d0
++ // drop through...
++patch_h_down_4x4_10_continue:
++ mov r5, #4 @ 4 result vectors per patch
++1:
++ subs r12, r4 @ step by the angle; MI => went negative, window must move
++ vmul.u16 d4, d0, d2 @ d4 = d0 * d2, kept in 16-bit lanes
++ it mi
++ addmi r12, #32
++ vmla.u16 d4, d1, d3 @ d4 += d1 * d3
++ rsb r6, r12, #32
++ vext.16 q8, q8, q9, #4 @ with the vmov below: move d17-d19 down to d16-d18
++ it mi
++ lsrmi r7, r8, #16
++ vmov d18, d19
++ it mi
++ vmovmi d0, r8, r9 @ old d1 window becomes new d0
++ vdup.16 d2, r12
++ it mi
++ orrmi r8, r7, r9, lsl #16
++ vrshr.u16 d19, d4, #5 @ newest result: rounding shift right by 5
++ itt mi
++ ldrmi r9, [r2, #2]! @ reload upper word one sample further on; r2 += 2
++ vmovmi d1, r8, r9
++ subs r5, #1
++ vdup.16 d3, r6
++ bne 1b
++ // drop through...
++store_tran_4x4_10:
++ vzip.16 d16, d17 @ vzip.16/.32 sequence transposes the 4x4 halfwords in d16-d19
++ add r6, r0, r3 @ r6 = row 1
++ vzip.16 d18, d19
++ lsl r3, #1 @ r3 = 2 * stride while storing
++ vzip.32 q8, q9
++ add r5, r0, r3 @ r5 = row 2
++ vst1.16 {d16}, [r0]! @ row 0; r0 += 8 so it points at the next patch along
++ vst1.16 {d17}, [r6], r3 @ row 1
++ vst1.16 {d18}, [r5] @ row 2
++ asr r3, #1 @ restore r3 to the single stride
++ vst1.16 {d19}, [r6] @ row 3
++
++ bx lr
++
++patch_h_up_4x4_10:
++ ldrd r8, r9, [r2] @ r8:r9 = four 16-bit samples at [r2]
++ rsb r6, r4, #32 @ r6 = 32 - angle
++ vmov d0, r8, r9 @ d0 = samples 0..3
++ vdup.16 d3, r4 @ d3 = angle (weight for d1)
++ lsr r11, r8, #16
++ vdup.16 d2, r6 @ d2 = 32 - angle (weight for d0)
++ ldr r8, [r2, #-2]! @ r8 = samples -1,0; r2 -= 2
++ orr r9, r11, r9, lsl #16 @ r9 = samples 1,2
++ vmov d1, r8, r9 @ d1 = samples -1..2
++ mov r12, r4 @ r12 = running fraction
++ vmul.u16 d4, d0, d2
++ vmla.u16 d4, d1, d3 @ first result is computed ahead of the loop
++patch_h_up_4x4_10_continue:
++ mov r5, #4 @ four passes; each stores the pending d4 and computes the next
++1:
++ add r12, r4
++ cmp r12, #33 @ cs = fraction went past 32, so the sample window must move
++ it cs
++ addcs r10, r7 @ r10 accumulates r7 (the inverse angle loaded by the caller)
++ mov r11, #0
++ itt cs
++ subcs r12, #32
++ tstcs r10, #1<<31 @ eq = accumulator non-negative; presumably leaves C set - TODO confirm
++ rsb r6, r12, #32
++ it eq
++ asreq r11, r10, #7
++ it cs
++ vmovcs d0, r8, r9 @ new d0 = old d1
++ it eq
++ biceq r11, #1 @ r11 = even byte offset, i.e. 2 * (r10 >> 8)
++ vdup.16 d2, r6
++ it cs
++ lsrcs r6, r8, #16
++ vdup.16 d3, r12
++ vext.16 q8, q8, q9, #4 @ q8:q9 act as a 4-entry queue, same as patch_h_down
++ itt cs
++ orrcs r9, r6, r9, lsl #16
++ ldrhcs r11, [r1, r11] @ next sample from [r1] (up/top array) ...
++ vmov d18, d19
++ it hi
++ ldrhhi r11, [r2, #-2]! @ ... replaced by the next sample from [r2] when hi (accumulator still negative)
++ vrshr.u16 d19, d4, #5 @ queue the previously computed result, rounded >> 5
++ itt cs
++ orrcs r8, r11, r8, lsl #16
++ vmovcs d1, r8, r9
++ vmul.u16 d4, d0, d2
++ subs r5, #1
++ vmla.u16 d4, d1, d3 @ result for the next pass
++ bne 1b
++
++ b store_tran_4x4_10 @ tail call: transpose, store and return to our caller
++
++
++@ ff_hevc_rpi_pred_angular_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++@ 4x4 block of 16-bit samples; mode picks one of four paths: <10, 10..17, 18..25, >=26
++function ff_hevc_rpi_pred_angular_4_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2 @ biased base so the table is indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ biased base; first inv_angle entry is mode 11
++ add r7, r7, r12, lsl #1 @ r7 = &inv_angle[mode - 11]
++ lsl r3, #1 @ stride * 2 - presumably converts a sample stride to bytes, TODO confirm
++ ldrsb r6, [r4, r12] @ r6 = angle, used as the initial fraction
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle, used as the per-line step
++ bge 26f
++ cmp r12, #18
++ bge 18f
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ bl patch_h_down_4x4_10 @ one 4x4 patch is the whole block
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r10, #-128 @ initial inverse-angle accumulator for patch_h_up
++ bl patch_h_up_4x4_10
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ ldrh lr, [r2, #-2] @ Top-left
++ ldrh r7, [r7] @ r7 = inverse angle value
++ vmov d0, r8, r9 @ d0 = top[0..3]
++ lsl r9, r9, #16
++ vdup.16 d2, r12
++ orr r9, r9, r8, lsr #16
++ orr r8, lr, r8, lsl #16
++ vmov d1, r8, r9 @ d1 = top-left, top[0..2]
++ sub r1, r7, #128 @ r1 reused as the inverse-angle accumulator
++ mov r5, #3 @ three rows in the loop, the fourth after it
++1:
++ sel lr, lr, lr @ force pipeline 0 on Cortex-A53
++ vdup.16 d3, r6
++ vmul.u16 d4, d0, d2
++ subs r12, r12, r4 @ mi = fraction wrapped, window moves one sample left
++ vmla.u16 d4, d1, d3
++ itttt mi
++ addmi lr, r2, r1, asr #7
++ bicmi lr, #1 @ lr = &left[r1 >> 8] (16-bit elements)
++ addmi r12, r12, #32
++ vmovmi d0, r8, r9
++ rsb r6, r12, #32
++ itt mi
++ lslmi r9, r9, #16
++ ldrhmi lr, [lr] @ sample that enters the window from the left array
++ vdup.16 d2, r12
++ vrshr.u16 d4, d4, #5
++ itttt mi
++ orrmi r9, r9, r8, lsr #16
++ orrmi r8, lr, r8, lsl #16
++ vmovmi d1, r8, r9
++ addmi r1, r1, r7 @ advance the inverse-angle accumulator
++ subs r5, r5, #1
++ vst1.16 {d4}, [r0], r3
++ bne 1b
++
++ vdup.16 d3, r6
++ nop @ force next insn into pipeline 0 to enable
++ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53
++ vmla.u16 d4, d1, d3
++ vrshr.u16 d4, d4, #5
++ vst1.16 {d4}, [r0] @ fourth row
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ vmov d0, r8, r9 @ d0 = top[0..3]
++ vdup.16 d3, r6
++ lsr r8, #16
++ vdup.16 d2, r12
++ orr r8, r8, r9, lsl #16
++ ldr r9, [r1, #6]! @ r9 = top[3..4]; r1 += 6
++ vmov d1, r8, r9 @ d1 = top[1..4]
++ mov r5, #3 @ three rows in the loop, the fourth after it
++1:
++ vmul.u16 d4, d0, d2
++ subs r12, r4 @ mi = fraction wrapped, window moves one sample right
++ vmla.u16 d4, d1, d3
++ it mi
++ addmi r12, #32
++ rsb r6, r12, #32
++ itt mi
++ vmovmi d0, r8, r9
++ lsrmi r8, #16
++ vdup.16 d2, r12
++ itt mi
++ orrmi r8, r8, r9, lsl #16
++ ldrmi r9, [r1, #2]!
++ vrshr.u16 d4, d4, #5
++ it mi
++ vmovmi d1, r8, r9
++ vdup.16 d3, r6
++ subs r5, #1
++ vst1.16 {d4}, [r0], r3
++ bne 1b
++
++ vmul.u16 d4, d0, d2
++ vmla.u16 d4, d1, d3
++ vrshr.u16 d4, d4, #5
++ vst1.16 {d4}, [r0] @ fourth row
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++@ 8x8 block of 16-bit samples; horizontal modes are built from 4x4 patches, vertical modes row by row
++function ff_hevc_rpi_pred_angular_8_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2 @ biased base so the table is indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ biased base; first inv_angle entry is mode 11
++ add r7, r7, r12, lsl #1 @ r7 = &inv_angle[mode - 11]
++ lsl r3, #1 @ stride * 2 - presumably converts a sample stride to bytes, TODO confirm
++ ldrsb r6, [r4, r12] @ r6 = angle, used as the initial fraction
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle, used as the per-line step
++ bge 26f
++ cmp r12, #18
++ bge 18f
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_4x4_10 @ top-left 4x4
++ bl patch_h_down_4x4_10_continue @ top-right 4x4
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ sub r0, #16 @ undo the 2 * 8 bytes the patches advanced r0
++ mov r6, r4 @ restart the fraction
++ add r0, r0, r3, lsl #2 @ move down four rows
++
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10_continue
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r10, #-128 @ initial inverse-angle accumulator for patch_h_up
++
++ push {r2} @ patch_h_up modifies r2
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10_continue
++ pop {r2}
++
++ sub r0, #16 @ undo the 2 * 8 bytes the patches advanced r0
++ mov r10, #-128
++ add r2, #8 @ four samples further along the left array
++ add r0, r0, r3, lsl #2 @ move down four rows
++ sub r10, r10, r7, lsl #2 @ accumulator starts four steps earlier for the lower patches
++
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10_continue
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.16 {q9}, [r1] @ q9 = top[0..7]
++ sub r1, r2, #2 @ r1 -> top-left sample (left[-1])
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ ldrh r7, [r7] @ r7 = inverse angle value
++ vdup.16 q2, r6 @ q2 = frac weight
++ vext.16 q8, q9, q9, #7
++ sub r8, r7, #128 @ r8 = inverse-angle accumulator
++ vld1.16 {d16[0]}, [r1] @ q8 = top-left, top[0..6]
++ vdup.16 q3, r12 @ q3 = (32 - frac) weight
++ mov r5, #7 @ seven rows in the loops, the eighth after them
++1: @ loop body A: current window in q8/q9, next window prepared in q10/q11
++ vmul.u16 q0, q9, q3
++ subs r12, r4 @ cc = fraction wrapped, the window moves for the next row
++ vmla.u16 q0, q8, q2
++ ittt cc
++ asrcc r1, r8, #8
++ addcc r12, #32
++ addcc r1, r2, r1, lsl #1 @ r1 = &left[r8 >> 8]
++ vext.16 q10, q8, q8, #7
++ rsb r6, r12, #32
++ vmov q11, q8
++ sub r5, #1
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r8, r7
++ vld1.16 {d20[0]}, [r1] @ q10 = new sample followed by q8[0..6]
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 1b @ no wrap and rows left: same window again
++ beq 4f @ rows finished
++2: @ loop body B: current window in q10/q11, next window prepared in q8/q9
++ vmul.u16 q0, q11, q3
++ subs r12, r4
++ vmla.u16 q0, q10, q2
++ ittt cc
++ asrcc r1, r8, #8
++ addcc r12, #32
++ addcc r1, r2, r1, lsl #1
++ vext.16 q8, q10, q10, #7
++ rsb r6, r12, #32
++ vmov q9, q10
++ sub r5, #1
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r8, r7
++ vld1.16 {d16[0]}, [r1]
++ teq r5, #0
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 2b @ no wrap and rows left: same window again
++ bne 1b @ wrapped and rows left: switch to body A
++ bcc 5f @ finished with a wrap: last row uses q8/q9
++3: @ last row from q10/q11
++ vmul.u16 q0, q11, q3
++ vmla.u16 q0, q10, q2
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ body A finished with a wrap: last row uses q10/q11
++5: @ last row from q8/q9
++ vmul.u16 q0, q9, q3
++ vmla.u16 q0, q8, q2
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q9}, [r1]! @ q9 = top[0..7]; r1 += 16
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ vdup.16 q2, r6 @ q2 = frac weight
++ vdup.16 q3, r12 @ q3 = (32 - frac) weight
++ vext.16 q8, q9, q9, #1
++ vld1.16 {d17[3]}, [r1]! @ q8 = top[1..8]
++ mov r5, #7 @ seven rows in the loops, the eighth after them
++1: @ loop body A: current window in q8/q9, next window prepared in q10/q11
++ vmul.u16 q0, q8, q2
++ subs r12, r4 @ cc = fraction wrapped, the window moves for the next row
++ vmla.u16 q0, q9, q3
++ it cc
++ addcc r12, #32
++ vext.16 q10, q8, q8, #1
++ rsb r6, r12, #32
++ vld1.16 {d21[3]}, [r1] @ q10 = q8[1..7] followed by the next top sample
++ sub r5, #1
++ vmov q11, q8
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r1, #2 @ consume the top sample only when the window moved
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 1b @ no wrap and rows left: same window again
++ beq 4f @ rows finished
++2: @ loop body B: current window in q10/q11, next window prepared in q8/q9
++ vmul.u16 q0, q10, q2
++ subs r12, r4
++ vmla.u16 q0, q11, q3
++ it cc
++ addcc r12, #32
++ vext.16 q8, q10, q10, #1
++ rsb r6, r12, #32
++ vld1.16 {d17[3]}, [r1]
++ sub r5, #1
++ vmov q9, q10
++ teq r5, #0
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r1, #2
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 2b @ no wrap and rows left: same window again
++ bne 1b @ wrapped and rows left: switch to body A
++ bcc 5f @ finished with a wrap: last row uses q8/q9
++3: @ last row from q10/q11
++ vmul.u16 q0, q10, q2
++ vmla.u16 q0, q11, q3
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ body A finished with a wrap: last row uses q10/q11
++5: @ last row from q8/q9
++ vmul.u16 q0, q8, q2
++ vmla.u16 q0, q9, q3
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++@ 16x16 block of 16-bit samples; horizontal modes are built from 4x4 patches, vertical modes row by row
++function ff_hevc_rpi_pred_angular_16_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2 @ biased base so the table is indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ biased base; first inv_angle entry is mode 11
++ add r7, r7, r12, lsl #1 @ r7 = &inv_angle[mode - 11]
++ lsl r3, #1 @ stride * 2 - presumably converts a sample stride to bytes, TODO confirm
++ ldrsb r6, [r4, r12] @ r6 = angle, used as the initial fraction
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle, used as the per-line step
++ bge 26f
++ cmp r12, #18
++ bge 18f
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r10, #4 @ four bands of four rows
++ mov r1, r2 @ r1 keeps the band's left pointer; the patches advance r2
++1:
++ bl patch_h_down_4x4_10 @ four 4x4 patches across the band
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ add r1, r1, #4*2
++ mov r6, r4 @ restart the fraction
++ sub r0, #32 @ undo the 4 * 8 bytes the patches advanced r0
++ subs r10, #1
++ add r0, r0, r3, lsl #2 @ move down four rows
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r10, #-128 @ initial inverse-angle accumulator for patch_h_up
++ vmov.i8 d6, #1<<2 @ band counter kept in d6: shifted right once per band, loop ends after it is read back as 0
++1:
++ push {r2, r10} @ patch_h_up modifies both
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ pop {r2, r10}
++
++ vmov r8, s12 @ s12 = low half of d6 (value before this band's shift)
++ sub r0, #32 @ undo the 4 * 8 bytes the patches advanced r0
++ add r2, #8 @ four samples further along the left array
++ add r0, r0, r3, lsl #2 @ move down four rows
++ sub r10, r10, r7, lsl #2 @ accumulator starts four steps earlier for the next band
++ vshr.u8 d6, #1
++ teq r8, #0
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.16 {q0-q1}, [r1] @ q0:q1 = top[0..15]
++ sub r9, r2, #2 @ r9 -> top-left sample (left[-1])
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r8, #-128 @ r8 = inverse-angle accumulator
++ vdup.16 q9, r6 @ q9 = frac weight
++ vdup.16 q10, r12 @ q10 = (32 - frac) weight
++ mov r5, #16 @ row counter
++1: @ slide the window one sample left
++ vld1.16 {d17[3]}, [r9] @ sample entering the window (only this lane of q8 is used)
++ add r8, r7
++ vmov q2, q0 @ q2:q3 = previous window
++ vmov q3, q1
++ asr r9, r8, #8
++ vext.16 q1, q0, q1, #7
++ add r9, r2, r9, lsl #1 @ r9 = &left[r8 >> 8] for the next slide
++ vext.16 q0, q8, q0, #7 @ q0:q1 = new sample followed by previous window[0..14]
++2: @ one output row
++ vmul.u16 q11, q2, q10
++ subs r12, r4 @ cc = fraction wrapped, the window slides before the next row
++ vmla.u16 q11, q0, q9
++ it cc
++ addcc r12, #32
++ vmul.u16 q12, q3, q10
++ rsb r6, r12, #32
++ vmla.u16 q12, q1, q9
++ sub r5, #1
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++ vrshr.u16 q11, q11, #5
++ vrshr.u16 q12, q12, #5
++ vst1.16 {q11-q12}, [r0], r3
++ bhi 2b @ no wrap and rows left
++ bne 1b @ wrapped and rows left
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32 @ r5 -> top[16]
++ vld1.16 {q0-q1}, [r1]! @ q0:q1 = top[0..15]; r1 += 32
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ vld1.16 {d16[0]}, [r5] @ d16[0] = top[16], the next sample to enter the window
++ mov r5, #16 @ row counter
++ vdup.16 q9, r6 @ q9 = frac weight
++ vdup.16 q10, r12 @ q10 = (32 - frac) weight
++1: @ slide the window one sample right
++ vmov q2, q0 @ q2:q3 = previous window
++ add r1, #2
++ vmov q3, q1
++ vext.16 q0, q0, q1, #1
++ vext.16 q1, q1, q8, #1 @ q0:q1 = previous window[1..15] followed by d16[0]
++2: @ one output row
++ vmul.u16 q11, q0, q9
++ subs r12, r4 @ cc = fraction wrapped, the window slides before the next row
++ vmla.u16 q11, q2, q10
++ it cc
++ addcc r12, #32
++ vmul.u16 q12, q1, q9
++ rsb r6, r12, #32
++ vmla.u16 q12, q3, q10
++ sub r5, #1
++ vld1.16 {d16[0]}, [r1] @ reload the sample that will enter on the next slide
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++ vrshr.u16 q11, q11, #5
++ vrshr.u16 q12, q12, #5
++ vst1.16 {q11-q12}, [r0], r3
++ bhi 2b @ no wrap and rows left
++ bne 1b @ wrapped and rows left
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++@ 32x32 block of 16-bit samples; horizontal modes are built from 4x4 patches, vertical modes row by row
++function ff_hevc_rpi_pred_angular_32_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2 @ biased base so the table is indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ biased base; first inv_angle entry is mode 11
++ add r7, r7, r12, lsl #1 @ r7 = &inv_angle[mode - 11]
++ lsl r3, #1 @ stride * 2 - presumably converts a sample stride to bytes, TODO confirm
++ vpush {d8} @ q4 (d8:d9) is used by the vertical paths; d9 is kept in a scratch D reg instead
++ ldrsb r6, [r4, r12] @ r6 = angle, used as the initial fraction
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle, used as the per-line step
++ bge 26f
++ cmp r12, #18
++ bge 18f
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ add sp, #8 @ drop the saved d8: this path never writes q4
++ mov r10, #8 @ eight bands of four rows
++ mov r1, r2 @ r1 keeps the band's left pointer; the patches advance r2
++1:
++ bl patch_h_down_4x4_10 @ eight 4x4 patches across the band
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ add r1, r1, #4*2
++ mov r6, r4 @ restart the fraction
++ sub r0, #64 @ undo the 8 * 8 bytes the patches advanced r0
++ subs r10, #1
++ add r0, r0, r3, lsl #2 @ move down four rows
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ add sp, #8 @ drop the saved d8: this path never writes q4
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r10, #-128 @ initial inverse-angle accumulator for patch_h_up
++ vmov.i8 d6, #1<<6 @ band counter kept in d6: shifted right once per band, loop ends after it is read back as 0
++1:
++ push {r2, r10} @ patch_h_up modifies both
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ pop {r2, r10}
++
++ vmov r8, s12 @ s12 = low half of d6 (value before this band's shift)
++ sub r0, #64 @ undo the 8 * 8 bytes the patches advanced r0
++ add r2, #8 @ four samples further along the left array
++ add r0, r0, r3, lsl #2 @ move down four rows
++ sub r10, r10, r7, lsl #2 @ accumulator starts four steps earlier for the next band
++ vshr.u8 d6, #1
++ teq r8, #0
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ add r5, r1, #32 @ r5 -> top[16]
++ vmov d0, d9 @ keep the caller's d9 in d0 - must happen before q4 (d8:d9) is loaded
++ vld1.16 {q1-q2}, [r1] @ q1:q2 = top[0..15]
++ rsb r12, r6, r6, lsl #16 @ r12 = (frac << 16) - frac
++ vld1.16 {q3-q4}, [r5] @ q3:q4 = top[16..31]
++ sub r9, r2, #2 @ r9 -> top-left sample (left[-1])
++ rsb r4, r12, #0 @ r4 = packed step: low half += angle, high half -= angle
++ rsb r12, r12, #32 << 16 @ r12 = packed weights: high half = 32 - frac, low half = frac
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r8, #-128 @ r8 = inverse-angle accumulator
++ vmov s2, r12 @ d1[0] = frac, d1[1] = 32 - frac
++ add r10, r0, #32 @ r10 -> right half of the output row
++ mov r5, #32 @ row counter
++1: @ slide the window one sample left
++ vld1.16 {d1[3]}, [r9] @ sample entering the window (last lane of q0)
++ add r8, r7
++ vmov q11, q4 @ q8..q11 = previous window
++ vmov q10, q3
++ asr r9, r8, #8
++ vmov q9, q2
++ add r9, r2, r9, lsl #1 @ r9 = &left[r8 >> 8] for the next slide
++ vmov q8, q1
++ vext.16 q4, q3, q4, #7
++ vext.16 q3, q2, q3, #7
++ vext.16 q2, q1, q2, #7
++ vext.16 q1, q0, q1, #7 @ q1..q4 = new sample followed by previous window[0..30]
++2: @ one output row
++ vmul.u16 q12, q8, d1[1]
++ adds r12, r4 @ cc = high-half weight went negative, the window slides before the next row
++ vmla.u16 q12, q1, d1[0]
++ it cc
++ addcc r12, #32 << 16 @ wrap: add 32 back to the (32 - frac) half ...
++ vmul.u16 q13, q9, d1[1]
++ it cc
++ subcc r12, #32 @ ... and take 32 off the frac half
++ vmla.u16 q13, q2, d1[0]
++ sub r5, #1
++ vmul.u16 q14, q10, d1[1]
++ teq r5, #0 @ sets Z when done; C from the adds is still relied on below
++ vmla.u16 q14, q3, d1[0]
++ vmul.u16 q15, q11, d1[1]
++ vmla.u16 q15, q4, d1[0]
++ vmov s2, r12 @ weights for the next row
++ vrshr.u16 q12, q12, #5
++ vrshr.u16 q13, q13, #5
++ vrshr.u16 q14, q14, #5
++ vrshr.u16 q15, q15, #5
++ vst1.16 {q12-q13}, [r0], r3
++ vst1.16 {q14-q15}, [r10], r3
++ bhi 2b @ no wrap and rows left
++ bne 1b @ wrapped and rows left
++
++ vpop {d8}
++ vmov d9, d0 @ restore the caller's d9
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32 @ r5 -> top[16]
++ vmov d1, d9 @ keep the caller's d9 in d1 - must happen before q4 (d8:d9) is loaded
++ vld1.16 {q1-q2}, [r1] @ q1:q2 = top[0..15]
++ rsb r12, r6, r6, lsl #16 @ r12 = (frac << 16) - frac
++ vld1.16 {q3-q4}, [r5] @ q3:q4 = top[16..31]
++ add r1, r1, #64 @ r1 -> top[32]
++ rsb r4, r12, #0 @ r4 = packed step: low half += angle, high half -= angle
++ rsb r12, r12, #32 << 16 @ r12 = packed weights: high half = 32 - frac, low half = frac
++ vmov s1, r12 @ d0[2] = frac, d0[3] = 32 - frac
++ add r10, r0, #32 @ r10 -> right half of the output row
++ mov r5, #32 @ row counter
++1: @ slide the window one sample right
++ vld1.16 {d0[0]}, [r1]! @ sample entering the window; r1 += 2
++ vmov q8, q1 @ q8..q11 = previous window
++ vmov q9, q2
++ vmov q10, q3
++ vmov q11, q4
++ vext.16 q1, q1, q2, #1
++ vext.16 q2, q2, q3, #1
++ vext.16 q3, q3, q4, #1
++ vext.16 q4, q4, q0, #1 @ q1..q4 = previous window[1..31] followed by the new sample
++2: @ one output row
++ vmul.u16 q12, q1, d0[2]
++ adds r12, r4 @ cc = high-half weight went negative, the window slides before the next row
++ vmla.u16 q12, q8, d0[3]
++ it cc
++ addcc r12, #32 << 16 @ wrap: add 32 back to the (32 - frac) half ...
++ vmul.u16 q13, q2, d0[2]
++ it cc
++ subcc r12, #32 @ ... and take 32 off the frac half
++ vmla.u16 q13, q9, d0[3]
++ sub r5, #1
++ vmul.u16 q14, q3, d0[2]
++ teq r5, #0 @ sets Z when done; C from the adds is still relied on below
++ vmla.u16 q14, q10, d0[3]
++ vmul.u16 q15, q4, d0[2]
++ vmla.u16 q15, q11, d0[3]
++ vmov s1, r12 @ weights for the next row
++ vrshr.u16 q12, q12, #5
++ vrshr.u16 q13, q13, #5
++ vrshr.u16 q14, q14, #5
++ vrshr.u16 q15, q15, #5
++ vst1.16 {q12-q13}, [r0], r3
++ vst1.16 {q14-q15}, [r10], r3
++ bhi 2b @ no wrap and rows left
++ bne 1b @ wrapped and rows left
++
++ vpop {d8}
++ vmov d9, d1 @ restore the caller's d9
++ pop {r4-r11, pc}
++
++endfunc
++
++
++
++@ Generate 4x4 chroma patch
++@
++@ In (const)
++@ r1 Up ptr (_up only)
++@ r3 Out stride
++@ r4 Angle add
++@ r7 Inv angle (_up only)
++@
++@ In/Out (updated)
++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
++@ r2 Left ptr - updated
++@ r6 Angle frac (init to r4 + 32)
++@ r8 Inv angle accumulator
++@ q2 Cur Line - load before 1st call for down - set by _up
++@ q8 Cur Line - load before 1st call for up - set by _down
++@
++@ Temps
++@ r5 Loop counter
++@ r12
++@ d0, q1, q12-q15
++@ Each chroma sample is a 32-bit pair of 16-bit values, so one q register holds four samples
++patch_h_down_c_4x4_10:
++ vld1.16 {q12}, [r2]! @ q12 = four sample pairs from [r2]; r2 += 16
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ vdup.16 q2, r6 @ q2 = frac weight
++ vdup.16 q3, r12 @ q3 = (32 - frac) weight
++ mov r5, #4 @ four passes, one result vector each
++1: @ slide the window by one sample pair
++ vmov q13, q12
++ vext.16 q12, q12, q12, #2
++ vld1.32 {d25[1]}, [r2]! @ q12 = q13[1..3] followed by the next pair; r2 += 4
++patch_h_down_c_4x4_10_continue:
++2:
++ vmov q8, q9 @ q8..q11 act as a 4-entry queue of results
++ subs r12, r4 @ cc = fraction wrapped, the window slides before the next pass
++ vmul.u16 q0, q13, q3
++ it cc
++ addcc r12, #32
++ vmla.u16 q0, q12, q2
++ rsb r6, r12, #32
++ vmov q9, q10
++ sub r5, #1
++ vmov q10, q11
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vrshr.u16 q11, q0, #5 @ newest entry = rounded result >> 5
++ bhi 2b @ no wrap and passes left
++ bne 1b @ wrapped and passes left
++
++ bcs 3f @ last pass did not wrap: window already correct for _continue
++ vmov q13, q12 @ otherwise perform the pending slide now
++ vext.16 q12, q12, q12, #2
++ vld1.32 {d25[1]}, [r2]!
++3:
++
++store_tran_c_4x4_10:
++T add r6, r0, r3
++ vzip.32 q8, q10 @ the two vzips plus the vst2 interleave transpose the 4x4 of 32-bit pairs
++A add r6, r0, r3
++T lsl r3, #1
++ vzip.32 q9, q11
++A add r5, r0, r3, lsl #1
++T add r5, r0, r3
++ vst2.32 {d16,d18}, [r0]! @ row 0; r0 += 16 (start of next patch to the right)
++A lsl r3, #1
++ vst2.32 {d17,d19}, [r6], r3 @ row 1; r6 -> row 3 (r3 is doubled at this point)
++ asr r3, #1 @ restore the stride
++ vst2.32 {d20,d22}, [r5] @ row 2
++ mov r5, #4 @ reload the pass counter for a following _continue call
++ vst2.32 {d21,d23}, [r6] @ row 3
++ bx lr
++
++patch_h_up_c_4x4_10:
++ vld1.16 {q1}, [r2] @ q1 = four sample pairs from [r2]
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ vdup.16 q2, r6 @ q2 = frac weight
++ vdup.16 q3, r12 @ q3 = (32 - frac) weight
++ mov r5, #4 @ four passes, one result vector each
++1: @ slide the window by one sample pair
++ adds r8, r7 @ advance the inverse-angle accumulator; mi = still negative
++ vmov q12, q1
++ it mi
++ ldrmi r6, [r2, #-4]! @ negative: next pair comes from [r2], walking backwards
++ vext.16 q1, q1, q1, #6
++ itt pl
++ asrpl r6, r8, #8
++ ldrpl r6, [r1, r6, lsl #2] @ otherwise: pair number (r8 >> 8) of the array at [r1]
++ vmov s4, r6 @ q1 = new pair followed by q12[0..2]
++patch_h_up_c_4x4_10_continue:
++2:
++ vmov q8, q9 @ q8..q11 act as a 4-entry queue of results
++ subs r12, r4 @ cc = fraction wrapped, the window slides before the next pass
++ vmul.u16 q0, q12, q3
++ it cc
++ addcc r12, #32
++ vmla.u16 q0, q1, q2
++ rsb r6, r12, #32
++ vmov q9, q10
++ sub r5, #1
++ vmov q10, q11
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vrshr.u16 q11, q0, #5 @ newest entry = rounded result >> 5
++ bhi 2b @ no wrap and passes left
++ bne 1b @ wrapped and passes left
++
++ bcs store_tran_c_4x4_10 @ last pass did not wrap: store and return via store_tran
++ adds r8, r7 @ otherwise perform the pending slide first
++ vmov q12, q1
++ it mi
++ ldrmi r6, [r2, #-4]!
++ vext.16 q1, q1, q1, #6
++ itt pl
++ asrpl r6, r8, #8
++ ldrpl r6, [r1, r6, lsl #2]
++ vmov s4, r6
++ b store_tran_c_4x4_10 @ tail call: transpose, store and return to our caller
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++@ 4x4 chroma block, each sample a pair of 16-bit values; mode picks one of four paths
++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2 @ biased base so the table is indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ biased base; first inv_angle entry is mode 11
++ add r7, r7, r12, lsl #1 @ r7 = &inv_angle[mode - 11]
++ lsl r3, #2 @ stride * 4 - presumably converts a sample-pair stride to bytes, TODO confirm
++ ldrsb r6, [r4, r12] @ r6 = angle, used as the initial fraction
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle, used as the per-line step
++ bge 26f
++ cmp r12, #18
++ bge 18f
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ bl patch_h_down_c_4x4_10 @ one 4x4 patch is the whole block
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r8, #-128
++ sub r8, r7 @ r8 = -128 - inv; patch_h_up_c adds r7 before first use
++ bl patch_h_up_c_4x4_10
++ pop {r4-r8, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.16 {q9}, [r1] @ q9 = top pairs 0..3
++ sub r1, r2, #4 @ r1 -> top-left pair (left[-1])
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ ldrh r7, [r7] @ r7 = inverse angle value
++ vdup.16 q2, r6 @ q2 = frac weight
++ vext.16 q8, q9, q9, #6
++ sub r8, r7, #128 @ r8 = inverse-angle accumulator
++ vld1.32 {d16[0]}, [r1] @ q8 = top-left pair, top pairs 0..2
++ vdup.16 q3, r12 @ q3 = (32 - frac) weight
++ mov r5, #3 @ three rows in the loops, the fourth after them
++1: @ loop body A: current window in q8/q9, next window prepared in q10/q11
++ vmul.u16 q0, q9, q3
++ subs r12, r4 @ cc = fraction wrapped, the window moves for the next row
++ vmla.u16 q0, q8, q2
++ ittt cc
++ asrcc r1, r8, #8
++ addcc r12, #32
++ addcc r1, r2, r1, lsl #2 @ r1 = address of left pair number (r8 >> 8)
++ vext.16 q10, q8, q8, #6
++ rsb r6, r12, #32
++ vmov q11, q8
++ sub r5, #1
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r8, r7
++ vld1.32 {d20[0]}, [r1] @ q10 = new pair followed by q8 pairs 0..2
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 1b @ no wrap and rows left: same window again
++ beq 4f @ rows finished
++2: @ loop body B: current window in q10/q11, next window prepared in q8/q9
++ vmul.u16 q0, q11, q3
++ subs r12, r4
++ vmla.u16 q0, q10, q2
++ ittt cc
++ asrcc r1, r8, #8
++ addcc r12, #32
++ addcc r1, r2, r1, lsl #2
++ vext.16 q8, q10, q10, #6
++ rsb r6, r12, #32
++ vmov q9, q10
++ sub r5, #1
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r8, r7
++ vld1.32 {d16[0]}, [r1]
++ teq r5, #0
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 2b @ no wrap and rows left: same window again
++ bne 1b @ wrapped and rows left: switch to body A
++ bcc 5f @ finished with a wrap: last row uses q8/q9
++3: @ last row from q10/q11
++ vmul.u16 q0, q11, q3
++ vmla.u16 q0, q10, q2
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r8, pc}
++4:
++ bcc 3b @ body A finished with a wrap: last row uses q10/q11
++5: @ last row from q8/q9
++ vmul.u16 q0, q9, q3
++ vmla.u16 q0, q8, q2
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q9}, [r1]! @ q9 = top pairs 0..3; r1 += 16
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ vdup.16 q2, r6 @ q2 = frac weight
++ vdup.16 q3, r12 @ q3 = (32 - frac) weight
++ vext.16 q8, q9, q9, #2
++ vld1.32 {d17[1]}, [r1]! @ q8 = top pairs 1..4
++ mov r5, #3 @ three rows in the loops, the fourth after them
++1: @ loop body A: current window in q8/q9, next window prepared in q10/q11
++ vmul.u16 q0, q8, q2
++ subs r12, r4 @ cc = fraction wrapped, the window moves for the next row
++ vmla.u16 q0, q9, q3
++ it cc
++ addcc r12, #32
++ vext.16 q10, q8, q8, #2
++ rsb r6, r12, #32
++ vld1.32 {d21[1]}, [r1] @ q10 = q8 pairs 1..3 followed by the next top pair
++ sub r5, #1
++ vmov q11, q8
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r1, #4 @ consume the top pair only when the window moved
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 1b @ no wrap and rows left: same window again
++ beq 4f @ rows finished
++2: @ loop body B: current window in q10/q11, next window prepared in q8/q9
++ vmul.u16 q0, q10, q2
++ subs r12, r4
++ vmla.u16 q0, q11, q3
++ it cc
++ addcc r12, #32
++ vext.16 q8, q10, q10, #2
++ rsb r6, r12, #32
++ vld1.32 {d17[1]}, [r1]
++ sub r5, #1
++ vmov q9, q10
++ teq r5, #0
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r1, #4
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 2b @ no wrap and rows left: same window again
++ bne 1b @ wrapped and rows left: switch to body A
++ bcc 5f @ finished with a wrap: last row uses q8/q9
++3: @ last row from q10/q11
++ vmul.u16 q0, q10, q2
++ vmla.u16 q0, q11, q3
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r8, pc}
++4:
++ bcc 3b @ body A finished with a wrap: last row uses q10/q11
++5: @ last row from q8/q9
++ vmul.u16 q0, q8, q2
++ vmla.u16 q0, q9, q3
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++@ 8x8 chroma block, each sample a pair of 16-bit values; only r4-r8 are saved, so no path may write r9-r11
++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2 @ biased base so the table is indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ biased base; first inv_angle entry is mode 11
++ add r7, r7, r12, lsl #1 @ r7 = &inv_angle[mode - 11]
++ lsl r3, #2 @ stride * 4 - presumably converts a sample-pair stride to bytes, TODO confirm
++ ldrsb r6, [r4, r12] @ r6 = angle, used as the initial fraction
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle, used as the per-line step
++ bge 26f
++ cmp r12, #18
++ bge 18f
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_c_4x4_10 @ top-left 4x4
++ bl patch_h_down_c_4x4_10_continue @ top-right 4x4
++
++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left
++ sub r0, #32 @ undo the 2 * 16 bytes the patches advanced r0
++ mov r6, r4 @ restart the fraction
++ add r0, r0, r3, lsl #2 @ move down four rows
++
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10_continue
++
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r8, #-128
++ sub r8, r7 @ r8 = -128 - inv; patch_h_up_c adds r7 before first use
++
++ push {r2, r8} @ patch_h_up_c modifies both
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10_continue
++ pop {r2, r8}
++
++ sub r0, #32 @ undo the 2 * 16 bytes the patches advanced r0
++ mov r6, r4 @ restart the fraction
++ add r2, #16 @ four sample pairs further along the left array
++ sub r8, r8, r7, lsl #2 @ accumulator starts four steps earlier for the lower patches
++ add r0, r0, r3, lsl #2 @ move down four rows
++
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10_continue
++
++ pop {r4-r8, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.16 {q0-q1}, [r1] @ q0:q1 = top pairs 0..7; r1 is free from here on
++ sub r1, r2, #4 @ r1 -> top-left pair (left[-1]); r1 used because r9 is not saved
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r8, #-128 @ r8 = inverse-angle accumulator
++ vdup.16 q9, r6 @ q9 = frac weight
++ vdup.16 q10, r12 @ q10 = (32 - frac) weight
++ mov r5, #8 @ row counter
++1: @ slide the window by one sample pair
++ vld1.32 {d17[1]}, [r1] @ pair entering the window (only this lane of q8 is used)
++ add r8, r7
++ vmov q2, q0 @ q2:q3 = previous window
++ vmov q3, q1
++ asr r1, r8, #8
++ vext.16 q1, q0, q1, #6
++ add r1, r2, r1, lsl #2 @ r1 = address of left pair number (r8 >> 8) for the next slide
++ vext.16 q0, q8, q0, #6 @ q0:q1 = new pair followed by previous window pairs 0..6
++2: @ one output row
++ vmul.u16 q11, q2, q10
++ subs r12, r4 @ cc = fraction wrapped, the window slides before the next row
++ vmla.u16 q11, q0, q9
++ it cc
++ addcc r12, #32
++ vmul.u16 q12, q3, q10
++ rsb r6, r12, #32
++ vmla.u16 q12, q1, q9
++ sub r5, #1
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++ vrshr.u16 q11, q11, #5
++ vrshr.u16 q12, q12, #5
++ vst1.16 {q11-q12}, [r0], r3
++ bhi 2b @ no wrap and rows left
++ bne 1b @ wrapped and rows left
++
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32 @ r5 -> top pair 8
++ vld1.16 {q0-q1}, [r1]! @ q0:q1 = top pairs 0..7; r1 += 32
++ rsb r12, r6, #32 @ r12 = 32 - frac
++ vld1.32 {d16[0]}, [r5] @ d16[0] = top pair 8, the next pair to enter the window
++ mov r5, #8 @ row counter
++ vdup.16 q9, r6 @ q9 = frac weight
++ vdup.16 q10, r12 @ q10 = (32 - frac) weight
++1: @ slide the window by one sample pair
++ vmov q2, q0 @ q2:q3 = previous window
++ add r1, #4
++ vmov q3, q1
++ vext.16 q0, q0, q1, #2
++ vext.16 q1, q1, q8, #2 @ q0:q1 = previous window pairs 1..7 followed by d16[0]
++2: @ one output row
++ vmul.u16 q11, q0, q9
++ subs r12, r4 @ cc = fraction wrapped, the window slides before the next row
++ vmla.u16 q11, q2, q10
++ it cc
++ addcc r12, #32
++ vmul.u16 q12, q1, q9
++ rsb r6, r12, #32
++ vmla.u16 q12, q3, q10
++ sub r5, #1
++ vld1.32 {d16[0]}, [r1] @ reload the pair that will enter on the next slide
++ teq r5, #0 @ sets Z when done; C from the subs is still relied on below
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++ vrshr.u16 q11, q11, #5
++ vrshr.u16 q12, q12, #5
++ vst1.16 {q11-q12}, [r0], r3
++ bhi 2b @ no wrap and rows left
++ bne 1b @ wrapped and rows left
++
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++@ 16x16 chroma block, each sample a pair of 16-bit values; horizontal modes use 4x4 patches, vertical modes go row by row
++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r10, lr}
++ ADRT r4, angle_2 - 2 @ biased base so the table is indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ biased base; first inv_angle entry is mode 11
++ add r7, r7, r12, lsl #1 @ r7 = &inv_angle[mode - 11]
++ lsl r3, #2 @ stride * 4 - presumably converts a sample-pair stride to bytes, TODO confirm
++ vpush {d8} @ q4 (d8:d9) is used by the vertical paths; d9 is kept in a scratch D reg instead
++ ldrsb r6, [r4, r12] @ r6 = angle, used as the initial fraction
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = angle, used as the per-line step
++ bge 26f
++ cmp r12, #18
++ bge 18f
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ add sp, #8 @ drop the saved d8: this path never writes q4
++ mov r10, #4 @ four bands of four rows
++ mov r1, r2 @ r1 keeps the band's left pointer; the patches advance r2
++1:
++ bl patch_h_down_c_4x4_10 @ four 4x4 patches across the band
++ bl patch_h_down_c_4x4_10_continue
++ bl patch_h_down_c_4x4_10_continue
++ bl patch_h_down_c_4x4_10_continue
++
++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left
++ add r1, r1, #4*4
++ mov r6, r4 @ restart the fraction
++ sub r0, #64 @ undo the 4 * 16 bytes the patches advanced r0
++ subs r10, #1
++ add r0, r0, r3, lsl #2 @ move down four rows
++ bne 1b
++
++ pop {r4-r10, pc}
++
++@ Up of Horizontal - works down up
++10:
++ add sp, #8 @ drop the saved d8: this path never writes q4
++ mov r10, #4 @ four bands of four rows
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r8, #-128
++ sub r8, r7 @ r8 = -128 - inv; patch_h_up_c adds r7 before first use
++2:
++ push {r2, r8} @ patch_h_up_c modifies both
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10_continue
++ bl patch_h_up_c_4x4_10_continue
++ bl patch_h_up_c_4x4_10_continue
++ pop {r2, r8}
++
++ sub r0, #64 @ undo the 4 * 16 bytes the patches advanced r0
++ mov r6, r4 @ restart the fraction
++ add r2, #16 @ four sample pairs further along the left array
++ sub r8, r8, r7, lsl #2 @ accumulator starts four steps earlier for the next band
++ add r0, r0, r3, lsl #2 @ move down four rows
++ subs r10, #1
++ bne 2b
++
++ pop {r4-r10, pc}
++
++@ Left of vertical - works down left
++18:
++ add r5, r1, #32 @ r5 -> top pair 8
++ vmov d0, d9 @ keep the caller's d9 in d0 - must happen before q4 (d8:d9) is loaded
++ vld1.16 {q1-q2}, [r1] @ q1:q2 = top pairs 0..7
++ rsb r12, r6, r6, lsl #16 @ r12 = (frac << 16) - frac
++ vld1.16 {q3-q4}, [r5] @ q3:q4 = top pairs 8..15
++ sub r9, r2, #4 @ r9 -> top-left pair (left[-1])
++ rsb r4, r12, #0 @ r4 = packed step: low half += angle, high half -= angle
++ rsb r12, r12, #32 << 16 @ r12 = packed weights: high half = 32 - frac, low half = frac
++ ldrh r7, [r7] @ r7 = inverse angle value
++ mov r8, #-128 @ r8 = inverse-angle accumulator
++ vmov s2, r12 @ d1[0] = frac, d1[1] = 32 - frac
++ add r10, r0, #32 @ r10 -> right half of the output row
++ mov r5, #16 @ row counter
++1: @ slide the window by one sample pair
++ vld1.32 {d1[1]}, [r9] @ pair entering the window (last 32-bit lane of q0)
++ add r8, r7
++ vmov q11, q4 @ q8..q11 = previous window
++ vmov q10, q3
++ asr r9, r8, #8
++ vmov q9, q2
++ add r9, r2, r9, lsl #2 @ r9 = address of left pair number (r8 >> 8) for the next slide
++ vmov q8, q1
++ vext.16 q4, q3, q4, #6
++ vext.16 q3, q2, q3, #6
++ vext.16 q2, q1, q2, #6
++ vext.16 q1, q0, q1, #6 @ q1..q4 = new pair followed by previous window pairs 0..14
++2: @ one output row
++ vmul.u16 q12, q8, d1[1]
++ adds r12, r4 @ cc = high-half weight went negative, the window slides before the next row
++ vmla.u16 q12, q1, d1[0]
++ it cc
++ addcc r12, #32 << 16 @ wrap: add 32 back to the (32 - frac) half ...
++ vmul.u16 q13, q9, d1[1]
++ it cc
++ subcc r12, #32 @ ... and take 32 off the frac half
++ vmla.u16 q13, q2, d1[0]
++ sub r5, #1
++ vmul.u16 q14, q10, d1[1]
++ teq r5, #0 @ sets Z when done; C from the adds is still relied on below
++ vmla.u16 q14, q3, d1[0]
++ vmul.u16 q15, q11, d1[1]
++ vmla.u16 q15, q4, d1[0]
++ vmov s2, r12 @ weights for the next row (rewrites d1[0..1] only)
++ vrshr.u16 q12, q12, #5
++ vrshr.u16 q13, q13, #5
++ vrshr.u16 q14, q14, #5
++ vrshr.u16 q15, q15, #5
++ vst1.16 {q12-q13}, [r0], r3
++ vst1.16 {q14-q15}, [r10], r3
++ bhi 2b @ no wrap and rows left
++ bne 1b @ wrapped and rows left
++
++ vpop {d8}
++ vmov d9, d0 @ restore the caller's d9
++ pop {r4-r10, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32 @ r5 -> top pair 8
++ vmov d1, d9 @ keep the caller's d9 in d1 - must happen before q4 (d8:d9) is loaded
++ vld1.16 {q1-q2}, [r1] @ q1:q2 = top pairs 0..7
++ rsb r12, r6, r6, lsl #16 @ r12 = (frac << 16) - frac
++ vld1.16 {q3-q4}, [r5] @ q3:q4 = top pairs 8..15
++ add r1, r1, #64 @ r1 -> top pair 16
++ rsb r4, r12, #0 @ r4 = packed step: low half += angle, high half -= angle
++ rsb r12, r12, #32 << 16 @ r12 = packed weights: high half = 32 - frac, low half = frac
++ vmov s1, r12 @ d0[2] = frac, d0[3] = 32 - frac
++ add r10, r0, #32 @ r10 -> right half of the output row
++ mov r5, #16 @ row counter
++1: @ slide the window by one sample pair
++ vld1.32 {d0[0]}, [r1]! @ pair entering the window; r1 += 4
++ vmov q8, q1 @ q8..q11 = previous window
++ vmov q9, q2
++ vmov q10, q3
++ vmov q11, q4
++ vext.16 q1, q1, q2, #2
++ vext.16 q2, q2, q3, #2
++ vext.16 q3, q3, q4, #2
++ vext.16 q4, q4, q0, #2 @ q1..q4 = previous window pairs 1..15 followed by the new pair
++2: @ one output row
++ vmul.u16 q12, q1, d0[2]
++ adds r12, r4 @ cc = high-half weight went negative, the window slides before the next row
++ vmla.u16 q12, q8, d0[3]
++ it cc
++ addcc r12, #32 << 16 @ wrap: add 32 back to the (32 - frac) half ...
++ vmul.u16 q13, q2, d0[2]
++ it cc
++ subcc r12, #32 @ ... and take 32 off the frac half
++ vmla.u16 q13, q9, d0[3]
++ sub r5, #1
++ vmul.u16 q14, q3, d0[2]
++ teq r5, #0 @ sets Z when done; C from the adds is still relied on below
++ vmla.u16 q14, q10, d0[3]
++ vmul.u16 q15, q4, d0[2]
++ vmla.u16 q15, q11, d0[3]
++ vmov s1, r12 @ weights for the next row (rewrites d0[2..3] only)
++ vrshr.u16 q12, q12, #5
++ vrshr.u16 q13, q13, #5
++ vrshr.u16 q14, q14, #5
++ vrshr.u16 q15, q15, #5
++ vst1.16 {q12-q13}, [r0], r3
++ vst1.16 {q14-q15}, [r10], r3
++ bhi 2b @ no wrap and rows left
++ bne 1b @ wrapped and rows left
++
++ vpop {d8}
++ vmov d9, d1 @ restore the caller's d9
++ pop {r4-r10, pc}
++
++endfunc
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
+@@ -0,0 +1,705 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4x4 DC fill of 8-bit samples with the first row and first column smoothed against top/left
++function ff_hevc_rpi_pred_dc_4_neon_8, export=1
++
++ @ Average the els of top & left
++ ldr r2, [r2] @ r2 = left[0..3]
++ vld1.32 {d0[0]}, [r1] @ low half of d0 = top[0..3]
++ mov r1, #2
++ vmov s1, r2 @ high half of d0 = left[0..3]
++ vmov s2, r2 @ low half of d1 = left[0..3]
++ vmov.i16 q2, #3
++ add r2, r0, r3 @ r2 -> row 1
++ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0]
++ lsl r3, #1 @ two row pointers, each stepping two rows
++ vmovl.u8 q0, d0 @ q0 = top[0..3], left[0..3] widened to 16 bits
++ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmov.i64 d7, #0xff @ mask selecting byte 0
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #3 @ dc = (sum of 8 samples + 4) >> 3
++ vmla.i16 q0, q2, d6[0] @ add 2*dc to lane 0 and 3*dc to the others
++ vdup.8 d6, d6[0] @ d6 = dc in every byte
++ vrshrn.i16 d0, q0, #2 @ d0 = smoothed top[0..3], smoothed left[0..3] as bytes
++
++ @ Store top line
++ vst1.32 {d0[0]}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d1, d0, #5*8 @ byte 0 = smoothed left[1]
++ vshr.u64 d2, d0, #6*8 @ byte 0 = smoothed left[2]
++ vshr.u64 d3, d0, #7*8 @ byte 0 = smoothed left[3]
++ vbif d1, d6, d7 @ keep byte 0, fill the rest with dc
++ vbif d2, d6, d7
++ vst1.32 {d1[0]}, [r2], r3 @ row 1
++ vbif d3, d6, d7
++ vst1.32 {d2[0]}, [r0] @ row 2
++ vst1.32 {d3[0]}, [r2] @ row 3
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4x4 chroma DC fill: 8 bytes (4 interleaved U,V pairs) per row, no edge smoothing
++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {d0}, [r1] @ d0 = 4 top pairs
++ vld1.8 {d1}, [r2] @ d1 = 4 left pairs
++A add r2, r0, r3, lsl #1
++A lsl r3, #2
++T lsl r3, #1
++T add r2, r0, r3
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1 @ q0 = top + left, widened to 16 bits (r2 = r0 + 2*stride, r3 = 4*stride by now)
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d2, d0, d0 @ This adds U & V separately
++ vpadd.i32 d3, d0, d0 @ q1 = (U sum, V sum) repeated
++ vrshrn.u16 d0, q1, #3 @ d0 = rounded sums >> 3, narrowed to bytes
++
++ @ Store
++ vst1.8 {d0}, [r0], r3 @ four rows via two pointers 2*stride apart
++ vst1.8 {d0}, [r2], r3
++ vst1.8 {d0}, [r0]
++ vst1.8 {d0}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8x8 DC fill of 8-bit samples with the first row and first column smoothed against top/left
++function ff_hevc_rpi_pred_dc_8_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {d0}, [r1] @ d0 = top[0..7]
++ mov r1, #2
++ vld1.8 {d16}, [r2] @ d16 = left[0..7]
++ vmov.i16 q2, #3
++ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
++ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0]
++ vmovl.u8 q0, d0 @ q0 = top[0..7] widened to 16 bits
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmov.i64 d7, #0xff @ mask selecting byte 0
++ vmovl.u8 q1, d16 @ q1 = left[0..7] widened to 16 bits
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4 @ dc = (sum of 16 samples + 8) >> 4
++ vmla.i16 q1, q2, d6[0] @ left + weight*dc (lane 0 result is never stored)
++ vmla.i16 q0, q2, d6[0] @ top: 2*dc on lane 0, 3*dc on the others
++ vdup.8 d6, d6[0] @ d6 = dc in every byte
++ vrshrn.i16 d2, q1, #2 @ d2 = smoothed left column as bytes
++ vrshrn.i16 d0, q0, #2 @ d0 = smoothed top row as bytes
++
++ @ Store top line
++ vst1.8 {d0}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d2, #8 @ drop left[0]; byte 0 = smoothed left[1]
++ vbit d6, d2, d7 @ row = dc with byte 0 replaced by the smoothed left sample
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3 @ row 1
++ mov r1, #6 @ six more rows, two per iteration
++1:
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ subs r1, #2
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8x8 chroma DC fill: 16 bytes (8 interleaved U,V pairs) per row, no edge smoothing
++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0}, [r1] @ q0 = 8 top pairs
++ mov r1, #8 @ row counter
++ vld1.8 {q1}, [r2] @ q1 = 8 left pairs
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1 @ fold the top halves together, widened to 16 bits
++A add r2, r0, r3, lsl #1
++A lsl r3, #2
++T add r2, r0, r3
++T lsl r3, #1
++ vaddl.u8 q1, d2, d3 @ fold the left halves together (r2 = r0 + 2*stride, r3 = 4*stride by now)
++ vadd.i16 q1, q0
++ vadd.i16 d3, d2 @ d3 has 2 val pairs
++ vpadd.i32 d2, d3, d3 @ This add U & V separately
++ vpadd.i32 d3, d3, d3 @ q1 = (U sum, V sum) repeated
++ vrshrn.u16 d0, q1, #4 @ rounded sums >> 4, narrowed to bytes
++ vrshrn.u16 d1, q1, #4 @ q0 = one full output row
++
++ @ Store
++1:
++ vst1.8 {q0}, [r0], r3 @ four rows per iteration via two pointers 2*stride apart
++ subs r1, #4
++ vst1.8 {q0}, [r2], r3
++ vst1.8 {q0}, [r0], r3
++ vst1.8 {q0}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16x16 DC fill of 8-bit samples with the first row and first column smoothed against top/left
++function ff_hevc_rpi_pred_dc_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q8}, [r1] @ q8 = top[0..15]
++ mov r1, #2
++ vld1.8 {q9}, [r2] @ q9 = left[0..15]
++ vaddl.u8 q10, d16, d17 @ q10 = top[i] + top[i+8]
++ vaddl.u8 q11, d16, d18 @ d22[0] = top[0] + left[0]
++ vaddl.u8 q0, d18, d19 @ q0 = left[i] + left[i+8]
++ vmov.i16 q1, #3
++ vadd.i16 q10, q0
++ vmovl.u8 q0, d18 @ q0 = left[0..7] widened to 16 bits
++ vadd.i16 d20, d21 @ d20 = 4 partial sums
++ vmov.i16 d2[0], r1 @ 2, 3, 3, 3...
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmovl.u8 q2, d16 @ q2 = top[0..7] widened
++ vmovl.u8 q9, d19 @ q9 = left[8..15] widened
++ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same)
++ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
++ vmovl.u8 q8, d17 @ q8 = top[8..15] widened
++ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7]
++ vmov.i64 d7, #0xff @ mask selecting byte 0
++ vpadd.i16 d20, d20 @ 1 (all the same)
++ vrshr.u16 d21, d20, #5 @ dc = (sum of 32 samples + 16) >> 5 ...
++ vrshr.u16 d20, d20, #5 @ ... in every lane of q10
++ vmla.i16 q0, q10, d2[1] @ left[0..7] + 3*dc
++ vmla.i16 q9, q10, d2[1] @ left[8..15] + 3*dc
++ vmla.i16 q2, q10, q1 @ top[0..7]: 2*dc on lane 0, 3*dc on the others
++ vmla.i16 q8, q10, d2[1] @ top[8..15] + 3*dc
++ vdup.8 q1, d20[0] @ q1 = dc in every byte
++ vrshrn.i16 d0, q0, #2
++ vrshrn.i16 d1, q9, #2 @ q0 = smoothed left column as bytes
++ vrshrn.i16 d4, q2, #2
++ vrshrn.i16 d5, q8, #2 @ q2 = smoothed top row as bytes
++ vext.8 q0, q0, q0, #1 @ rotate so byte 0 = smoothed left[1]
++
++ @ Store top line
++ vst1.8 {q2}, [r0], r3
++
++ @ Store the rest
++ mov r1, #15 @ remaining rows
++1:
++ vbit d2, d0, d7 @ row = dc with byte 0 replaced by the smoothed left sample
++ vext.8 q0, q0, q0, #1 @ rotate the next left sample into byte 0
++ subs r1, #1
++ vst1.8 {q1}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_8 - DC fill of 16 rows x 16 interleaved U/V pairs, 8-bit
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0-q1}, [r1] @ 32 bytes of top
++ mov r1, #16 @ r1 is reused as the row counter
++ vld1.8 {q2-q3}, [r2] @ 32 bytes of left
++T lsl r3, #1 @ NOTE(review): T/A lines presumably Thumb-only/ARM-only (asm.S) - confirm
++ vaddl.u8 q0, d0, d1 @ pairwise-widening adds keep U and V in alternate lanes
++A add r2, r0, r3, lsl #1 @ r2 = r0 + 2 * r3: pointer to the second row
++T add r2, r0, r3
++ vaddl.u8 q1, d2, d3
++A lsl r3, #2 @ r3 *= 4: r0 and r2 each step two rows per store
++T lsl r3, #1
++ vaddl.u8 q2, d4, d5
++ vaddl.u8 q3, d6, d7
++ vadd.i16 q0, q1 @ top total (per lane)
++ vadd.i16 q2, q3 @ left total (per lane)
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d4, d0, d0 @ This adds U & V separately
++ vpadd.i32 d5, d0, d0 @ q2 = (sum U, sum V) in every 32-bit lane
++ vrshrn.u16 d0, q2, #5 @ rounded divide by 32 and narrow to bytes
++ vrshrn.u16 d1, q2, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q2, #5 @ q0-q1 = one full 32-byte output row
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2 @ 2 rows are written per pass
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_8 - DC fill of a 32x32 block, 8-bit (no edge smoothing)
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_32_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0-q1}, [r1] @ top[0..31]
++ mov r1, #32 @ r1 is reused as the row counter
++ vld1.8 {q2-q3}, [r2] @ left[0..31]
++ add r2, r0, r3 @ r2 = pointer to the second row
++ vaddl.u8 q0, d0, d1 @ widen to 16 bits while adding
++ lsl r3, #1 @ r0 and r2 each step two rows per store
++ vaddl.u8 q1, d2, d3
++ vaddl.u8 q2, d4, d5
++ vaddl.u8 q3, d6, d7
++ vadd.i16 q0, q1
++ vadd.i16 q2, q3
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ vpadd.i16 d4, d0, d0 @ 1 (all the same)
++ vpadd.i16 d5, d0, d0 @ q2 = total of all 64 edge pels in every lane
++ vrshrn.u16 d0, q2, #6 @ rounded divide by 64 and narrow to bytes
++ vrshrn.u16 d1, q2, #6
++ vrshrn.u16 d2, q2, #6
++ vrshrn.u16 d3, q2, #6 @ q0-q1 = one full 32-byte output row
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2 @ 2 rows are written per pass
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ -----------------------------------------------------------------------------
++@
++@ 10 Bit versions
++@
++@ There is no actual bit depth dependency in this code except that our
++@ intermediate results will overflow the 16 bits they are stored in
++@ All these functions are good to 10 bits - with the worst case being
++@ in dc_32 where we use all 16 bits.
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_10 - DC fill of a 4x4 block with smoothed top row / left column, 16-bit pels
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_4_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {d0}, [r1] @ d0 = top[0..3]
++ mov r1, #2 @ weight for lane 0, inserted into d4[0] below
++ vld1.16 {d1}, [r2] @ d1 = left[0..3]
++T lsl r3, #1 @ NOTE(review): T/A lines presumably Thumb-only/ARM-only (asm.S) - confirm
++ vmov.i16 q2, #3
++A add r2, r0, r3, lsl #1 @ r2 = r0 + 2 * r3: pointer to the second row
++T add r2, r0, r3
++ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0]
++A lsl r3, #2 @ r3 *= 4: r0 and r2 each step two rows per store
++T lsl r3, #1
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #3 @ dc = rounded sum / 8, in every lane of d6
++ vmla.i16 q0, q2, d6[0] @ add weighted dc to top (d0) and left (d1)
++ vrshr.u16 q0, #2
++
++ @ Store top line
++ vst1.16 {d0}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d3, d1, #1*16 @ smoothed left[1] moved to lane 0
++ vshr.u64 d4, d1, #2*16 @ smoothed left[2] moved to lane 0
++ vshr.u64 d5, d1, #3*16 @ smoothed left[3] moved to lane 0
++ vbif d3, d6, d7 @ lanes 1..3 = dc, lane 0 kept
++ vbif d4, d6, d7
++ vst1.16 {d3}, [r2], r3
++ vbif d5, d6, d7
++ vst1.16 {d4}, [r0]
++ vst1.16 {d5}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_10 - DC fill of 4 rows x 4 interleaved U/V pairs, 16-bit samples
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0}, [r1] @ 16 bytes of top (4 U/V pairs)
++ vld1.8 {q1}, [r2] @ 16 bytes of left
++A add r2, r0, r3, lsl #2 @ r2 = r0 + 4 * r3: pointer to the second row
++A lsl r3, #3 @ r3 *= 8: r0 and r2 each step two rows per store
++T lsl r3, #2 @ NOTE(review): T/A lines presumably Thumb-only/ARM-only (asm.S) - confirm
++T add r2, r0, r3
++T lsl r3, #1
++ vadd.i16 q0, q1 @ top + left, U and V stay in alternate lanes
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d2, d0, d0 @ This adds U & V separately
++ vpadd.i32 d3, d0, d0 @ q1 = (sum U, sum V) in every 32-bit lane
++ vrshr.u16 q0, q1, #3 @ rounded divide by 8: q0 = one full output row
++
++ vst1.16 {q0}, [r0], r3
++ vst1.16 {q0}, [r2], r3
++ vst1.16 {q0}, [r0]
++ vst1.16 {q0}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_10 - DC fill of an 8x8 block with smoothed top row / left column, 16-bit pels
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_8_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q0}, [r1] @ q0 = top[0..7]
++ mov r1, #2 @ weight for lane 0, inserted into d4[0] below
++ vld1.16 {q8}, [r2] @ q8 = left[0..7]
++T lsl r3, #1 @ NOTE(review): T/A lines presumably Thumb-only/ARM-only (asm.S) - confirm
++ vmov.i16 q2, #3
++A add r2, r0, r3, lsl #1 @ r2 = r0 + 2 * r3: pointer to the second row
++T add r2, r0, r3
++ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0]
++A lsl r3, #2 @ r3 *= 4: r0 and r2 each step two rows per store
++T lsl r3, #1
++ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4 @ dc = rounded sum / 16, in every lane of d6
++ vmla.i16 q8, q2, d6[0] @ left + weighted dc
++ vmla.i16 q0, q2, d6[0] @ top + weighted dc
++ vdup.16 q2, d6[0] @ q2 = dc row template for rows written via r0
++ vdup.16 q9, d6[0] @ q9 = dc row template for rows written via r2
++ vrshr.u16 q8, q8, #2 @ q8 = smoothed left column
++ vrshr.u16 q0, q0, #2 @ q0 = smoothed top row
++ vext.16 q1, q8, q8, #1 @ q1 = left column rotated: row 1 pel in lane 0
++
++ @ Store top line
++ vst1.16 {q0}, [r0], r3
++
++ @ Store the rest
++ vbit d18, d2, d7 @ lane 0 of row 1 = its smoothed left pel
++ vst1.16 {q9}, [r2], r3
++ mov r1, #6 @ remaining rows
++1:
++ vext.16 q8, q8, q8, #2 @ next even-row left pel into lane 0
++ subs r1, #2 @ 2 rows are written per pass
++ vext.16 q1, q1, q1, #2 @ next odd-row left pel into lane 0
++ vbit d4, d16, d7
++ vst1.16 {q2}, [r0], r3
++ vbit d18, d2, d7
++ vst1.16 {q9}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_10 - DC fill of 8 rows x 8 interleaved U/V pairs, 16-bit samples
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q0-q1}, [r1] @ 32 bytes of top (8 U/V pairs)
++ mov r1, #8 @ r1 is reused as the row counter
++ vld1.16 {q2-q3}, [r2] @ 32 bytes of left
++T lsl r3, #2 @ NOTE(review): T/A lines presumably Thumb-only/ARM-only (asm.S) - confirm
++ vadd.i16 q1, q0 @ U and V stay in alternate lanes throughout
++A add r2, r0, r3, lsl #2 @ r2 = r0 + 4 * r3: pointer to the second row
++A lsl r3, #3 @ r3 *= 8: r0 and r2 each step two rows per store
++T add r2, r0, r3
++T lsl r3, #1
++ vadd.i16 q2, q3
++ vadd.i16 q1, q2 @ top + left
++ vadd.i16 d3, d2 @ d3 has 2 val pairs
++ vpadd.i32 d2, d3, d3 @ This adds U & V separately
++ vpadd.i32 d3, d3, d3 @ q1 = (sum U, sum V) in every 32-bit lane
++ vrshr.u16 q0, q1, #4 @ rounded divide by 16
++ vrshr.u16 q1, q1, #4 @ q0-q1 = one full 32-byte output row
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2 @ 2 rows are written per pass
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_10 - DC fill of a 16x16 block with smoothed top row / left column, 16-bit pels
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q8-q9}, [r1] @ q8-q9 = top[0..15]
++ mov r1, #2 @ weight for lane 0, inserted into d6[0] below
++ vld1.16 {q10-q11}, [r2] @ q10-q11 = left[0..15]
++ lsl r3, #1 @ stride given in pels
++ vadd.i16 q0, q8, q9
++ vadd.i16 q1, q10, q11
++ vmov.i16 q3, #3
++ vadd.i16 q1, q0 @ 8 partial sums covering all 32 edge pels
++ vadd.i16 d0, d16, d20 @ d0[0] = top[0] + left[0]
++ vmov.i64 d31, #0xffff @ mask selecting 16-bit lane 0
++ vadd.i16 d3, d2 @ 4 partial sums
++ vmov.16 d6[0], r1 @ 2, 3, 3, 3...
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ topline[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7]
++ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d3, d3 @ 1 (all the same)
++ vrshr.u16 d2, d3, #5 @ dc = rounded sum / 32 ...
++ vrshr.u16 d3, d3, #5 @ ... in every lane of q1
++ vmov q0, q1 @ q0-q1 = dc row template for rows 1..15
++ vmla.i16 q10, q1, d6[1] @ left[0..7] + 3 * dc
++ vmla.i16 q11, q1, d6[1] @ left[8..15] + 3 * dc
++ vmla.i16 q8, q1, q3 @ top[0..7] + (2, 3, 3, ...) * dc
++ vmla.i16 q9, q1, d6[1] @ top[8..15] + 3 * dc
++ vrshr.u16 q2, q10, #2 @ q2 = smoothed left[0..7]
++ vrshr.u16 q3, q11, #2 @ q3 = smoothed left[8..15]
++ vrshr.u16 q8, #2 @ q8-q9 = smoothed top row
++ vrshr.u16 q9, #2
++ vext.16 q2, q2, q2, #1 @ rotate so the pel for row 1 is in lane 0
++ mov r1, #7<<29 @ counter in the top 3 bits: 7 passes, then wraps for 8 more
++
++ @ Store top line
++ vst1.16 {q8-q9}, [r0], r3
++
++ @ Store the rest
++1:
++ vbit d0, d4, d31 @ lane 0 of the row = smoothed left pel (rows 1..7)
++ vext.16 q2, q2, q2, #1
++ subs r1, #1<<29
++ vst1.16 {q0-q1}, [r0], r3
++ bne 1b
++1:
++ vbit d0, d6, d31 @ lane 0 of the row = smoothed left pel (rows 8..15)
++ vext.16 q3, q3, q3, #1
++ subs r1, #1<<29 @ starts from 0, so reaches 0 again after 8 passes
++ vst1.16 {q0-q1}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_10 - DC fill of 16 rows x 16 interleaved U/V pairs, 16-bit samples
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1
++
++ @ Average the els of top & left
++ vldm r1, {q0-q3} @ 64 bytes of top (16 U/V pairs)
++ vldm r2, {q8-q11} @ 64 bytes of left
++ vadd.i16 q0, q1 @ U and V stay in alternate lanes throughout
++ mov r1, #16 @ r1 is reused as the row counter
++ vadd.i16 q2, q3
++ add r2, r0, #32 @ r2 = second 32-byte half of the same row
++ vadd.i16 q8, q9
++ lsl r3, #2 @ pels -> bytes
++ vadd.i16 q10, q11
++ vadd.u16 q0, q2
++ vadd.u16 q8, q10
++ vadd.i16 q0, q8 @ top + left
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d4, d0, d0 @ This adds U & V separately
++ vpadd.i32 d5, d0, d0 @ q2 = (sum U, sum V) in every 32-bit lane
++ vrshr.u16 q0, q2, #5 @ rounded divide by 32
++ vrshr.u16 q1, q2, #5 @ q0-q1 = half an output row
++
++ @ Store
++1:
++ vst1.16 {q0-q1}, [r0], r3
++ subs r1, #1 @ one 64-byte row per pass (two 32-byte stores)
++ vst1.16 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_10 - DC fill of a 32x32 block, 16-bit pels (no edge smoothing)
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels)
++
++function ff_hevc_rpi_pred_dc_32_neon_10, export=1
++
++ @ Average the els of top & left
++ @ With 10 bits we are (just) safe from overflow in i16
++ vldm r1, {q0-q3} @ top[0..31]
++ vldm r2, {q8-q11} @ left[0..31]
++ vadd.i16 q0, q1
++ mov r1, #32 @ r1 is reused as the row counter
++ vadd.i16 q2, q3
++ add r2, r0, #32 @ r2 = second 32-byte half of the same row
++ vadd.i16 q8, q9
++ lsl r3, #1 @ pels -> bytes
++ vadd.i16 q10, q11
++ vadd.u16 q0, q2
++ vadd.u16 q8, q10
++ vadd.i16 q0, q8 @ top + left
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ vpadd.i16 d4, d0, d0 @ 1 (all the same)
++ vpadd.i16 d5, d0, d0 @ q2 = total of all 64 edge pels in every lane
++ vrshr.u16 q0, q2, #6 @ rounded divide by 64
++ vrshr.u16 q1, q2, #6 @ q0-q1 = half an output row
++
++ @ Store
++1:
++ vst1.16 {q0-q1}, [r0], r3
++ subs r1, #1 @ one 64-byte row per pass (two 32-byte stores)
++ vst1.16 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
+@@ -0,0 +1,881 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ All functions have the call
++@
++@ int ff_hevc_rpi_intra_filter_N_neon_PW(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++@
++@ Assumptions:
++@ (that wouldn't apply to all frame layouts but do apply to sand, so beware
++@ if reusing this code)
++@
++@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
++@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore
++@ N==8,PW=8 (chroma always PW>8) but have to cope for larger
++@
++@ We always have at least 64 pixel H frame width rounding - this lets us
++@ load UR without having to worry about exactly how many pixels are actually
++@ within the frame. As partial loads will only occur very occasionally this
++@ should be a win in nearly all cases.
++@
++@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
++@ so we do no maths on the contents
++@
++@ No filtering in 32bit fns as they are chroma only
++
++
++.equ AVAIL_UR, 1 @ bit 0 of req / avail: up-right
++.equ AVAIL_U, 2 @ bit 1: up
++.equ AVAIL_UL, 4 @ bit 2: up-left
++.equ AVAIL_L, 8 @ bit 3: left
++.equ AVAIL_DL, 16 @ bit 4: down-left
++
++.equ FILTER_LIGHT, 0x40 @ bit tested in req to enable the light filter
++.equ FILTER_STRONG, 0x80 @ not referenced by the functions in this part of the file
++
++.equ AVAIL_S_UR_N_U_C, 32 - 1 @ lsls by this: UR -> N flag, U -> C flag
++.equ AVAIL_S_U_N_UL_C, 32 - 2 @ lsls by this: U -> N flag, UL -> C flag
++.equ AVAIL_S_UL_N_L_C, 32 - 3 @ lsls by this: UL -> N flag, L -> C flag
++.equ AVAIL_S_L_N_DL_C, 32 - 4 @ lsls by this: L -> N flag, DL -> C flag
++
++.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr
++
++@ On entry
++@ r2 req
++@ r3 avail
++@ [sp, #sp_offset...] args
++@
++@ On Exit:
++@
++@ Extend values:
++@ d_l scalar contains value for L & DL
++@ if DL avail then this is DL[0] so we don't need to load that
++@ d_ul scalar containing value for UL
++@ d_u scalar containing value for U
++@ d_ur scalar containing value for UR
++@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
++@ This means that L-light-filter works even if nreq DL (we never filter
++@ req-DL without req-L, but we do filter req-L without req-DL)
++@ If UR avail then d_ur == a_ur so U-filter good too
++@
++@ Data load pointers (only load if req & avail):
++@ r4 DL + stride
++@ r10 L
++@ r6 U
++@ r5 UR
++@
++@ Others:
++@ r2 req
++@ r7 req & avail
++@ r3 L + stride
++@ r8 DL + stride * 2
++@ r9 stride * 2
++@ cs Load U
++@ mi Load UR
++@
++@ Clobbered:
++@ r12
++
++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur
++
++.equ src_l\@, \sp_offset + 0
++.equ src_u\@, \sp_offset + 4
++.equ src_ur\@, \sp_offset + 8
++.equ stride\@, \sp_offset + 12
++.equ pw\@, (1 << \pw_s) @ pel width in bytes
++.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes
++
++@ r9 stride
++@ r7 = ab_ul, r6 = a_u, r5 = a_ur
++@ r4 = b_dl, r10 = b_l, r8 = b_u
++
++ ldr r5, [sp, #src_ur\@]
++ lsl r12, r3, #AVAIL_S_U_DL_CPSR @ avail bits 4..0 moved up to bits 31..27
++ ldr r10, [sp, #src_l\@]
++ ldr r9, [sp, #stride\@]
++ ldr r6, [sp, #src_u\@]
++
++ @ This is quite a slow instruction but it replaces
++ @ a decent number of tests that yield a max of 2 flags/op
++ @ It is annoying we can't branch on Q!
++ @ If L navail (ne) then DL must be navail (pl)
++ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur
++
++ mov r4, r5 @ r4 = fallback source for L: UR ...
++ sub r7, r10, r9 @ r7 = src_l - stride (the up-left pel)
++ it vs
++ movvs r4, r6 @ ... replaced by U if U is available
++ add r8, r6, #b_size\@ - pw\@ @ r8 = last pel of U
++ it cs
++ movcs r4, r7 @ ... replaced by UL if UL is available
++ ite ne
++ movne r10, r4 @ L not available: r10 = fallback source
++ addeq r4, r7, r9, lsl #\log2_s @ L available: r4 = last row of L
++ it cc
++ movcc r7, r10 @ UL not available: take its value via r10
++ it mi
++ addmi r4, r10, r9, lsl #\log2_s @ DL available: r4 = first row of DL
++ vld1.\d_type {\d_ul}, [r7]
++ itt vc
++ movvc r8, r7 @ U not available: U and the UR fallback both use r7
++ movvc r6, r7
++ vld1.\d_type {\d_l }, [r4], r9 @ r4 is post-incremented by stride
++ tst r3, #AVAIL_UR
++ vld1.\d_type {\d_u }, [r6]
++ it eq
++ moveq r5, r8 @ UR not available: extend from r8 instead
++ and r7, r2, r3 @ r7 = req & avail
++ add r8, r4, r9 @ r8 = r4 + stride
++ vld1.\d_type {\d_ur}, [r5]
++ lsls r12, r7, #AVAIL_S_UR_N_U_C @ exit flags: C = load U, N = load UR
++ add r3, r10, r9 @ r3 = L + stride
++ lsl r9, #1 @ r9 = stride * 2
++.endm
++
++
++
++@ int ff_hevc_rpi_intra_filter_4_neon_8(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set pw_s, 0 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 2 @ log2 of block size in pels
++
++function ff_hevc_rpi_intra_filter_4_neon_8, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[]
++
++ it cs
++ vldrcs s2, [r6] @ U wanted & available: low half of d1 = 4 up pels
++ ite pl
++ vmovpl s3, s4 @ no UR load: high half of d1 = replicated UR value from d2
++ vldrmi s3, [r5] @ UR wanted & available: high half of d1 = 4 up-right pels
++
++ lsls r7, #AVAIL_S_L_N_DL_C @ N = load L, C = load DL
++ add r12, r0, #-pw @ r12 = one pel before left[]: where UL is stored
++ bpl 1f
++
++ vld1.8 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows (r9 = 2 * stride)
++ vld1.8 {d0[1]}, [r3], r9
++ vld1.8 {d0[2]}, [r10]
++ vld1.8 {d0[3]}, [r3]
++1:
++ bcc 1f
++ vld1.8 {d0[5]}, [r4], r9 @ d0[4] already holds DL[0] from load_pointers
++ vld1.8 {d0[6]}, [r8]
++ vld1.8 {d0[7]}, [r4]
++1:
++ vstr d1, [r1] @ Up
++ vst1.8 {d31[7]}, [r12] @ Up-left
++ vstr d0, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_4_neon_16(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set pw_s, 1 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 2 @ log2 of block size in pels
++
++function ff_hevc_rpi_intra_filter_4_neon_16, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[]
++
++ it cs
++ vldrcs d2, [r6] @ U wanted & available: d2 = 4 up pels
++ it mi
++ vldrmi d3, [r5] @ UR wanted & available: d3 = 4 up-right pels
++ lsls r7, #AVAIL_S_L_N_DL_C @ N = load L, C = load DL
++ add r12, r0, #-pw @ r12 = one pel before left[]: where UL is stored
++ bpl 1f
++ vld1.16 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows (r9 = 2 * stride)
++ vld1.16 {d0[1]}, [r3], r9
++ vld1.16 {d0[2]}, [r10]
++ vld1.16 {d0[3]}, [r3]
++1:
++ bcc 1f
++ vld1.16 {d1[1]}, [r4], r9 @ d1[0] already holds DL[0] from load_pointers
++ vld1.16 {d1[2]}, [r8]
++ vld1.16 {d1[3]}, [r4]
++1:
++ vst1.16 {q1}, [r1] @ Up
++ vst1.16 {d31[3]}, [r12] @ Up-left
++ vst1.16 {q0}, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_8(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set pw_s, 0 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 3 @ log2 of block size in pels
++
++function ff_hevc_rpi_intra_filter_8_neon_8, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[]
++
++ it cs
++ vldrcs d4, [r6] @ U wanted & available: d4 = 8 up pels
++ it mi
++ vldrmi d5, [r5] @ UR wanted & available: d5 = 8 up-right pels
++
++ lsls r7, #AVAIL_S_L_N_DL_C @ N = load L, C = load DL
++ bpl 1f
++ vld1.8 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows (r9 = 2 * stride)
++ vld1.8 {d0[1]}, [r3], r9
++ vld1.8 {d0[2]}, [r10], r9
++ vld1.8 {d0[3]}, [r3], r9
++ vld1.8 {d0[4]}, [r10], r9
++ vld1.8 {d0[5]}, [r3], r9
++ vld1.8 {d0[6]}, [r10]
++ vld1.8 {d0[7]}, [r3]
++1:
++ bcc 1f
++ vld1.8 {d1[1]}, [r4], r9 @ d1[0] already holds DL[0] from load_pointers
++ vld1.8 {d1[2]}, [r8], r9
++ vld1.8 {d1[3]}, [r4], r9
++ vld1.8 {d1[4]}, [r8], r9
++ vld1.8 {d1[5]}, [r4], r9
++ vld1.8 {d1[6]}, [r8]
++ vld1.8 {d1[7]}, [r4]
++1:
++ tst r2, #FILTER_LIGHT
++ add r12, r0, #-pw @ r12 = one pel before left[]: where UL is stored
++ beq 10f
++
++ @ Luma light filter
++ vext.8 q8, q15, q2, #15 @ q8 = UL, up[0..14] (up shifted by one pel)
++ vext.8 q12, q15, q0, #15 @ q12 = UL, left[0..14]
++ vaddl.u8 q9, d17, d5 @ q8-q9 = up[i-1] + up[i], 16-bit
++ vaddl.u8 q8, d16, d4
++ vaddl.u8 q13, d25, d1 @ q12-q13 = left[i-1] + left[i], 16-bit
++ vaddl.u8 q12, d24, d0
++ vmov.u8 r3, d5[7] @ Save final pel
++ vmov.u8 r2, d1[7] @ Save final pel
++
++ vext.16 q2, q8, q9, #1 @ up[i] + up[i+1]
++ vext.16 q3, q9, q9, #1
++ vext.16 q0, q12, q13, #1 @ left[i] + left[i+1]
++ vext.16 q1, q13, q13, #1
++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
++ vadd.u16 q2, q8 @ up[i-1] + 2 * up[i] + up[i+1]
++ vadd.u16 q3, q9
++ vadd.u16 q0, q12 @ left[i-1] + 2 * left[i] + left[i+1]
++ vadd.u16 q1, q13
++
++ vrshrn.u16 d4, q2, #2 @ rounded divide by 4, back to bytes
++ vrshrn.u16 d5, q3, #2
++ vrshrn.u16 d0, q0, #2
++ vrshrn.u16 d1, q1, #2
++ vrshr.u16 d30, #2
++ vmov.u8 d5[7], r3 @ Restore final pel
++ vmov.u8 d1[7], r2 @ Restore final pel
++ vdup.u8 d31, d30[0] @ d31[7] = d30[0]
++
++10:
++ vst1.8 {q2 }, [r1] @ Up
++ vst1.8 {d31[7]}, [r12] @ Up-left
++ vst1.8 {q0 }, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_16(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set ur_size, sp_base + 16 @ stack offset of top_right_size
++.set dl_size, sp_base + 20 @ stack offset of down_left_size
++.set pw_s, 1 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 3 @ log2 of block size in pels
++.set p_size, (1 << log2_s) @ size in pels
++
++function ff_hevc_rpi_intra_filter_8_neon_16, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]"
++
++ it cs
++ vldmcs r6, {d4, d5} @ U wanted & available: q2 = 8 up pels
++ ldr r12, [sp, #ur_size]
++ bpl 1f @ no UR load: q3 keeps the replicated UR value
++ cmp r12, #4
++ vldm r5, {d6, d7} @ q3 = 8 up-right pels
++ bgt 1f
++ vdup.16 d7, d6[3] @ only 4 pels valid: pad the rest with the last one
++1:
++ lsls r12, r7, #AVAIL_S_L_N_DL_C @ N = load L, C = load DL
++ vdup.16 q1, d0[0] @ q1 (down-left half) = replicated L/DL value
++ bpl 1f
++ vld1.16 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows (r9 = 2 * stride)
++ vld1.16 {d0[1]}, [r3], r9
++ vld1.16 {d0[2]}, [r10], r9
++ vld1.16 {d0[3]}, [r3], r9
++ vld1.16 {d1[0]}, [r10], r9
++ vld1.16 {d1[1]}, [r3], r9
++ vld1.16 {d1[2]}, [r10]
++ vld1.16 {d1[3]}, [r3]
++1:
++ bcc 1f
++ ldr r12, [sp, #dl_size]
++ vld1.16 {d2[1]}, [r4], r9 @ d2[0] already holds DL[0] from load_pointers
++ cmp r12, #p_size
++ vld1.16 {d2[2]}, [r8], r9
++ vld1.16 {d2[3]}, [r4], r9
++ blt 2f @ fewer than 8 rows of DL: pad the second half
++ vld1.16 {d3[0]}, [r8], r9
++ vld1.16 {d3[1]}, [r4], r9
++ vld1.16 {d3[2]}, [r8]
++ vld1.16 {d3[3]}, [r4]
++ b 1f
++2:
++ vdup.16 d3, d2[3]
++1:
++ tst r2, #FILTER_LIGHT
++ add r12, r0, #-pw @ r12 = one pel before left[]: where UL is stored
++ beq 10f
++
++ @ Luma light filter
++ vext.16 q9, q2, q3, #7 @ q8-q9 = UL, up[0..14] (up shifted by one pel)
++ vext.16 q8, q15, q2, #7
++ vext.16 q13, q0, q1, #7 @ q12-q13 = UL, left[0..14]
++ vext.16 q12, q15, q0, #7
++ vadd.u16 q9, q3 @ up[i-1] + up[i]
++ vadd.u16 q8, q2
++ vadd.u16 q13, q1 @ left[i-1] + left[i]
++ vadd.u16 q12, q0
++ vmov.u16 r3, d7[3] @ Save final pel
++ vmov.u16 r2, d3[3] @ Save final pel
++
++ vext.16 q2, q8, q9, #1 @ up[i] + up[i+1]
++ vext.16 q3, q9, q9, #1
++ vext.16 q0, q12, q13, #1 @ left[i] + left[i+1]
++ vext.16 q1, q13, q13, #1
++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
++ vadd.u16 q2, q8 @ up[i-1] + 2 * up[i] + up[i+1]
++ vadd.u16 q3, q9
++ vadd.u16 q0, q12 @ left[i-1] + 2 * left[i] + left[i+1]
++ vadd.u16 q1, q13
++
++ vrshr.u16 q2, #2 @ rounded divide by 4
++ vrshr.u16 q3, #2
++ vrshr.u16 q0, #2
++ vrshr.u16 q1, #2
++ vrshr.u16 d30, #2
++ vmov.u16 d7[3], r3 @ Restore final pel
++ vmov.u16 d3[3], r2 @ Restore final pel
++ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
++
++10:
++ vst1.16 {q2, q3}, [r1] @ Up
++ vst1.16 {d31[3]}, [r12] @ Up-left
++ vst1.16 {q0, q1}, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++@ int ff_hevc_rpi_intra_filter_16_neon_16(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set ur_size, sp_base + 16 @ stack offset of top_right_size
++.set dl_size, sp_base + 20 @ stack offset of down_left_size
++.set pw_s, 1 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 4 @ log2 of block size in pels
++.set p_size, (1 << log2_s) @ size in pels
++
++function ff_hevc_rpi_intra_filter_16_neon_16, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]"
++
++ vdup.16 q9, d16[0] @ q8-q9 = replicated U value
++ vdup.16 q11, d20[0] @ q10-q11 = replicated UR value
++
++ it cs
++ vldmcs r6, {d16-d19} @ U wanted & available: q8-q9 = 16 up pels
++ ldr r12, [sp, #ur_size]
++ bpl 1f @ no UR load: keep the replicated value
++ cmp r12, #12
++ @ Given chroma frame layout, if UR exists then it is always legit to
++ @ load all of it even if most of it is outside the frame.
++ vldm r5, {d20-d23}
++ bgt 1f @ more than 12 pels valid: nothing to pad
++ bge 4f @ exactly 12: pad d23 only
++ cmp r12, #8
++ bge 3f @ 8 valid: pad d22-d23
++ vdup.16 d21, d20[3] @ each pad copies the last pel of the previous 4
++3: vdup.16 d22, d21[3]
++4: vdup.16 d23, d22[3]
++
++1:
++ lsls r7, #AVAIL_S_L_N_DL_C @ N = load L, C = load DL
++ ldr r12, [sp, #dl_size]
++ vdup.16 q1, d0[0] @ q1-q3 = replicated L/DL value
++ vdup.16 q2, d0[0]
++ vdup.16 q3, d0[0]
++ bpl 1f
++ vld1.16 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows (r9 = 2 * stride)
++ vld1.16 {d0[1]}, [r3], r9
++ vld1.16 {d0[2]}, [r10], r9
++ vld1.16 {d0[3]}, [r3], r9
++ vld1.16 {d1[0]}, [r10], r9
++ vld1.16 {d1[1]}, [r3], r9
++ vld1.16 {d1[2]}, [r10], r9
++ vld1.16 {d1[3]}, [r3], r9
++ vld1.16 {d2[0]}, [r10], r9
++ vld1.16 {d2[1]}, [r3], r9
++ vld1.16 {d2[2]}, [r10], r9
++ vld1.16 {d2[3]}, [r3], r9
++ vld1.16 {d3[0]}, [r10], r9
++ vld1.16 {d3[1]}, [r3], r9
++ vld1.16 {d3[2]}, [r10]
++ vld1.16 {d3[3]}, [r3]
++1:
++ bcc 1f
++ vld1.16 {d4[1]}, [r4], r9 @ d4[0] already holds DL[0] from load_pointers
++ cmp r12, #4
++ vld1.16 {d4[2]}, [r8], r9
++ vld1.16 {d4[3]}, [r4], r9
++ ble 2f @ 4 rows of DL: pad d5-d7
++ vld1.16 {d5[0]}, [r8], r9
++ vld1.16 {d5[1]}, [r4], r9
++ cmp r12, #12
++ vld1.16 {d5[2]}, [r8], r9
++ vld1.16 {d5[3]}, [r4], r9
++ blt 3f @ 8 rows of DL: pad d6-d7
++ vld1.16 {d6[0]}, [r8], r9
++ vld1.16 {d6[1]}, [r4], r9
++ vld1.16 {d6[2]}, [r8], r9
++ vld1.16 {d6[3]}, [r4], r9
++ ble 4f @ 12 rows of DL: pad d7 (flags still from cmp #12)
++ vld1.16 {d7[0]}, [r8], r9
++ vld1.16 {d7[1]}, [r4], r9
++ vld1.16 {d7[2]}, [r8]
++ vld1.16 {d7[3]}, [r4]
++ b 1f
++2: vdup.16 d5, d4[3]
++3: vdup.16 d6, d5[3]
++4: vdup.16 d7, d6[3]
++1:
++ tst r2, #FILTER_LIGHT
++ add r12, r0, #-pw @ r12 = one pel before left[]: where UL is stored
++ beq 10f
++
++ vpush {q5} @ q5 is used as scratch below and restored by vpop
++ @ Luma light filter
++ @ Left
++ vext.16 q5, q2, q3, #7 @ q12,q13,q14,q5 = UL, left[0..30] (shifted by one pel)
++ vext.16 q14, q1, q2, #7
++ vext.16 q13, q0, q1, #7
++ vext.16 q12, q15, q0, #7
++
++ vadd.u16 q5, q3 @ left[i-1] + left[i]
++ vadd.u16 q14, q2
++ vadd.u16 q13, q1
++ vadd.u16 q12, q0
++ vmov.u16 r2, d7[3] @ Save final pel
++
++ vext.16 q0, q12, q13, #1 @ left[i] + left[i+1]
++ vext.16 q1, q13, q14, #1
++ vext.16 q2, q14, q5, #1
++ vext.16 q3, q5, q5, #1
++
++ vmov d30, d24 @ d30[0] = l[0] + ul
++ vadd.u16 q0, q12 @ left[i-1] + 2 * left[i] + left[i+1]
++ vadd.u16 q1, q13
++ vadd.u16 q2, q14
++ vadd.u16 q3, q5
++
++ vrshr.u16 q0, #2 @ rounded divide by 4
++ vrshr.u16 q1, #2
++ vrshr.u16 q2, #2
++ vrshr.u16 q3, #2
++
++ @ Up
++ vext.16 q5, q10, q11, #7 @ q12,q13,q14,q5 = UL, up[0..30] (shifted by one pel)
++ vext.16 q14, q9, q10, #7
++ vext.16 q13, q8, q9, #7
++ vext.16 q12, q15, q8, #7
++
++ vadd.u16 q5, q11 @ up[i-1] + up[i]
++ vadd.u16 q14, q10
++ vadd.u16 q13, q9
++ vadd.u16 q12, q8
++ vmov.u16 r3, d23[3] @ Save final pel
++
++ vext.16 q8, q12, q13, #1 @ up[i] + up[i+1]
++ vext.16 q9, q13, q14, #1
++ vext.16 q10, q14, q5, #1
++ vext.16 q11, q5, q5, #1
++
++ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0]
++ vadd.u16 q8, q12 @ up[i-1] + 2 * up[i] + up[i+1]
++ vadd.u16 q9, q13
++ vadd.u16 q10, q14
++ vadd.u16 q11, q5
++
++ vrshr.u16 q8, #2 @ rounded divide by 4
++ vrshr.u16 q9, #2
++ vrshr.u16 q10, #2
++ vrshr.u16 q11, #2
++
++ @ Misc
++ vrshr.u16 d30, #2
++ vmov.u16 d7[3], r2 @ Restore final pel
++ vmov.u16 d23[3], r3 @ Restore final pel
++ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
++ vpop {q5}
++
++10:
++ vstm r1, {d16-d23} @ Up
++ vst1.16 {d31[3]}, [r12] @ Up-left
++ vstm r0, { d0-d7 } @ Left
++ pop {r4-r10, pc}
++endfunc
++
++@ int ff_hevc_rpi_intra_filter_4_neon_32(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set pw_s, 2 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 2 @ log2 of block size in pels
++
++function ff_hevc_rpi_intra_filter_4_neon_32, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]"
++
++ it cs
++ vldmcs r6, {d4, d5} @ U wanted & available: q2 = 4 up pels
++ it mi
++ vldmmi r5, {d6, d7} @ UR wanted & available: q3 = 4 up-right pels
++ lsls r7, #AVAIL_S_L_N_DL_C @ N = load L, C = load DL
++ vdup.32 q1, d0[0] @ q1 (down-left half) = replicated L/DL value
++ add r12, r0, #-pw @ r12 = one pel before left[]: where UL is stored
++ bpl 1f
++ vld1.32 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows (r9 = 2 * stride)
++ vld1.32 {d0[1]}, [r3], r9
++ vld1.32 {d1[0]}, [r10]
++ vld1.32 {d1[1]}, [r3]
++1:
++ bcc 1f
++ vld1.32 {d2[1]}, [r4], r9 @ d2[0] already holds DL[0] from load_pointers
++ vld1.32 {d3[0]}, [r8]
++ vld1.32 {d3[1]}, [r4]
++1:
++ vst1.32 {q2, q3 }, [r1] @ Up
++ vst1.32 {d31[1]}, [r12] @ Up-left
++ vst1.32 {q0, q1 }, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_32(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set ur_size, sp_base + 16 @ stack offset of top_right_size
++.set dl_size, sp_base + 20 @ stack offset of down_left_size
++.set pw_s, 2 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 3 @ log2 of block size in pels
++.set p_size, (1 << log2_s) @ size in pels
++
++function ff_hevc_rpi_intra_filter_8_neon_32, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]"
++
++ vdup.32 q9, d16[0] @ q8-q9 = replicated U value
++ vdup.32 q11, d20[0] @ q10-q11 = replicated UR value
++
++ it cs
++ vldmcs r6, {q8, q9 } @ U wanted & available: q8-q9 = 8 up pels
++ ldr r12, [sp, #ur_size]
++ bpl 1f @ no UR load: keep the replicated value
++ cmp r12, #p_size
++ vldm r5, {q10, q11}
++ bge 1f
++ vdup.32 q11, d21[1] @ fewer than 8 valid: pad second half with last pel of q10
++1:
++ lsls r7, #AVAIL_S_L_N_DL_C @ N = load L, C = load DL
++ vdup.32 q1, d0[0] @ q1-q3 = replicated L/DL value
++ vdup.32 q2, d0[0]
++ vdup.32 q3, d0[0]
++ bpl 1f
++ vld1.32 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows (r9 = 2 * stride)
++ vld1.32 {d0[1]}, [r3], r9
++ vld1.32 {d1[0]}, [r10], r9
++ vld1.32 {d1[1]}, [r3], r9
++ vld1.32 {d2[0]}, [r10], r9
++ vld1.32 {d2[1]}, [r3], r9
++ vld1.32 {d3[0]}, [r10]
++ vld1.32 {d3[1]}, [r3]
++1:
++ bcc 1f
++ ldr r12, [sp, #dl_size]
++ vld1.32 {d4[1]}, [r4], r9 @ d4[0] already holds DL[0] from load_pointers
++ cmp r12, #p_size
++ vld1.32 {d5[0]}, [r8], r9
++ vld1.32 {d5[1]}, [r4], r9
++ blt 2f @ fewer than 8 rows of DL: pad the second half
++ vld1.32 {d6[0]}, [r8], r9
++ vld1.32 {d6[1]}, [r4], r9
++ vld1.32 {d7[0]}, [r8]
++ vld1.32 {d7[1]}, [r4]
++ b 1f
++2:
++ vdup.32 q3, d5[1]
++1:
++ add r12, r0, #-pw @ r12 = one pel before left[]: where UL is stored
++ vstm r1, { q8-q11} @ Up
++ vst1.32 {d31[1]}, [r12] @ Up-left
++ vstm r0, { q0-q3 } @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_16_neon_32(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set ur_size, sp_base + 16 @ stack offset of top_right_size
++.set dl_size, sp_base + 20 @ stack offset of down_left_size
++.set pw_s, 2 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 4 @ log2 of block size in pels
++.set p_size, (1 << log2_s) @ size in pels
++
++function ff_hevc_rpi_intra_filter_16_neon_32, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]
++
++ @ Once we get this big we have run out of neon regs to store
++ @ everything at once so do in pieces
++
++ @ Up (have)
++ it cs
++ vldmcs r6, { q0-q3 }
++ ldr r12, [sp, #ur_size]
++ it mi
++ vldmmi r5, { q8-q11}
++ it cs
++ vstmcs r1, { q0-q3 }
++ bpl 1f
++ cmp r12, #12
++ add lr, r1, #(pw << log2_s) @ lr = second half of top[] (UR part)
++ bgt 2f @ more than 12 pels valid: nothing to pad
++ beq 3f @ exactly 12: pad q11 only
++ cmp r12, #8
++ bge 4f @ 8 valid: pad q10-q11
++ vdup.32 q9, d17[1] @ pads are 32-bit pels, each from the last pel before it
++4: vdup.32 q10, d19[1] @ q10, not d10: d10 is callee-saved and not the target
++3: vdup.32 q11, d21[1]
++2: vstm lr, { q8-q11}
++1:
++ @ Left (have)
++ add lr, r0, #-pw @ lr = one pel before left[]: where UL is stored
++ lsls r12, r7, #AVAIL_S_L_N_DL_C @ N = load L, C = load DL (r7 is kept)
++ vst1.32 {d30[1]}, [lr] @ UL
++ bpl 1f
++ vld1.32 { d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows (r9 = 2 * stride)
++ vld1.32 { d0[1]}, [r3], r9
++ vld1.32 { d1[0]}, [r10], r9
++ vld1.32 { d1[1]}, [r3], r9
++ vld1.32 { d2[0]}, [r10], r9
++ vld1.32 { d2[1]}, [r3], r9
++ vld1.32 { d3[0]}, [r10], r9
++ vld1.32 { d3[1]}, [r3], r9
++ vld1.32 { d4[0]}, [r10], r9
++ vld1.32 { d4[1]}, [r3], r9
++ vld1.32 { d5[0]}, [r10], r9
++ vld1.32 { d5[1]}, [r3], r9
++ vld1.32 { d6[0]}, [r10], r9
++ vld1.32 { d6[1]}, [r3], r9
++ vld1.32 { d7[0]}, [r10]
++ vld1.32 { d7[1]}, [r3]
++ vstm r0, { q0-q3 }
++1:
++ bcc 1f
++ ldr r12, [sp, #dl_size]
++ vdup.32 d16, d30[0] @ d16[0] = d30[0]
++ add lr, r0, #(pw << log2_s) @ lr = second half of left[] (DL part)
++ vld1.32 {d16[1]}, [r4], r9
++ cmp r12, #4
++ vld1.32 {d17[0]}, [r8], r9
++ vld1.32 {d17[1]}, [r4], r9
++ ble 2f @ 4 rows of DL: pad q9-q11
++ vld1.32 {d18[0]}, [r8], r9
++ vld1.32 {d18[1]}, [r4], r9
++ cmp r12, #12
++ vld1.32 {d19[0]}, [r8], r9
++ vld1.32 {d19[1]}, [r4], r9
++ blt 3f @ 8 rows of DL: pad q10-q11
++ vld1.32 {d20[0]}, [r8], r9
++ vld1.32 {d20[1]}, [r4], r9
++ vld1.32 {d21[0]}, [r8], r9
++ vld1.32 {d21[1]}, [r4], r9
++ ble 4f @ 12 rows of DL: pad q11 (flags still from cmp #12)
++ vld1.32 {d22[0]}, [r8], r9
++ vld1.32 {d22[1]}, [r4], r9
++ vld1.32 {d23[0]}, [r8]
++ vld1.32 {d23[1]}, [r4]
++ b 5f
++2: vdup.32 q9, d17[1]
++3: vdup.32 q10, d19[1]
++4: vdup.32 q11, d21[1]
++5: vstm lr, { q8-q11}
++1:
++ eors r7, r2 @ r7 = bits set in req but not in req & avail
++ beq 99f @ nothing left to synthesise
++
++ lsls r12, r7, #AVAIL_S_UR_N_U_C @ N = fill UR, C = fill U
++ vdup.32 q0, d31[0] @ q0-q3 = replicated U value
++ vdup.32 q1, d31[0]
++ vdup.32 q2, d31[0]
++ vdup.32 q3, d31[0]
++ add lr, r1, #(pw << log2_s)
++ vdup.32 q8, d31[1] @ q8-q11 = replicated UR value
++ vdup.32 q9, d31[1]
++ vdup.32 q10, d31[1]
++ vdup.32 q11, d31[1]
++ it cs
++ vstmcs r1, { q0-q3 }
++ it mi
++ vstmmi lr, { q8-q11}
++
++ lsls r7, #AVAIL_S_L_N_DL_C @ N = fill L, C = fill DL
++ vdup.32 q0, d30[0] @ q0-q3 = replicated L/DL value
++ vdup.32 q1, d30[0]
++ vdup.32 q2, d30[0]
++ vdup.32 q3, d30[0]
++ add lr, r0, #(pw << log2_s)
++ it mi
++ vstmmi r0, { q0-q3 }
++ it cs
++ vstmcs lr, { q0-q3 }
++
++99:
++ pop {r4-r10, pc}
++endfunc
++
++
++
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+@@ -0,0 +1,920 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++/*
++ * Horizontal & Vertical special cases of angular intra pred
++ *
++ * Split out because:
++ * Vertical, at least, is relatively common
++ * Much simpler code than the general angular case
++ * Luma with size < 32 has extra filtering that doesn't happen anywhere else
++ *
++ * *** Currently luma filtering is mandatory where it occurs, but there are
++ * cases where it should be turned off (rdpcm & an extension sps flag).
++ * These don't occur in the standard conformance suite for Main Profile
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ ff_hevc_rpi_pred_vertical_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4x4, 8-bit: every row is a copy of top with column 0 edge filtered
++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.32 {d0[0]}, [r2 :32] @ Left
++ add r2, r0, r3 @ r2 = row 1, r0 = row 0
++ vld1.8 {d1[]}, [r1] @ top[0] in every lane
++ lsl r3, #1 @ each pointer now steps two rows
++ vdup.8 d4, ip
++ vmov.i8 d2, #128
++ vhsub.u8 d4, d0, d4 @ (left[y] - top_left) >> 1
++ veor d1, d2 @ bias by 128 so the signed saturating add clips at the u8 limits
++ vld1.32 {d0[0]}, [r1 :32] @ Top
++ vqadd.s8 d1, d4
++ vmov.i64 d3, #0xff @ mask selecting byte 0 (column 0)
++ vmov d4, d0 @ second copy of top for the rows written through r2
++ veor d5, d1, d2 @ remove bias (copy used for rows 1 and 3)
++ veor d1, d1, d2 @ remove bias (copy used for rows 0 and 2)
++ vbit d0, d1, d3 @ row 0: filtered value into column 0
++ vshr.u64 d5, #8 @ row 1 value down to byte 0
++ vst1.32 {d0[0]}, [r0], r3
++ vshr.u64 d1, #16 @ row 2 value down to byte 0
++ vbit d4, d5, d3
++ vshr.u64 d5, #16 @ row 3 value down to byte 0
++ vst1.32 {d4[0]}, [r2], r3
++ vbit d0, d1, d3
++ vst1.32 {d0[0]}, [r0]
++ vbit d4, d5, d3
++ vst1.32 {d4[0]}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8x8, 8-bit: every row is a copy of top with column 0 edge filtered
++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.8 {d0}, [r2 :64] @ Left
++ vmov.i8 d1, #128
++ vld1.8 {d2[]}, [r1] @ top[0] in every lane
++ vld1.8 {d3}, [r1 :64] @ Top
++ vdup.8 d4, ip
++ vhsub.u8 d4, d0, d4 @ (left[y] - top_left) >> 1
++ veor d2, d1 @ bias by 128 so the signed saturating add clips at the u8 limits
++ vmov.i64 d0, #0xff @ mask selecting byte 0 (column 0)
++ mov r1, #8 @ rows remaining
++ vqadd.s8 d2, d4, d2
++ veor d1, d2, d1 @ remove bias: d1 = column 0 value for each row
++1:
++ vbit d3, d1, d0 @ insert column 0 for this row
++ vshr.u64 d1, #8 @ next row value down to byte 0
++ vst1.8 {d3}, [r0 :64], r3
++ subs r1, #2 @ two rows per iteration
++ vbit d3, d1, d0
++ vshr.u64 d1, #8
++ vst1.8 {d3}, [r0 :64], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16x16, 8-bit: every row is a copy of top with column 0 edge filtered
++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.8 {q0}, [r2 :128] @ Left
++ vdup.8 q1, ip
++ vld1.8 {d4[],d5[]}, [r1] @ top[0] in every lane
++ vhsub.u8 q0, q1 @ (left[y] - top_left) >> 1
++ vmov.i8 q1, #128
++ veor q2, q1 @ bias by 128 so the signed saturating add clips at the u8 limits
++ vmov.i64 d16, #0xff @ mask selecting byte 0 (column 0)
++ vqadd.s8 q0, q2
++ vld1.8 {q3}, [r1 :128] @ Top
++ mov r1, #16 @ rows remaining
++ veor q0, q1 @ remove bias: q0 = column 0 value for each row
++ vmov q1, q3 @ q1 carries even rows, q3 odd rows
++ vext.8 q2, q0, q0, #1 @ q2 = column 0 values rotated so byte 0 is row 1
++1:
++ vbit d2, d0, d16 @ even row: insert column 0
++ vbit d6, d4, d16 @ odd row: insert column 0
++ vext.8 q0, q0, q0, #2 @ rotate to the next even row value
++ subs r1, #2
++ vst1.8 {q1}, [r0 :128], r3
++ vext.8 q2, q2, q2, #2 @ rotate to the next odd row value
++ vst1.8 {q3}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 32x32, 8-bit: plain copy of the top row to every row (left is not read)
++function ff_hevc_rpi_pred_vertical_32_neon_8, export=1
++ vld1.8 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3 @ r2 = odd rows, r0 = even rows
++ lsl r3, #1 @ each pointer steps two rows
++ mov r1, #16 @ 16 iterations of two rows
++1:
++ vst1.8 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.8 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4 rows of 4 two-byte pels: plain copy of the top row (left is not read)
++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
++ vld1.16 {d0 }, [r1 :64] @ Up
++ add r2, r0, r3, lsl #1 @ r2 = row 1: stride is scaled by 2 bytes per pel
++ lsl r3, #2 @ r3 = two rows in bytes
++
++ vst1.16 {d0 }, [r0 :64], r3
++ vst1.16 {d0 }, [r2 :64], r3
++ vst1.16 {d0 }, [r0 :64]
++ vst1.16 {d0 }, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8 rows of 8 two-byte pels: plain copy of the top row (left is not read)
++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #1 @ r2 = row 1: stride is scaled by 2 bytes per pel
++ lsl r3, #2 @ r3 = two rows in bytes
++ mov r1, #4
++1:
++ vst1.16 {q0 }, [r0 :128], r3
++ subs r1, #2 @ two passes, four rows stored per pass
++ vst1.16 {q0 }, [r2 :128], r3
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q0 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16 rows of 16 two-byte pels: plain copy of the top row (left is not read)
++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #1 @ r2 = row 1: stride is scaled by 2 bytes per pel
++ lsl r3, #2 @ r3 = two rows in bytes
++ mov r1, #8 @ 8 iterations of two rows
++1:
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4x4, 8-bit: row y is left[y] replicated, row 0 edge filtered against top
++@ ? Might be faster as simple arm
++
++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.32 {d0[0]}, [r1 :32] @ Top
++ add r1, r2, #3 @ r1 = address of left[3]
++ vld1.8 {d1[]}, [r2]! @ left[0] in every lane
++ vdup.8 d2, ip
++ vmov.i8 d3, #128
++ vhsub.u8 d0, d2 @ (top[x] - top_left) >> 1
++ veor d1, d3 @ bias by 128 so the signed saturating add clips at the u8 limits
++ vld1.8 {d2[]}, [r2]! @ left[1]
++ add ip, r0, r3 @ ip = row 1
++ vqadd.s8 d0, d0, d1
++ lsl r3, #1 @ each pointer steps two rows
++ vld1.8 {d1[]}, [r2] @ left[2]
++ vld1.8 {d4[]}, [r1] @ left[3]
++ veor d0, d3 @ remove bias: filtered row 0
++ vst1.32 {d0[0]}, [r0 :32], r3
++ vst1.32 {d2[0]}, [ip :32], r3
++ vst1.32 {d1[0]}, [r0 :32]
++ vst1.32 {d4[0]}, [ip :32]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8x8, 8-bit: row y is left[y] replicated, row 0 edge filtered against top
++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.8 {d0}, [r1 :64] @ Top
++ vmov.i8 d1, #128
++ vld1.8 {d2[]}, [r2]! @ left[0] in every lane
++ mov r1, #8-2 @ rows 1-6 go through the loop, row 7 is stored after it
++ vdup.8 d3, ip
++ vhsub.u8 d0, d3 @ (top[x] - top_left) >> 1
++ veor d2, d1 @ bias by 128 so the signed saturating add clips at the u8 limits
++ vqadd.s8 d0, d2
++ vld1.8 {d2[]}, [r2]! @ left[1]
++ veor d0, d1 @ remove bias: filtered row 0
++ vst1.8 {d0}, [r0], r3
++1:
++ vld1.8 {d0[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.8 {d2}, [r0 :64], r3
++ vld1.8 {d2[]}, [r2]!
++ vst1.8 {d0}, [r0 :64], r3
++ bne 1b
++
++ vst1.8 {d2}, [r0 :64]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16x16, 8-bit: row y is left[y] replicated, row 0 edge filtered against top
++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.8 {q0}, [r1 :64] @ Top
++ mov r1, #16-2 @ rows 1-14 go through the loop, row 15 is stored after it
++ vld1.8 {d4[],d5[]}, [r2]! @ left[0] in every lane
++ vdup.8 q3, ip
++ vhsub.u8 q0, q3 @ (top[x] - top_left) >> 1
++ vmov.i8 q1, #128
++ veor q2, q1 @ bias by 128 so the signed saturating add clips at the u8 limits
++ vqadd.s8 q0, q2
++ vld1.8 {d4[],d5[]}, [r2]! @ left[1]
++ veor q0, q1 @ remove bias: filtered row 0
++ vst1.8 {q0}, [r0], r3
++1:
++ vld1.8 {d0[],d1[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.8 {q2}, [r0 :64], r3
++ vld1.8 {d4[],d5[]}, [r2]!
++ vst1.8 {q0}, [r0 :64], r3
++ bne 1b
++
++ vst1.8 {q2}, [r0 :64]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 32x32, 8-bit: row y is left[y] replicated, no edge filter (top is not read)
++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
++ vld1.8 {d0[],d1[]}, [r2]! @ left[0] in every lane
++ add ip, r0, #16 @ ip writes the right half of each row
++ mov r1, #32-2 @ rows 1-30 go through the loop, row 31 is stored after it
++ vld1.8 {d2[],d3[]}, [r2]! @ left[1]
++ vst1.8 {q0}, [r0 :128], r3
++ vst1.8 {q0}, [ip :128], r3
++1:
++ vld1.8 {d0[],d1[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.8 {q1}, [r0 :128], r3
++ vst1.8 {q1}, [ip :128], r3
++ vld1.8 {d2[],d3[]}, [r2]!
++ vst1.8 {q0}, [r0 :128], r3
++ vst1.8 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.8 {q1}, [r0 :128]
++ vst1.8 {q1}, [ip :128]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4 rows of 4 two-byte pels: row y is left[y] replicated (top is not read)
++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
++ add r1, r2, #2 @ r1 walks odd left entries, r2 even ones
++ vld1.16 {d0[]}, [r2] @ left[0]
++ add r2, #4
++ vld1.16 {d1[]}, [r1] @ left[1]
++ add r1, #4
++ vld1.16 {d2[]}, [r2] @ left[2]
++A add r2, r0, r3, lsl #1 @ r2 = row 1: stride is scaled by 2 bytes per pel
++T lsl r3, #1
++T add r2, r0, r3 @ same result as the A line, without a shifted operand
++ vld1.16 {d3[]}, [r1] @ left[3]
++A lsl r3, #2 @ r3 = two rows in bytes
++T lsl r3, #1
++ vst1.16 {d0}, [r0 :64], r3
++ vst1.16 {d1}, [r2 :64], r3
++ vst1.16 {d2}, [r0 :64]
++ vst1.16 {d3}, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8 rows of 8 two-byte pels: row y is left[y] replicated (top is not read)
++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
++ vld1.16 {d0[],d1[]}, [r2]! @ left[0] in every lane
++ lsl r3, #1 @ stride is scaled by 2 bytes per pel
++ vld1.16 {d2[],d3[]}, [r2]! @ left[1]
++ mov r1, #8-2 @ rows 1-6 go through the loop, row 7 is stored after it
++ vst1.16 {q0}, [r0 :64], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.16 {q1}, [r0 :64], r3
++ vld1.16 {d2[],d3[]}, [r2]!
++ vst1.16 {q0}, [r0 :64], r3
++ bne 1b
++
++ vst1.16 {q1}, [r0 :64]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16 rows of 16 two-byte pels: row y is left[y] replicated (top is not read)
++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
++ vld1.16 {d0[],d1[]}, [r2]! @ left[0] in every lane
++ lsl r3, #1 @ stride is scaled by 2 bytes per pel
++ add ip, r0, #16 @ ip writes the right half of each row
++ mov r1, #16-2 @ rows 1-14 go through the loop, row 15 is stored after it
++ vld1.16 {d2[],d3[]}, [r2]! @ left[1]
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.16 {q1}, [r0 :128], r3
++ vst1.16 {q1}, [ip :128], r3
++ vld1.16 {d2[],d3[]}, [r2]!
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.16 {q1}, [r0 :128]
++ vst1.16 {q1}, [ip :128]
++ bx lr
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ 10 Bit
++@ Has clipping constants so 10-bit only but could easily be macroed up to
++@ 14-bit before we run out of bits
++
++
++@ ff_hevc_rpi_pred_vertical_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4x4, 16-bit pels: every row is a copy of top, column 0 filtered and clipped to 0..0x3ff
++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {d0}, [r2 :64] @ Left
++ vmov.i16 d2, #0 @ lower clip bound
++ vld1.16 {d1[]}, [r1] @ top[0] in every lane
++T lsl r3, #1
++ vdup.16 d4, ip
++ vmov.i16 d3, #0x3ff @ upper clip bound
++ vld1.16 {d5}, [r1 :64] @ Top
++ vhsub.u16 d4, d0, d4 @ (left[y] - top_left) >> 1
++ vmov.i64 d0, #0xffff @ mask selecting lane 0 (column 0)
++A add r2, r0, r3, lsl #1 @ r2 = row 1: stride is scaled by 2 bytes per pel
++T add r2, r0, r3
++ vadd.i16 d1, d1, d4
++ vmov d6, d5 @ second copy of top for the rows written through r2
++ vmax.s16 d1, d1, d2
++ vmin.s16 d2, d1, d3 @ copy used for rows 1 and 3
++ vmin.s16 d1, d1, d3 @ copy used for rows 0 and 2
++ vbit d5, d1, d0 @ row 0: filtered value into column 0
++A lsl r3, #2 @ r3 = two rows in bytes
++T lsl r3, #1
++ vshr.u64 d2, #16 @ row 1 value down to lane 0
++ vshr.u64 d1, #32 @ row 2 value down to lane 0
++ vbit d6, d2, d0
++ vst1.16 {d5}, [r0], r3
++ vshr.u64 d2, #32 @ row 3 value down to lane 0
++ vst1.16 {d6}, [r2], r3
++ vbit d5, d1, d0
++ vst1.16 {d5}, [r0]
++ vbit d6, d2, d0
++ vst1.16 {d6}, [r2]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8x8, 16-bit pels: every row is a copy of top, column 0 filtered and clipped to 0..0x3ff
++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {q0}, [r2 :128] @ Left
++ lsl r3, #1 @ stride is scaled by 2 bytes per pel
++ vdup.16 q1, ip
++ vld1.16 {d4[],d5[]}, [r1] @ top[0] in every lane
++ vhsub.u16 q0, q0, q1 @ (left[y] - top_left) >> 1
++ vmov.i16 q1, #0 @ lower clip bound
++ vadd.i16 q0, q2
++ vmov.i16 q2, #0x3ff @ upper clip bound
++ vld1.16 {q3}, [r1 :128] @ Top
++ mov r1, #8 @ rows remaining
++ vmax.s16 q0, q1
++ vmov q1, q3 @ q1 carries even rows, q3 odd rows
++ vmin.s16 q0, q2 @ q0 = column 0 value for each row
++ vmov.i64 d16, #0xffff @ mask selecting lane 0 (column 0)
++ vext.16 q2, q0, q0, #1 @ rotated so lane 0 is row 1
++1:
++ vbit d2, d0, d16 @ even row: insert column 0
++ vbit d6, d4, d16 @ odd row: insert column 0
++ vext.16 q0, q0, q0, #2 @ rotate to the next even row value
++ subs r1, #2
++ vst1.16 {q1}, [r0 :128], r3
++ vext.16 q2, q2, q2, #2 @ rotate to the next odd row value
++ vst1.16 {q3}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16x16, 16-bit pels: every row is a copy of top, column 0 filtered and clipped to 0..0x3ff
++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {q0-q1}, [r2 :128] @ Left
++T lsl r3, #1
++ vdup.16 q2, ip
++A add r2, r0, r3, lsl #1 @ r2 = odd rows: stride is scaled by 2 bytes per pel
++T add r2, r0, r3
++ vld1.16 {d6[],d7[]}, [r1] @ top[0] in every lane
++A lsl r3, #2 @ r3 = two rows in bytes
++T lsl r3, #1
++ vhsub.u16 q0, q2 @ (left[y] - top_left) >> 1
++ vhsub.u16 q1, q2
++ vadd.i16 q0, q3
++ vadd.i16 q1, q3
++ vmov.i16 q2, #0 @ lower clip bound
++ vld1.16 {q8-q9}, [r1 :128] @ Top
++ mov r1, #0 @ counter: wraps back to 0 after four subtractions of 1<<30
++ vmov.i16 q3, #0x3ff @ upper clip bound
++ vmax.s16 q0, q2
++ vmax.s16 q1, q2
++ vmin.s16 q0, q3
++ vmin.s16 q1, q3
++ vmov q10, q8 @ q8-q9 carry even rows, q10-q11 odd rows
++ vmov q11, q9
++ vext.16 q2, q0, q1, #1 @ lane 0 is row 1
++ vext.16 q3, q1, q1, #1 @ lane 0 is row 9
++ vmov.i64 d24, #0xffff @ mask selecting lane 0 (column 0)
++1:
++ vbit d16, d0, d24 @ rows 0-7: column 0 values come from q0 / q2
++ vbit d20, d4, d24
++ vext.16 q0, q0, q0, #2
++ subs r1, #1<<30 @ Z set on the fourth pass
++ vst1.16 {q8-q9}, [r0 :128], r3
++ vext.16 q2, q2, q2, #2
++ vst1.16 {q10-q11}, [r2 :128], r3
++ bne 1b
++1:
++ vbit d16, d2, d24 @ rows 8-15: column 0 values come from q1 / q3
++ vbit d20, d6, d24
++ vext.16 q1, q1, q1, #2
++ subs r1, #1<<30 @ r1 is 0 again on entry, so another four passes
++ vst1.16 {q8-q9}, [r0 :128], r3
++ vext.16 q3, q3, q3, #2
++ vst1.16 {q10-q11}, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 32x32, 16-bit pels: plain copy of the top row to every row (left is not read)
++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1
++ vldm r1, { q0-q3 } @ Up
++ lsl r3, #1 @ stride is scaled by 2 bytes per pel
++ mov r1, #32 @ one row per iteration
++ add r2, r0, #32 @ r2 writes the right half of each row
++1:
++ vst1.16 {q0-q1}, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q2-q3}, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4 rows of 4 four-byte pels: plain copy of the top row (left is not read)
++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #2 @ r2 = row 1: stride is scaled by 4 bytes per pel
++ lsl r3, #3 @ r3 = two rows in bytes
++
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q0 }, [r2 :128], r3
++ vst1.16 {q0 }, [r0 :128]
++ vst1.16 {q0 }, [r2 :128]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8 rows of 8 four-byte pels: plain copy of the top row (left is not read)
++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #2 @ r2 = row 1: stride is scaled by 4 bytes per pel
++ lsl r3, #3 @ r3 = two rows in bytes
++ mov r1, #4 @ 4 iterations of two rows
++1:
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16 rows of 16 four-byte pels: plain copy of the top row (left is not read)
++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
++ vldm r1, { q0-q3 } @ Up
++ lsl r3, #2 @ stride is scaled by 4 bytes per pel
++ mov r1, #16 @ one row per iteration
++ add r2, r0, #32 @ r2 writes the right half of each row
++1:
++ vst1.16 {q0-q1}, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q2-q3}, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4x4, 16-bit pels: row y is left[y] replicated, row 0 filtered and clipped to 0..0x3ff
++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {d0}, [r1 :64] @ Top
++ vmov.i16 d1, #0 @ lower clip bound
++ vld1.16 {d2[]}, [r2]! @ left[0] in every lane
++T lsl r3, #1
++ vdup.16 d3, ip
++ vmov.i16 d4, #0x3ff @ upper clip bound
++ vhsub.u16 d0, d3 @ (top[x] - top_left) >> 1
++A add ip, r0, r3, lsl #1 @ ip = row 1: stride is scaled by 2 bytes per pel
++T add ip, r0, r3
++ vld1.16 {d3[]}, [r2]! @ left[1]
++A lsl r3, #2 @ r3 = two rows in bytes
++T lsl r3, #1
++ vadd.i16 d0, d2
++ vld1.16 {d2[]}, [r2]! @ left[2]
++ vmax.s16 d0, d1 @ must use the zero in d1 before the load below replaces it
++ vld1.16 {d1[]}, [r2] @ left[3]
++ vmin.s16 d0, d4
++ vst1.16 {d0}, [r0 :64], r3
++ vst1.16 {d3}, [ip :64], r3
++ vst1.16 {d2}, [r0 :64]
++ vst1.16 {d1}, [ip :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8x8, 16-bit pels: row y is left[y] replicated, row 0 filtered and clipped to 0..0x3ff
++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {q0}, [r1 :128] @ Top
++ lsl r3, #1 @ stride is scaled by 2 bytes per pel
++ vdup.16 q1, ip
++ mov r1, #8-2 @ rows 1-6 go through the loop, row 7 is stored after it
++ vhsub.u16 q0, q1 @ (top[x] - top_left) >> 1
++ vld1.16 {d2[],d3[]}, [r2]! @ left[0] in every lane
++ vmov.i16 q2, #0 @ lower clip bound
++ vadd.i16 q0, q1
++ vmov.i16 q1, #0x3ff @ upper clip bound
++ vmax.s16 q0, q2
++ vld1.16 {d4[],d5[]}, [r2]! @ left[1]
++ vmin.s16 q0, q1
++ vst1.16 {q0}, [r0 :128], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.16 {q2}, [r0 :128], r3
++ vld1.16 {d4[],d5[]}, [r2]!
++ vst1.16 {q0}, [r0 :128], r3
++ bne 1b
++
++ vst1.16 {q2}, [r0 :128]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16x16, 16-bit pels: row y is left[y] replicated, row 0 filtered and clipped to 0..0x3ff
++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {q0-q1}, [r1 :128] @ Top
++ lsl r3, #1 @ stride is scaled by 2 bytes per pel
++ vdup.16 q2, ip
++ add ip, r0, r3 @ ip starts at row 1 because row 0 is stored whole through r0
++ vhsub.u16 q0, q2 @ (top[x] - top_left) >> 1
++ add ip, #16 @ ip writes the right half of rows 1 onward
++ vhsub.u16 q1, q2
++ mov r1, #16-2 @ rows 1-14 go through the loop, row 15 is stored after it
++ vld1.16 {d4[],d5[]}, [r2]! @ left[0] in every lane
++ vmov.i16 q3, #0 @ lower clip bound
++ vadd.u16 q0, q2
++ vadd.i16 q1, q2
++ vmov.i16 q2, #0x3ff @ upper clip bound
++ vmax.s16 q0, q3
++ vmax.s16 q1, q3
++ vld1.16 {d6[],d7[]}, [r2]! @ left[1]
++ vmin.s16 q0, q2
++ vmin.s16 q1, q2
++ vst1.16 {q0-q1}, [r0 :128], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.16 {q3}, [r0 :128], r3
++ vst1.16 {q3}, [ip :128], r3
++ vld1.16 {d6[],d7[]}, [r2]!
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.16 {q3}, [r0 :128]
++ vst1.16 {q3}, [ip :128]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 32x32, 16-bit pels: row y is left[y] replicated, no edge filter (top is not read)
++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
++ vld1.16 {d0[],d1[]}, [r2]! @ left[0] in every lane
++ add ip, r0, #16 @ r0 writes bytes 0-15 and 32-47 of a row, ip bytes 16-31 and 48-63
++ push {lr} @ lr is borrowed to hold the constant 32
++ mov lr, #32
++ vld1.16 {d2[],d3[]}, [r2]! @ left[1]
++ lsl r3, #1 @ stride is scaled by 2 bytes per pel
++ vst1.16 {q0}, [r0 :128], lr
++ sub r3, #32 @ r3 = row step less the 32 already advanced within the row
++ vst1.16 {q0}, [ip :128], lr
++ mov r1, #32-2 @ rows 1-30 go through the loop, row 31 is stored after it
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.16 {q1}, [r0 :128], lr
++ vst1.16 {q1}, [ip :128], lr
++ vst1.16 {q1}, [r0 :128], r3
++ vst1.16 {q1}, [ip :128], r3
++ vld1.16 {d2[],d3[]}, [r2]!
++ vst1.16 {q0}, [r0 :128], lr
++ vst1.16 {q0}, [ip :128], lr
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.16 {q1}, [r0 :128], lr
++ vst1.16 {q1}, [ip :128], lr
++ vst1.16 {q1}, [r0 :128]
++ vst1.16 {q1}, [ip :128]
++ pop {pc}
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4 rows of 4 four-byte pels: row y is left[y] replicated (top is not read)
++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
++ add r1, r2, #4 @ r1 walks odd left entries, r2 even ones
++ vld1.32 {d0[],d1[]}, [r2] @ left[0]
++ add r2, #8
++ vld1.32 {d2[],d3[]}, [r1] @ left[1]
++ add r1, #8
++ vld1.32 {d4[],d5[]}, [r2] @ left[2]
++A add r2, r0, r3, lsl #2 @ r2 = row 1: stride is scaled by 4 bytes per pel
++T lsl r3, #2
++T add r2, r0, r3 @ same result as the A line, without a shifted operand
++ vld1.32 {d6[],d7[]}, [r1] @ left[3]
++A lsl r3, #3 @ r3 = two rows in bytes
++T lsl r3, #1
++ vst1.32 {q0}, [r0 :128], r3
++ vst1.32 {q1}, [r2 :128], r3
++ vst1.32 {q2}, [r0 :128]
++ vst1.32 {q3}, [r2 :128]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8 rows of 8 four-byte pels: row y is left[y] replicated (top is not read)
++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
++ vld1.32 {d0[],d1[]}, [r2]! @ left[0] in every lane
++ lsl r3, #2 @ stride is scaled by 4 bytes per pel
++ add ip, r0, #16 @ ip writes the right half of each row
++ mov r1, #8-2 @ rows 1-6 go through the loop, row 7 is stored after it
++ vld1.32 {d2[],d3[]}, [r2]! @ left[1]
++ vst1.32 {q0}, [r0 :128], r3
++ vst1.32 {q0}, [ip :128], r3
++1:
++ vld1.32 {d0[],d1[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.32 {q1}, [r0 :128], r3
++ vst1.32 {q1}, [ip :128], r3
++ vld1.32 {d2[],d3[]}, [r2]!
++ vst1.32 {q0}, [r0 :128], r3
++ vst1.32 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.32 {q1}, [r0 :128]
++ vst1.32 {q1}, [ip :128]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16 rows of 16 four-byte pels: row y is left[y] replicated (top is not read)
++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
++ vld1.32 {d0[],d1[]}, [r2]! @ left[0] in every lane
++ add ip, r0, #16 @ r0 writes bytes 0-15 and 32-47 of a row, ip bytes 16-31 and 48-63
++ push {lr} @ lr is borrowed to hold the constant 32
++ mov lr, #32
++ vld1.32 {d2[],d3[]}, [r2]! @ left[1]
++ lsl r3, #2 @ stride is scaled by 4 bytes per pel
++ vst1.32 {q0}, [r0 :128], lr
++ sub r3, #32 @ r3 = row step less the 32 already advanced within the row
++ vst1.32 {q0}, [ip :128], lr
++ mov r1, #16-2 @ rows 1-14 go through the loop, row 15 is stored after it
++ vst1.32 {q0}, [r0 :128], r3
++ vst1.32 {q0}, [ip :128], r3
++1:
++ vld1.32 {d0[],d1[]}, [r2]! @ loads run one row ahead of the stores
++ subs r1, #2
++ vst1.32 {q1}, [r0 :128], lr
++ vst1.32 {q1}, [ip :128], lr
++ vst1.32 {q1}, [r0 :128], r3
++ vst1.32 {q1}, [ip :128], r3
++ vld1.32 {d2[],d3[]}, [r2]!
++ vst1.32 {q0}, [r0 :128], lr
++ vst1.32 {q0}, [ip :128], lr
++ vst1.32 {q0}, [r0 :128], r3
++ vst1.32 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.32 {q1}, [r0 :128], lr
++ vst1.32 {q1}, [ip :128], lr
++ vst1.32 {q1}, [r0 :128]
++ vst1.32 {q1}, [ip :128]
++ pop {pc}
++endfunc
++
++
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
+@@ -0,0 +1,1043 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ Planar intra pred (8.4.4.2.4)
++@
++@ predSamples[ x ][ y ] =
++@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
++@ ( x + 1 ) * p[ nTbS ][ -1 ] +
++@ ( nTbS - 1 - y ) * p[ x ][ -1 ] +
++@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
++
++@ All 10-bit functions would work with 9
++
++
++@ ff_hevc_rpi_pred_planar_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4x4, 8-bit planar pred: two rows are computed per q register
++function ff_hevc_rpi_pred_planar_4_neon_8, export=1
++
++ vld1.8 {d0}, [r1] @ Top
++ adr ip, nb_3_0_1_4
++ vld1.8 {d1}, [r2] @ Left
++ vmov.i64 d2, #0xffffffff
++ vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4}
++ add r1, r0, r3 @ r1 = row 1, r0 = row 0
++ vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3}
++ vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4}
++ vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4}
++ vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0}
++ vshll.u8 q8, d4, #2 @ acc = 4 * t[x]
++ lsl r3, #1 @ each pointer steps two rows
++ vsubl.u8 q2, d5, d4 @ add = l4 - t[x]
++ vmlal.u8 q8, d0, d3 @ acc += (x+1) * t4
++ vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0}
++ vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1}
++ vshl.s16 q9, q2, #1 @ 2 * add
++ vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1}
++ vadd.i16 d16, d4 @ row 0: acc + add
++ vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2}
++ vadd.i16 d17, d18 @ row 1: acc + 2 * add
++ vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3}
++ vadd.i16 q2, q8, q9 @ rows 2 and 3: rows 0 and 1 plus 2 * add
++ vmlal.u8 q8, d0, d6 @ += (3-x) * left[y] for rows 0 and 1
++ vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3}
++ vmlal.u8 q2, d0, d7 @ += (3-x) * left[y] for rows 2 and 3
++ vrshrn.i16 d0, q8, #3 @ rounding shift by 3 and narrow to bytes
++ vst1.32 d0[0], [r0 :32], r3
++ vst1.32 d0[1], [r1 :32], r3
++ vrshrn.i16 d0, q2, #3
++ vst1.32 d0[0], [r0 :32]
++ vst1.32 d0[1], [r1 :32]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 4x4 planar pred on 16-bit pels: one row per d register
++function ff_hevc_rpi_pred_planar_4_neon_10, export=1
++ @ Weights come from the halfword table nbh_3_0_1_4 so they feed the
++ @ 16-bit multiply-accumulates directly with no widening step
++ vld1.16 {q0}, [r1 :64] @ Top
++ adr ip, nbh_3_0_1_4
++ vldr d2, [r2, #8] @ Left (lower)
++ vldr d3, [ip, #8] @ {1,2,3,4}
++T lsl r3, #1
++ vshl.s16 d4, d0, #2 @ 4 * t[x]
++ vdup.16 d1, d1[0] @ {t4,t4,t4,t4}
++ vldr d5, [r2] @ Left (upper)
++ vdup.16 d2, d2[0] @ {l4,l4,l4,l4}
++ vldr d6, [ip] @ {3,2,1,0}
++ vmla.i16 d4, d3, d1 @ Acc set up
++ vsub.i16 d0, d2, d0 @ Add set up
++ vmov d7, d6 @ q3 = {3,2,1,0,3,2,1,0}
++ vdup.16 d2, d5[0] @ left[0]
++ vdup.16 d3, d5[1] @ left[1]
++ vdup.16 d16, d5[2] @ left[2]
++ vadd.i16 d18, d0, d4 @ row 0: acc + add
++ vshl.s16 d0, #1 @ x2
++ vadd.i16 d19, d0, d4 @ row 1: acc + 2 * add
++ vdup.16 d17, d5[3] @ left[3]
++ vadd.i16 d4, d0, d18 @ row 2: row 0 + 2 * add
++A add r1, r0, r3, lsl #1 @ r1 = row 1: stride is scaled by 2 bytes per pel
++T add r1, r0, r3
++ vadd.i16 d5, d0, d19 @ row 3: row 1 + 2 * add
++A lsl r3, #2 @ r3 = two rows in bytes
++T lsl r3, #1
++ vmla.i16 q9, q1, q3 @ rows 0 and 1 += (3-x) * left[y]
++ vmla.i16 q2, q8, q3 @ rows 2 and 3 += (3-x) * left[y]
++ vrshr.u16 q0, q9, #3 @ rounding shift by 3
++ vst1.16 {d0}, [r0], r3
++ vrshr.u16 d2, d4, #3
++ vst1.16 {d1}, [r1], r3
++ vrshr.u16 d3, d5, #3
++ vst1.16 {d2}, [r0]
++ vst1.16 {d3}, [r1]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8x8, 8-bit planar pred: two rows per loop iteration
++function ff_hevc_rpi_pred_planar_8_neon_8, export=1
++
++ vld1.8 {q0}, [r1] @ Top
++ adr ip, nb_7_0_1_8
++ vldr d2, [r2, #8] @ Left (lower)
++ mov r1, #8 @ rows remaining
++ vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8}
++ vshll.u8 q2, d0, #3 @ 8 * t[x]
++ vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8}
++ vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8}
++ vldr d6, [r2] @ Left (upper)
++ vmlal.u8 q2, d3, d1 @ acc += (x+1) * t8
++ vsubl.u8 q0, d2, d0 @ add = l8 - t[x]
++ vldr d7, [ip] @ {7,6,5,4,3,2,1,0}
++
++@ u8 7..0 [1] d7
++@ u8 left[y] [1] d6
++@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1]
++
++ vdup.8 d2, d6[0]
++ vadd.i16 q2, q0
++ vdup.8 d3, d6[1]
++ vadd.i16 q8, q2, q0
++1:
++ vmlal.u8 q2, d7, d2 @ even row += (7-x) * left[y]
++ subs r1, #2
++ vadd.i16 q9, q8, q0 @ acc for the next even row
++ vmlal.u8 q8, d7, d3 @ odd row += (7-x) * left[y+1]
++ vdup.8 d2, d6[2] @ left values for the next pair of rows
++ vdup.8 d3, d6[3]
++ vrshrn.i16 d20, q2, #4 @ rounding shift by 4 and narrow to bytes
++ vshr.u64 d6, #16 @ drop the two left values just consumed
++ vmov q2, q9
++ vst1.8 {d20}, [r0], r3
++ vrshrn.i16 d20, q8, #4
++ vadd.i16 q8, q2, q0 @ acc for the next odd row
++ vst1.8 {d20}, [r0], r3
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 8x8 planar pred on 16-bit pels: two rows per loop iteration, software pipelined
++function ff_hevc_rpi_pred_planar_8_neon_10, export=1
++
++ adr ip, nb_7_0_1_8
++ vld1.16 {q0}, [r1 :128]! @ Top (left)
++ lsl r3, #1 @ stride is scaled by 2 bytes per pel
++ vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8}
++ add ip, r2, #16 @ ip = address of left[8]
++ vld1.16 {d4[],d5[]}, [r1] @ Top (right)
++ mov r1, #8-2 @ three loop passes, the last two rows are stored after the loop
++ vshl.s16 q3, q0, #3 @ 8 * t[x]
++ vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8}
++ vld1.16 {d18[],d19[]}, [ip] @ Left (lower)
++ vmla.i16 q3, q8, q2 @ Acc set up
++ vsub.i16 q0, q9, q0 @ Add set up
++ vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0}
++ vadd.i16 q2, q3, q0
++
++@ u16 7..0 [1] q1
++@ u16 left[y] [1] [r2]
++@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1]
++
++ vld1.16 {d6[],d7[]}, [r2]! @ left[0]
++ vadd.i16 q8, q2, q0
++ vld1.16 {d18[],d19[]}, [r2]! @ left[1]
++ vmla.i16 q2, q1, q3 @ row 0 += (7-x) * left[0]
++ vadd.i16 q3, q8, q0 @ acc for the next even row
++ vmla.i16 q8, q1, q9 @ row 1 += (7-x) * left[1]
++1:
++ vrshr.u16 q9, q2, #4 @ finish the pair computed on the previous pass
++ subs r1, #2
++ vmov q2, q3
++ vrshr.u16 q10, q8, #4
++ vld1.16 {d6[],d7[]}, [r2]!
++ vst1.16 {q9}, [r0 :128], r3
++ vadd.i16 q8, q2, q0
++ vld1.16 {d18[],d19[]}, [r2]!
++ vmla.i16 q2, q1, q3
++ vadd.i16 q3, q8, q0
++ vmla.i16 q8, q1, q9
++ vst1.16 {q10}, [r0 :128], r3
++ bne 1b
++
++ vrshr.u16 q9, q2, #4
++ add r3, r0 @ r3 = address of the final row
++ vrshr.u16 q10, q8, #4
++ vst1.16 {q9}, [r0 :128]
++ vst1.16 {q10}, [r3 :128]
++
++ bx lr
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ Data - has to be in two lumps to ensure we can always reach using adr
++
++ .balign 64
++
++nb_31_0_1_32:
++ .byte 31, 30, 29, 28, 27, 26, 25, 24
++ .byte 23, 22, 21, 20, 19, 18, 17, 16
++nb_15_0_1_16: @ label into the middle of the table above: 15..0 then 1..16
++ .byte 15, 14, 13, 12, 11, 10, 9, 8
++ .byte 7, 6, 5, 4, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 5, 6, 7, 8
++ .byte 9, 10, 11, 12, 13, 14, 15, 16
++ .byte 17, 18, 19, 20, 21, 22, 23, 24
++ .byte 25, 26, 27, 28, 29, 30, 31, 32
++
++ @ should be back on a 64-byte boundary here
++
++ @ These could be extracted from the above array, but are kept separate
++ @ for better (16 byte) alignment
++nb_3_0_1_4:
++ .byte 3, 2, 1, 0, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 1, 2, 3, 4
++nb_7_0_1_8:
++ .byte 7, 6, 5, 4, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 5, 6, 7, 8
++nbh_3_0_1_4: @ halfword version, used as-is by the 16-bit 4x4 planar code
++ .short 3, 2, 1, 0, 1, 2, 3, 4
++
++@------------------------------------------------------------------------------
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16x16, 8-bit planar pred: two rows per loop iteration
++function ff_hevc_rpi_pred_planar_16_neon_8, export=1
++
++ adr ip, nb_15_0_1_16 + 16
++ vld1.8 {q0}, [r1 :128]! @ Top (left)
++ add r2, #16 @ step to left[16]
++ vld1.8 {q1}, [ip: 128] @ {1,2,3...16}
++ vld1.8 {d4[]}, [r1] @ Top (right)
++ sub ip, #16 @ back to the start of the 15..0 weights
++ vshll.u8 q3, d0, #4 @ 16 * t[x], columns 0-7
++ mov r1, #16 @ rows remaining
++ vshll.u8 q8, d1, #4 @ 16 * t[x], columns 8-15
++ vld1.8 {d5[]}, [r2] @ Left (lower)
++ sub r2, #16 @ back to left[0]
++ vmlal.u8 q3, d2, d4
++ vmlal.u8 q8, d3, d4 @ Acc set up
++ vsubl.u8 q1, d5, d0
++ vsubl.u8 q0, d5, d1 @ Add set up
++ vld1.8 {q2}, [ip :128] @ {15,14,13...0}
++
++@ u8 15..0 [1] q2
++@ u8 left[y] [1] [r2]
++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1]
++
++ vadd.i16 q3, q1
++ vadd.i16 q8, q0
++1:
++ vadd.i16 q10, q3, q1 @ acc for the odd row
++ subs r1, #2
++ vld1.8 {d18[]}, [r2]! @ left[y]
++ vadd.i16 q11, q8, q0
++ vld1.8 {d19[]}, [r2]! @ left[y+1]
++ vmlal.u8 q3, d4, d18
++ vmlal.u8 q8, d5, d18
++ vadd.i16 q12, q10, q1 @ acc for the next even row
++ vmlal.u8 q10, d4, d19
++ vadd.i16 q13, q11, q0
++ vmlal.u8 q11, d5, d19
++ vrshrn.u16 d18, q3, #5 @ rounding shift by 5 and narrow to bytes
++ vrshrn.u16 d19, q8, #5
++ vmov q3, q12
++ vst1.8 {q9}, [r0 :128], r3
++ vrshrn.u16 d18, q10, #5
++ vrshrn.u16 d19, q11, #5
++ vmov q8, q13
++ vst1.8 {q9}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ 16x16 planar pred on 16-bit pels: two rows per loop iteration
++function ff_hevc_rpi_pred_planar_16_neon_10, export=1
++
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr ip, nb_15_0_1_16 + 16
++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
++ add r2, #32 @ step to left[16]
++ vld1.8 {q2}, [ip :128] @ {1,2,3...16}
++ lsl r3, #1 @ stride is scaled by 2 bytes per pel
++ vld1.16 {d6[],d7[]}, [r1] @ Top (right)
++ sub ip, #16 @ back to the start of the 15..0 weights
++ vmovl.u8 q8, d4 @ {1..8} widened to u16
++ mov r1, #16 @ rows remaining
++ vshl.i16 q9, q0, #4 @ 16 * t[x], columns 0-7
++ vmovl.u8 q2, d5 @ {9..16} widened to u16
++ vshl.i16 q10, q1, #4 @ 16 * t[x], columns 8-15
++ vld1.16 {d22[],d23[]}, [r2] @ Left (lower)
++ sub r2, #32 @ back to left[0]
++ vld1.8 {q12}, [ip] @ {15,14,13...0}
++ vmla.i16 q9, q8, q3
++ vmla.i16 q10, q2, q3 @ Acc set up
++ vsub.i16 q0, q11, q0
++ vsub.i16 q1, q11, q1 @ Add set up
++ vadd.i16 q2, q9, q0
++ vadd.i16 q3, q10, q1
++ vmovl.u8 q8, d24
++ vmovl.u8 q9, d25
++
++@ u16 15..0 [2] q8,q9
++@ u16 left[y] [2] [r2]
++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1]
++
++1:
++ vadd.i16 q10, q2, q0 @ acc for the odd row
++ subs r1, #2
++ vld1.16 {d24[],d25[]}, [r2]! @ left[y]
++ vadd.i16 q11, q3, q1
++ vld1.16 {d28[],d29[]}, [r2]! @ left[y+1]
++ vmla.i16 q2, q8, q12
++ vmla.i16 q3, q9, q12
++ vadd.i16 q12, q10, q0 @ acc for the next even row
++ vmla.i16 q10, q8, q14
++ vadd.i16 q13, q11, q1
++ vmla.i16 q11, q9, q14
++ vrshr.u16 q14, q2, #5 @ rounding shift by 5
++ vrshr.u16 q15, q3, #5
++ vmov q2, q12
++ vst1.16 {q14-q15}, [r0 :128], r3
++ vrshr.u16 q14, q10, #5
++ vrshr.u16 q15, q11, #5
++ vmov q3, q13
++ vst1.16 {q14-q15}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ Planar prediction: writes 32 rows of 32 x u8 to _src (software pipelined loop)
++function ff_hevc_rpi_pred_planar_32_neon_8, export=1
++
++ vld1.8 {q0-q1}, [r1 :128]! @ Top (left)
++ adr ip, nb_31_0_1_32 + 32
++ vpush {d8-d12} @ d8-d12 are used as working registers below
++ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32}
++ add r2, #32 @ r2 -> left[32]
++ vld1.8 {d8[]}, [r1] @ Top (right)
++ sub ip, #32
++ vshll.u8 q8, d0, #5 @ 32 * top[0..7]
++ mov r1, #32 @ Row counter
++ vld1.8 {d9[]}, [r2] @ Left (lower)
++ sub r2, #32 @ r2 -> left[0] again
++ vshll.u8 q9, d1, #5
++ vshll.u8 q10, d2, #5
++ vshll.u8 q11, d3, #5
++ vmlal.u8 q8, d4, d8
++ vsubl.u8 q12, d9, d0
++ vmlal.u8 q9, d5, d8
++ vsubl.u8 q13, d9, d1
++ vmlal.u8 q10, d6, d8
++ vsubl.u8 q14, d9, d2
++ vmlal.u8 q11, d7, d8 @ Acc set up
++ vsubl.u8 q15, d9, d3 @ Add set up
++ vadd.i16 q8, q12
++ vadd.i16 q9, q13
++ vadd.i16 q10, q14
++ vadd.i16 q11, q15
++ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0}
++
++@ u8 31..0 [2] q4,q5
++@ u8 left[y] [2] [r2]
++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1]
++
++ vld1.8 {d12[]}, [r2]! @ left[0] in all lanes
++ vadd.i16 q0, q8, q12
++ b 2f @ Nothing to store yet on the first pass
++1:
++ vld1.8 {d12[]}, [r2]!
++ vrshrn.u16 d3, q1, #6 @ Narrow the odd row left in q0-q3 by the previous pass
++ vrshrn.u16 d2, q0, #6
++ vadd.i16 q0, q8, q12
++ vrshrn.u16 d4, q2, #6
++ vrshrn.u16 d5, q3, #6
++ vst1.8 {q1-q2}, [r0 :128], r3
++2: vadd.i16 q1, q9, q13
++ subs r1, #2 @ Two rows per pass
++ vadd.i16 q2, q10, q14
++ vadd.i16 q3, q11, q15
++ vmlal.u8 q8, d8, d12
++ vmlal.u8 q9, d9, d12
++ vmlal.u8 q10, d10, d12
++ vmlal.u8 q11, d11, d12
++ vld1.8 {d12[]}, [r2]! @ left value for the odd row
++ vrshrn.u16 d19, q9, #6 @ Narrow the even row in place into q9-q10
++ vrshrn.u16 d18, q8, #6
++ vadd.i16 q8, q0, q12
++ vrshrn.u16 d20, q10, #6
++ vrshrn.u16 d21, q11, #6
++ vst1.8 {q9-q10}, [r0 :128], r3
++ vadd.i16 q9, q1, q13
++ vadd.i16 q10, q2, q14
++ vadd.i16 q11, q3, q15
++ vmlal.u8 q0, d8, d12
++ vmlal.u8 q1, d9, d12
++ vmlal.u8 q2, d10, d12
++ vmlal.u8 q3, d11, d12
++
++ bne 1b
++
++ vpop {d8-d12}
++
++ vrshrn.u16 d3, q1, #6 @ Final row, still held in q0-q3
++ vrshrn.u16 d2, q0, #6
++ vrshrn.u16 d4, q2, #6
++ vrshrn.u16 d5, q3, #6
++ vst1.8 {q1-q2}, [r0 :128]
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ Planar prediction: writes 32 rows of 32 x u16 to _src, two rows per loop pass
++function ff_hevc_rpi_pred_planar_32_neon_10, export=1
++
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
++ adr ip, nb_31_0_1_32 + 32
++ vpush {q4-q7} @ q4-q7 are used as working registers below
++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre)
++ add r2, #64 @ r2 -> left[32] (2 bytes per sample)
++ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32}
++T lsl r3, #1 @ Thumb build only: the ARM build scales r3 in the adds below
++ vld1.16 {d8[],d9[]}, [r1] @ Top (right)
++ sub ip, #32
++ vmovl.u8 q12, d28
++ mov r1, #32 @ Row counter
++ vmovl.u8 q13, d29
++ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0}
++ vmovl.u8 q14, d30
++ vmovl.u8 q15, d31
++ vld1.16 {d10[],d11[]}, [r2] @ Left (lower)
++ sub r2, #64 @ r2 -> left[0] again
++ vshl.i16 q8, q0, #5 @ 32 * top
++ vshl.i16 q9, q1, #5
++ vshl.i16 q10, q2, #5
++ vshl.i16 q11, q3, #5
++ vmla.i16 q8, q12, q4
++ vsub.i16 q0, q5, q0
++ vmla.i16 q9, q13, q4
++ vsub.i16 q1, q5, q1
++ vmla.i16 q10, q14, q4
++ vmov.u16 ip, d0[0] @ Keep add lane 0: d0[0] is reused for left[y] in the loop
++ vsub.i16 q2, q5, q2
++ vmla.i16 q11, q15, q4 @ Acc set up
++ vsub.i16 q3, q5, q3 @ Add set up
++ vadd.i16 q8, q0
++ vadd.i16 q9, q1
++ vadd.i16 q10, q2
++ vadd.i16 q11, q3
++ vmovl.u8 q4, d12
++ vmovl.u8 q5, d13
++ vmovl.u8 q6, d14
++ vmovl.u8 q7, d15
++
++@ u16 31..0 [4] q4-q7
++@ u16 left[y] [4] [r2]
++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1]
++
++ vadd.i16 q12, q8, q0
++A sub r0, r0, r3, lsl #1 @ Pre-bias r0 by one row: the loop adds the stride before each store
++T sub r0, r3
++1:
++ vld1.16 {d0[0]}, [r2]! @ left[y] into lane 0 (overwrites add lane 0)
++A add r0, r0, r3, lsl #1
++T add r0, r3
++ vadd.i16 q13, q9, q1
++ subs r1, #2 @ Two rows per pass
++ vadd.i16 q14, q10, q2
++ vadd.i16 q15, q11, q3
++ vmla.i16 q8, q4, d0[0]
++ vmla.i16 q9, q5, d0[0]
++ vmla.i16 q10, q6, d0[0]
++ vmla.i16 q11, q7, d0[0]
++ vmov.16 d0[0], ip @ Restore add lane 0
++ vrshr.u16 q8, #6
++ vrshr.u16 q9, #6
++ vrshr.u16 q10, #6
++ vrshr.u16 q11, #6
++ vstm r0, {q8-q11} @ Even row: 64 bytes
++ vadd.i16 q8, q12, q0
++A add r0, r0, r3, lsl #1
++T add r0, r3
++ vld1.16 {d0[0]}, [r2]! @ left[y+1] into lane 0
++ vadd.i16 q9, q13, q1
++ vadd.i16 q10, q14, q2
++ vadd.i16 q11, q15, q3
++ vmla.i16 q12, q4, d0[0]
++ vmla.i16 q13, q5, d0[0]
++ vmla.i16 q14, q6, d0[0]
++ vmla.i16 q15, q7, d0[0]
++ vmov.16 d0[0], ip @ Restore add lane 0
++ vrshr.u16 q12, #6
++ vrshr.u16 q13, #6
++ vrshr.u16 q14, #6
++ vrshr.u16 q15, #6
++ vstm r0, {q12-q15} @ Odd row: 64 bytes
++ vadd.i16 q12, q8, q0
++ bne 1b
++
++ vpop {q4-q7}
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ Planar prediction: writes 4 rows of 4 pels, 2 x u8 per pel (presumably interleaved U,V)
++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1
++
++ vld1.8 {q0}, [r1] @ Top
++ adr ip, nbx2_3_0_1_4
++ vldr d2, [r2, #8] @ Left (lower)
++ mov r1, #4 @ Row counter
++ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4}
++ lsl r3, #1 @ Stride * 2 - presumably pels -> bytes
++ vshll.u8 q2, d0, #2 @ 4 * top
++ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4}
++ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4}
++ vldr d6, [r2] @ Left (upper)
++ vmlal.u8 q2, d3, d1
++ vsubl.u8 q0, d2, d0
++ vldr d7, [ip] @ {3,3,2,2,1,1,0,0}
++
++@ u8 3..0 [1] d7
++@ u8 left[y] [1] d6
++@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
++@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1]
++
++ vdup.16 d2, d6[0] @ left[0] pel in all lanes
++ vadd.i16 q2, q0
++ vdup.16 d3, d6[1] @ left[1] pel in all lanes
++ vadd.i16 q8, q2, q0
++1:
++ vmlal.u8 q2, d7, d2
++ subs r1, #2 @ Two rows per pass
++ vadd.i16 q9, q8, q0
++ vmlal.u8 q8, d7, d3
++ vdup.16 d2, d6[2] @ left[2] and left[3] for the second pass
++ vdup.16 d3, d6[3]
++ vrshrn.i16 d20, q2, #3 @ Rounding shift right by 3, narrow to u8
++ vmov q2, q9
++ vst1.8 {d20}, [r0], r3
++ vrshrn.i16 d20, q8, #3
++ vadd.i16 q8, q2, q0
++ vst1.8 {d20}, [r0], r3
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ Planar prediction: writes 4 rows of 4 pels, 2 x u16 per pel (presumably interleaved U,V); fully unrolled
++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1
++
++ adr ip, nbx2_3_0_1_4
++ vld1.16 {q0}, [r1 :128]! @ Top (left)
++ lsl r3, #2 @ Stride * 4 - presumably pels -> bytes
++ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4}
++ add ip, r2, #16 @ ip -> left[4] (4 bytes per pel)
++ vld1.32 {d4[],d5[]}, [r1] @ Top (right)
++ vshl.s16 q3, q0, #2 @ 4 * top
++ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4}
++ vld1.32 {d18[],d19[]}, [ip] @ Left (lower)
++ vmla.i16 q3, q8, q2 @ Acc set up
++ vsub.i16 q0, q9, q0 @ Add set up
++ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0}
++ vadd.i16 q2, q3, q0
++
++@ u16 3..0 [1] q1
++@ u32 left[y] [1] [r2]
++@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
++@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1]
++
++ vld1.32 {d6[],d7[]}, [r2]! @ left[0] pel in all lanes
++ vadd.i16 q8, q2, q0
++ vld1.32 {d18[],d19[]}, [r2]! @ left[1] pel in all lanes
++ vmla.i16 q2, q1, q3
++ vadd.i16 q3, q8, q0
++ vmla.i16 q8, q1, q9
++
++ vrshr.u16 q9, q2, #3 @ Row 0
++ vmov q2, q3
++ vrshr.u16 q10, q8, #3 @ Row 1
++ vld1.32 {d6[],d7[]}, [r2]! @ left[2] pel in all lanes
++ vst1.16 {q9}, [r0 :128], r3
++ vadd.i16 q8, q2, q0
++ vld1.32 {d18[],d19[]}, [r2]! @ left[3] pel in all lanes
++ vmla.i16 q2, q1, q3
++ vadd.i16 q3, q8, q0
++ vmla.i16 q8, q1, q9
++ vst1.16 {q10}, [r0 :128], r3
++
++ vrshr.u16 q9, q2, #3 @ Row 2
++ add r3, r0 @ r3 -> row 3
++ vrshr.u16 q10, q8, #3 @ Row 3
++ vst1.16 {q9}, [r0 :128]
++ vst1.16 {q10}, [r3 :128]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ Planar prediction: writes 8 rows of 8 pels, 2 x u8 per pel (presumably interleaved U,V)
++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1
++
++ adr ip, nbx2_7_0_1_8 + 16
++ vld1.8 {q0}, [r1 :128]! @ Top (left)
++ add r2, #16 @ r2 -> left[8] (2 bytes per pel)
++ vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8}
++ lsl r3, #1 @ Stride * 2 - presumably pels -> bytes
++ vld1.16 {d4[]}, [r1] @ Top (right)
++ sub ip, #16
++ vshll.u8 q3, d0, #3 @ 8 * top
++ mov r1, #8 @ Row counter
++ vshll.u8 q8, d1, #3
++ vld1.16 {d5[]}, [r2] @ Left (lower)
++ sub r2, #16 @ r2 -> left[0] again
++ vmlal.u8 q3, d2, d4
++ vmlal.u8 q8, d3, d4 @ Acc set up
++ vsubl.u8 q1, d5, d0
++ vsubl.u8 q0, d5, d1 @ Add set up
++ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0}
++
++@ u8 7..0 [1] q2
++@ u16 left[y] [1] [r2] (one 2 x u8 pel)
++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1]
++
++ vadd.i16 q3, q1
++ vadd.i16 q8, q0
++1:
++ vadd.i16 q10, q3, q1 @ Acc for the odd row = acc + add
++ subs r1, #2 @ Two rows per pass
++ vld1.16 {d18[]}, [r2]! @ left[y] pel in all lanes
++ vadd.i16 q11, q8, q0
++ vld1.16 {d19[]}, [r2]! @ left[y+1] pel in all lanes
++ vmlal.u8 q3, d4, d18
++ vmlal.u8 q8, d5, d18
++ vadd.i16 q12, q10, q1 @ Acc for the next even row
++ vmlal.u8 q10, d4, d19
++ vadd.i16 q13, q11, q0
++ vmlal.u8 q11, d5, d19
++ vrshrn.u16 d18, q3, #4 @ Rounding shift right by 4, narrow to u8
++ vrshrn.u16 d19, q8, #4
++ vmov q3, q12
++ vst1.8 {q9}, [r0 :128], r3
++ vrshrn.u16 d18, q10, #4
++ vrshrn.u16 d19, q11, #4
++ vmov q8, q13
++ vst1.8 {q9}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ Data - has to be in two lumps to ensure we can always reach using adr
++
++ .balign 64
++
++nbx2_15_0_1_16: @ u8, each value twice: {15..0} then {1..16}; +32 is the start of {1..}
++ .byte 15, 15, 14, 14, 13, 13, 12, 12
++ .byte 11, 11, 10, 10, 9, 9, 8, 8
++nbx2_7_0_1_8: @ Overlaps the table above: {7..0} then {1..8}; +16 is the start of {1..}
++ .byte 7, 7, 6, 6, 5, 5, 4, 4
++ .byte 3, 3, 2, 2, 1, 1, 0, 0
++ .byte 1, 1, 2, 2, 3, 3, 4, 4
++ .byte 5, 5, 6, 6, 7, 7, 8, 8
++ .byte 9, 9, 10, 10, 11, 11, 12, 12
++ .byte 13, 13, 14, 14, 15, 15, 16, 16
++
++ @ should be back on a 64-byte boundary here (64 bytes emitted since .balign)
++
++nbx2_3_0_1_4: @ u8, each value twice: {3..0} then {1..4}
++ .byte 3, 3, 2, 2, 1, 1, 0, 0
++ .byte 1, 1, 2, 2, 3, 3, 4, 4
++
++@------------------------------------------------------------------------------
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ Planar prediction: writes 8 rows of 8 pels, 2 x u16 per pel (presumably interleaved U,V)
++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1
++
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr ip, nbx2_7_0_1_8 + 16
++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
++ add r2, #32 @ r2 -> left[8] (4 bytes per pel)
++ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8}
++ lsl r3, #2 @ Stride * 4 - presumably pels -> bytes
++ vld1.32 {d6[],d7[]}, [r1] @ Top (right)
++ sub ip, #16
++ vmovl.u8 q8, d4
++ mov r1, #8 @ Row counter
++ vshl.i16 q9, q0, #3 @ 8 * top
++ vmovl.u8 q2, d5
++ vshl.i16 q10, q1, #3
++ vld1.32 {d22[],d23[]}, [r2] @ Left (lower)
++ sub r2, #32 @ r2 -> left[0] again
++ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0}
++ vmla.i16 q9, q8, q3
++ vmla.i16 q10, q2, q3 @ Acc set up
++ vsub.i16 q0, q11, q0
++ vsub.i16 q1, q11, q1 @ Add set up
++ vadd.i16 q2, q9, q0
++ vadd.i16 q3, q10, q1
++ vmovl.u8 q8, d24
++ vmovl.u8 q9, d25
++
++@ u16 7..0 [2] q8,q9
++@ u32 left[y] [2] [r2]
++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] + add initially
++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1]
++
++1:
++ vadd.i16 q10, q2, q0 @ Acc for the odd row = acc + add
++ subs r1, #2 @ Two rows per pass
++ vld1.32 {d24[],d25[]}, [r2]! @ left[y] pel in all lanes
++ vadd.i16 q11, q3, q1
++ vld1.32 {d28[],d29[]}, [r2]! @ left[y+1] pel in all lanes
++ vmla.i16 q2, q8, q12
++ vmla.i16 q3, q9, q12
++ vadd.i16 q12, q10, q0 @ Acc for the next even row
++ vmla.i16 q10, q8, q14
++ vadd.i16 q13, q11, q1
++ vmla.i16 q11, q9, q14
++ vrshr.u16 q14, q2, #4 @ Rounding shift right by 4
++ vrshr.u16 q15, q3, #4
++ vmov q2, q12
++ vst1.16 {q14-q15}, [r0 :128], r3
++ vrshr.u16 q14, q10, #4
++ vrshr.u16 q15, q11, #4
++ vmov q3, q13
++ vst1.16 {q14-q15}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ Planar prediction: writes 16 rows of 16 pels, 2 x u8 per pel (presumably interleaved U,V)
++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1
++
++ vld1.8 {q0-q1}, [r1 :128]! @ Top (left)
++ adr ip, nbx2_15_0_1_16 + 32
++ vpush {d8-d12} @ d8-d12 are used as working registers below
++ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16}
++ add r2, #32 @ r2 -> left[16] (2 bytes per pel)
++ vld1.16 {d8[]}, [r1] @ Top (right)
++ sub ip, #32
++ vshll.u8 q8, d0, #4 @ 16 * top
++ mov r1, #16 @ Row counter
++ vld1.16 {d9[]}, [r2] @ Left (lower)
++ sub r2, #32 @ r2 -> left[0] again
++ vshll.u8 q9, d1, #4
++ lsl r3, #1 @ Stride * 2 - presumably pels -> bytes
++ vshll.u8 q10, d2, #4
++ vshll.u8 q11, d3, #4
++ vmlal.u8 q8, d4, d8
++ vsubl.u8 q12, d9, d0
++ vmlal.u8 q9, d5, d8
++ vsubl.u8 q13, d9, d1
++ vmlal.u8 q10, d6, d8
++ vsubl.u8 q14, d9, d2
++ vmlal.u8 q11, d7, d8 @ Acc set up
++ vsubl.u8 q15, d9, d3 @ Add set up
++ vadd.i16 q8, q12
++ vadd.i16 q9, q13
++ vadd.i16 q10, q14
++ vadd.i16 q11, q15
++ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0}
++
++@ u8 15..0 [2] q4,q5
++@ u16 left[y] [2] [r2] (one 2 x u8 pel)
++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1]
++
++ vld1.16 {d12[]}, [r2]! @ left[0] pel in all lanes
++ vadd.i16 q0, q8, q12
++ b 2f @ Nothing to store yet on the first pass
++1:
++ vld1.16 {d12[]}, [r2]!
++ vrshrn.u16 d3, q1, #5 @ Narrow the odd row left in q0-q3 by the previous pass
++ vrshrn.u16 d2, q0, #5
++ vadd.i16 q0, q8, q12
++ vrshrn.u16 d4, q2, #5
++ vrshrn.u16 d5, q3, #5
++ vst1.8 {q1-q2}, [r0 :128], r3
++2: vadd.i16 q1, q9, q13
++ subs r1, #2 @ Two rows per pass
++ vadd.i16 q2, q10, q14
++ vadd.i16 q3, q11, q15
++ vmlal.u8 q8, d8, d12
++ vmlal.u8 q9, d9, d12
++ vmlal.u8 q10, d10, d12
++ vmlal.u8 q11, d11, d12
++ vld1.16 {d12[]}, [r2]! @ left pel for the odd row
++ vrshrn.u16 d19, q9, #5 @ Narrow the even row in place into q9-q10
++ vrshrn.u16 d18, q8, #5
++ vadd.i16 q8, q0, q12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vst1.8 {q9-q10}, [r0 :128], r3
++ vadd.i16 q9, q1, q13
++ vadd.i16 q10, q2, q14
++ vadd.i16 q11, q3, q15
++ vmlal.u8 q0, d8, d12
++ vmlal.u8 q1, d9, d12
++ vmlal.u8 q2, d10, d12
++ vmlal.u8 q3, d11, d12
++
++ bne 1b
++
++ vpop {d8-d12}
++
++ vrshrn.u16 d3, q1, #5 @ Final row, still held in q0-q3
++ vrshrn.u16 d2, q0, #5
++ vrshrn.u16 d4, q2, #5
++ vrshrn.u16 d5, q3, #5
++ vst1.8 {q1-q2}, [r0 :128]
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++@ Planar prediction: writes 16 rows of 16 pels, 2 x u16 per pel (presumably interleaved U,V)
++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1
++
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
++ adr ip, nbx2_15_0_1_16 + 32
++ vpush {q4-q7} @ q4-q7 are used as working registers below
++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre)
++ add r2, #64 @ r2 -> left[16] (4 bytes per pel)
++ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16}
++T lsl r3, #2 @ Thumb build only: the ARM build scales r3 in the adds below
++ vld1.32 {d8[],d9[]}, [r1] @ Top (right)
++ sub ip, #32
++ vmovl.u8 q12, d28
++ mov r1, #16 @ Row counter
++ vmovl.u8 q13, d29
++ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0}
++ vmovl.u8 q14, d30
++ vmovl.u8 q15, d31
++ vld1.32 {d10[],d11[]}, [r2] @ Left (lower)
++ sub r2, #64 @ r2 -> left[0] again
++ vshl.i16 q8, q0, #4 @ 16 * top
++ vshl.i16 q9, q1, #4
++ vshl.i16 q10, q2, #4
++ vshl.i16 q11, q3, #4
++ vmla.i16 q8, q12, q4
++ vsub.i16 q0, q5, q0
++ vmla.i16 q9, q13, q4
++ vpush {q0} @ Keep a copy of add q0: q0 is reused for left[y] in the loop
++ vsub.i16 q1, q5, q1
++ vmla.i16 q10, q14, q4
++ vsub.i16 q2, q5, q2
++ vmla.i16 q11, q15, q4 @ Acc set up
++ vsub.i16 q3, q5, q3 @ Add set up
++ vadd.i16 q8, q0
++ vadd.i16 q9, q1
++ vadd.i16 q10, q2
++ vadd.i16 q11, q3
++ vmovl.u8 q4, d12
++ vmovl.u8 q5, d13
++ vmovl.u8 q6, d14
++ vmovl.u8 q7, d15
++
++@ u16 15..0 [4] q4-q7 (each value twice)
++@ u32 left[y] [4] [r2]
++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] + add initially
++@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] (q0 also saved at [sp])
++
++ vadd.i16 q12, q8, q0
++A sub r0, r0, r3, lsl #2 @ Pre-bias r0 by one row: the loop adds the stride before each store
++T sub r0, r3
++1:
++ vld1.32 {d0[],d1[]}, [r2]! @ left[y] pel in all lanes (overwrites add q0)
++A add r0, r0, r3, lsl #2
++T add r0, r3
++ vadd.i16 q13, q9, q1
++ subs r1, #2 @ Two rows per pass
++ vadd.i16 q14, q10, q2
++ vadd.i16 q15, q11, q3
++ vmla.i16 q8, q4, q0
++ vmla.i16 q9, q5, q0
++ vmla.i16 q10, q6, q0
++ vmla.i16 q11, q7, q0
++ vld1.16 {q0}, [sp] @ Reload add q0 from the stack copy
++ vrshr.u16 q8, #5
++ vrshr.u16 q9, #5
++ vrshr.u16 q10, #5
++ vrshr.u16 q11, #5
++ vstm r0, {q8-q11} @ Even row: 64 bytes
++ vadd.i16 q8, q12, q0
++A add r0, r0, r3, lsl #2
++T add r0, r3
++ vld1.32 {d0[],d1[]}, [r2]! @ left[y+1] pel in all lanes
++ vadd.i16 q9, q13, q1
++ vadd.i16 q10, q14, q2
++ vadd.i16 q11, q15, q3
++ vmla.i16 q12, q4, q0
++ vmla.i16 q13, q5, q0
++ vmla.i16 q14, q6, q0
++ vmla.i16 q15, q7, q0
++ vld1.16 {q0}, [sp] @ Reload add q0 from the stack copy
++ vrshr.u16 q12, #5
++ vrshr.u16 q13, #5
++ vrshr.u16 q14, #5
++ vrshr.u16 q15, #5
++ vstm r0, {q12-q15} @ Odd row: 64 bytes
++ vadd.i16 q12, q8, q0
++ bne 1b
++
++ vpop {q3-q7} @ q3 takes (and discards) the pushed q0 copy; q4-q7 are restored
++ bx lr
++
++endfunc
+--- a/libavcodec/arm/vc1dsp_init_neon.c
++++ b/libavcodec/arm/vc1dsp_init_neon.c
+@@ -19,6 +19,7 @@
+ #include <stdint.h>
+
+ #include "libavutil/attributes.h"
++#include "libavutil/intreadwrite.h"
+ #include "libavcodec/vc1dsp.h"
+ #include "vc1dsp.h"
+
+@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_
+ void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
++
+ void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int rnd);
+
+@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
++
++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) /* copies src to dst dropping escape bytes; returns bytes written */
++{
++ /* Dealing with starting and stopping, and removing escape bytes, are
++ * comparatively less time-sensitive, so are more clearly expressed using
++ * a C wrapper around the assembly inner loop. Note that we assume a
++ * little-endian machine that supports unaligned loads. */
++ int dsize = 0; /* number of bytes written to dst so far */
++ while (size >= 4)
++ {
++ int found = 0; /* set once src points at an escape sequence */
++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* byte-wise until dst is 8-byte aligned */
++ {
++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* bytes 00 00 03 then 00..03, assuming AV_RL32 is a little-endian 32-bit read */
++ if (!found)
++ {
++ *dst++ = *src++;
++ --size;
++ ++dsize;
++ }
++ }
++ if (!found)
++ {
++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* presumably the helper returns the size left unprocessed - confirm against the asm */
++ dst += skip;
++ src += skip;
++ size -= skip;
++ dsize += skip;
++ while (!found && size >= 4) /* byte-wise from where the helper stopped */
++ {
++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
++ if (!found)
++ {
++ *dst++ = *src++;
++ --size;
++ ++dsize;
++ }
++ }
++ }
++ if (found)
++ {
++ *dst++ = *src++; /* keep the two leading 00 bytes */
++ *dst++ = *src++;
++ ++src; /* drop the 03 escape byte */
++ size -= 3;
++ dsize += 2;
++ }
++ }
++ while (size > 0) /* fewer than 4 bytes left: no room for an escape sequence, copy verbatim */
++ {
++ *dst++ = *src++;
++ --size;
++ ++dsize;
++ }
++ return dsize;
++}
++
+ #define FN_ASSIGN(X, Y) \
+ dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+ dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
+ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
+ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
+
++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;
++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon;
++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
++
+ dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
+ FN_ASSIGN(1, 0);
+ FN_ASSIGN(2, 0);
+@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
++
++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
+ }
+--- a/libavcodec/arm/vc1dsp_neon.S
++++ b/libavcodec/arm/vc1dsp_neon.S
+@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, e
+ vst1.32 {d1[1]}, [r0,:32]
+ bx lr
+ endfunc
++
++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of lower block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter4_neon, export=1
++ sub r3, r0, r1, lsl #2 @ r3 -> row P1, four rows above r0
++ vldr d0, .Lcoeffs @ 16-bit lanes: d0[0] = 2, d0[1] = 5
++ vld1.32 {d1[0]}, [r0], r1 @ P5
++ vld1.32 {d2[0]}, [r3], r1 @ P1
++ vld1.32 {d3[0]}, [r3], r1 @ P2
++ vld1.32 {d4[0]}, [r0], r1 @ P6
++ vld1.32 {d5[0]}, [r3], r1 @ P3
++ vld1.32 {d6[0]}, [r0], r1 @ P7
++ vld1.32 {d7[0]}, [r3] @ P4
++ vld1.32 {d16[0]}, [r0] @ P8
++ vshll.u8 q9, d1, #1 @ 2*P5
++ vdup.16 d17, r2 @ pq
++ vshll.u8 q10, d2, #1 @ 2*P1
++ vmovl.u8 q11, d3 @ P2
++ vmovl.u8 q1, d4 @ P6
++ vmovl.u8 q12, d5 @ P3
++ vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2
++ vmovl.u8 q11, d6 @ P7
++ vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6
++ vshll.u8 q2, d5, #1 @ 2*P3
++ vmovl.u8 q3, d7 @ P4
++ vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7
++ vmovl.u8 q11, d16 @ P8
++ vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3
++ vmovl.u8 q12, d1 @ P5
++ vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4
++ vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8
++ vsub.i16 d1, d6, d24 @ P4-P5
++ vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4
++ vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5
++ vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6
++ vabs.s16 d2, d1 @ |P4-P5|
++ vrshr.s16 d3, d18, #3
++ vrshr.s16 d5, d20, #3
++ vshr.s16 d2, d2, #1 @ clip
++ vrshr.s16 d4, d4, #3
++ vabs.s16 d3, d3 @ a2
++ vshr.s16 d1, d1, #8 @ clip_sign
++ vabs.s16 d5, d5 @ a1
++ vceq.i16 d7, d2, #0 @ test clip == 0
++ vabs.s16 d16, d4 @ a0
++ vshr.s16 d4, d4, #8 @ a0_sign
++ vcge.s16 d18, d5, d3 @ test a1 >= a2
++ vcge.s16 d17, d16, d17 @ test a0 >= pq
++ vbsl d18, d3, d5 @ a3
++ vsub.i16 d1, d1, d4 @ clip_sign - a0_sign
++ vorr d3, d7, d17 @ test clip == 0 || a0 >= pq
++ vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 d5, d18, d16 @ test a3 >= a0
++ vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
++ vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0
++ vmov.32 r0, d4[1] @ move to gp reg (16-bit lanes 2 and 3)
++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ vcge.s16 d4, d0, d2 @ test d >= clip
++ tst r0, #1 @ bit 0 comes from lane 2, i.e. the third pixel pair
++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered
++ vbsl d4, d2, d0 @ FFMIN(d, clip)
++ vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ vqmovun.s16 d0, q3
++ vqmovun.s16 d1, q12
++ vst1.32 {d0[0]}, [r3], r1 @ r3 was left pointing at row P4 by the loads above
++ vst1.32 {d1[0]}, [r3]
++1: bx lr
++endfunc
++
++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of right block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter4_neon, export=1
++ sub r3, r0, #4 @ where to start reading
++ vldr d0, .Lcoeffs @ 16-bit lanes: d0[0] = 2, d0[1] = 5
++ vld1.32 {d2}, [r3], r1 @ row 0, 8 pels
++ sub r0, r0, #1 @ where to start writing
++ vld1.32 {d4}, [r3], r1 @ row 1
++ vld1.32 {d3}, [r3], r1 @ row 2
++ vld1.32 {d5}, [r3] @ row 3
++ vdup.16 d1, r2 @ pq
++ vtrn.8 q1, q2
++ vtrn.16 d2, d3 @ P1, P5, P3, P7
++ vtrn.16 d4, d5 @ P2, P6, P4, P8
++ vshll.u8 q3, d2, #1 @ 2*P1, 2*P5
++ vmovl.u8 q8, d4 @ P2, P6
++ vmovl.u8 q9, d3 @ P3, P7
++ vmovl.u8 q2, d5 @ P4, P8
++ vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6
++ vshll.u8 q10, d3, #1 @ 2*P3, 2*P7
++ vmovl.u8 q1, d2 @ P1, P5
++ vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
++ vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
++ vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later
++ vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4
++ vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5
++ vsub.i16 d3, d4, d2 @ P4-P5
++ vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6
++ vrshr.s16 q3, q3, #3
++ vabs.s16 d5, d3 @ |P4-P5|
++ vshr.s16 d3, d3, #8 @ clip_sign
++ vrshr.s16 d16, d20, #3
++ vabs.s16 q3, q3 @ a1, a2
++ vshr.s16 d5, d5, #1 @ clip
++ vabs.s16 d17, d16 @ a0
++ vceq.i16 d18, d5, #0 @ test clip == 0
++ vshr.s16 d16, d16, #8 @ a0_sign
++ vcge.s16 d19, d6, d7 @ test a1 >= a2
++ vcge.s16 d1, d17, d1 @ test a0 >= pq
++ vsub.i16 d16, d3, d16 @ clip_sign - a0_sign
++ vbsl d19, d7, d6 @ a3
++ vorr d1, d18, d1 @ test clip == 0 || a0 >= pq
++ vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 d6, d19, d17 @ test a3 >= a0
++ vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
++ vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0
++ vmov.32 r2, d3[1] @ move to gp reg (16-bit lanes 2 and 3)
++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ vcge.s16 d3, d0, d5 @ test d >= clip
++ tst r2, #1 @ bit 0 comes from lane 2, i.e. the third pixel pair
++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered
++ vbsl d3, d5, d0 @ FFMIN(d, clip)
++ vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ vqmovun.s16 d1, q1
++ vqmovun.s16 d0, q2
++ vst2.8 {d0[0], d1[0]}, [r0], r1 @ P4, P5 of row 0
++ vst2.8 {d0[1], d1[1]}, [r0], r1
++ vst2.8 {d0[2], d1[2]}, [r0], r1
++ vst2.8 {d0[3], d1[3]}, [r0]
++1: bx lr
++endfunc
++
++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of lower block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter8_neon, export=1
++ sub r3, r0, r1, lsl #2 @ r3 -> row P1, four rows above r0
++ vldr d0, .Lcoeffs @ 16-bit lanes: d0[0] = 2, d0[1] = 5
++ vld1.32 {d1}, [r0 :64], r1 @ P5
++ vld1.32 {d2}, [r3 :64], r1 @ P1
++ vld1.32 {d3}, [r3 :64], r1 @ P2
++ vld1.32 {d4}, [r0 :64], r1 @ P6
++ vld1.32 {d5}, [r3 :64], r1 @ P3
++ vld1.32 {d6}, [r0 :64], r1 @ P7
++ vshll.u8 q8, d1, #1 @ 2*P5
++ vshll.u8 q9, d2, #1 @ 2*P1
++ vld1.32 {d7}, [r3 :64] @ P4
++ vmovl.u8 q1, d3 @ P2
++ vld1.32 {d20}, [r0 :64] @ P8
++ vmovl.u8 q11, d4 @ P6
++ vdup.16 q12, r2 @ pq
++ vmovl.u8 q13, d5 @ P3
++ vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2
++ vmovl.u8 q1, d6 @ P7
++ vshll.u8 q2, d5, #1 @ 2*P3
++ vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6
++ vmovl.u8 q3, d7 @ P4
++ vmovl.u8 q10, d20 @ P8
++ vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7
++ vmovl.u8 q1, d1 @ P5
++ vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3
++ vsub.i16 q13, q3, q1 @ P4-P5
++ vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4
++ vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8
++ vabs.s16 q10, q13 @ |P4-P5|
++ vshr.s16 q13, q13, #8 @ clip_sign
++ vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4
++ vshr.s16 q10, q10, #1 @ clip
++ vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5
++ vrshr.s16 q8, q8, #3
++ vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6
++ vceq.i16 q11, q10, #0 @ test clip == 0
++ vrshr.s16 q9, q9, #3
++ vabs.s16 q8, q8 @ a2
++ vabs.s16 q9, q9 @ a1
++ vrshr.s16 q2, q2, #3
++ vcge.s16 q14, q9, q8 @ test a1 >= a2
++ vabs.s16 q15, q2 @ a0
++ vshr.s16 q2, q2, #8 @ a0_sign
++ vbsl q14, q8, q9 @ a3
++ vcge.s16 q8, q15, q12 @ test a0 >= pq
++ vsub.i16 q2, q13, q2 @ clip_sign - a0_sign
++ vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q12, q14, q15 @ test a3 >= a0
++ vorr q8, q11, q8 @ test clip == 0 || a0 >= pq
++ vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
++ vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0
++ vshl.i64 q11, q9, #16 @ move lane 2 of each group of 4 to the top of its 64-bit half
++ vmov.32 r0, d18[1] @ move to gp reg
++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ vmov.32 r2, d19[1] @ same for the second group of 4
++ vshr.s64 q9, q11, #48 @ sign-extend: lane 2 mask copied to all 4 lanes of its group
++ vcge.s16 q11, q0, q10 @ test d >= clip
++ vorr q8, q8, q9 @ each group of 4 is also disabled when its third pair is not filtered
++ and r0, r0, r2
++ vbsl q11, q10, q0 @ FFMIN(d, clip)
++ tst r0, #1 @ bit 0 set only if the third pair of both groups is not filtered
++ bne 1f @ none of the 8 pixel pairs should be updated in this case
++ vbic q0, q11, q8 @ set each d to zero if it should not be filtered
++ vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ vqmovun.s16 d0, q3
++ vqmovun.s16 d1, q1
++ vst1.32 {d0}, [r3 :64], r1 @ r3 was left pointing at row P4 by the loads above
++ vst1.32 {d1}, [r3 :64]
++1: bx lr
++endfunc
++
++.align 5 @ on ARM the .align operand is a power of two: 32-byte alignment
++.Lcoeffs: @ loaded into d0 by the loop filters in this file
++.quad 0x00050002 @ as 16-bit lanes: [0] = 2, [1] = 5, [2] = [3] = 0
++
++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of right block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter8_neon, export=1
++ push {lr} @ lr (r14) is used as a scratch register below
++ sub r3, r0, #4 @ where to start reading
++ vldr d0, .Lcoeffs @ 16-bit lanes: d0[0] = 2, d0[1] = 5
++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]...
++ sub r0, r0, #1 @ where to start writing
++ vld1.32 {d4}, [r3], r1
++ add r12, r0, r1, lsl #2 @ where to start writing rows 4-7
++ vld1.32 {d3}, [r3], r1
++ vld1.32 {d5}, [r3], r1
++ vld1.32 {d6}, [r3], r1
++ vld1.32 {d16}, [r3], r1
++ vld1.32 {d7}, [r3], r1
++ vld1.32 {d17}, [r3]
++ vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
++ vdup.16 q9, r2 @ pq
++ vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
++ vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
++ vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
++ vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
++ vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
++ vtrn.32 d2, d6 @ P1, P5
++ vtrn.32 d4, d16 @ P2, P6
++ vtrn.32 d3, d7 @ P3, P7
++ vtrn.32 d5, d17 @ P4, P8
++ vshll.u8 q10, d2, #1 @ 2*P1
++ vshll.u8 q11, d6, #1 @ 2*P5
++ vmovl.u8 q12, d4 @ P2
++ vmovl.u8 q13, d16 @ P6
++ vmovl.u8 q14, d3 @ P3
++ vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2
++ vmovl.u8 q12, d7 @ P7
++ vshll.u8 q1, d3, #1 @ 2*P3
++ vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6
++ vmovl.u8 q2, d5 @ P4
++ vmovl.u8 q8, d17 @ P8
++ vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7
++ vmovl.u8 q3, d6 @ P5
++ vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3
++ vsub.i16 q12, q2, q3 @ P4-P5
++ vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4
++ vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8
++ vabs.s16 q8, q12 @ |P4-P5|
++ vshr.s16 q12, q12, #8 @ clip_sign
++ vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4
++ vshr.s16 q8, q8, #1 @ clip
++ vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5
++ vrshr.s16 q11, q11, #3
++ vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6
++ vceq.i16 q13, q8, #0 @ test clip == 0
++ vrshr.s16 q10, q10, #3
++ vabs.s16 q11, q11 @ a2
++ vabs.s16 q10, q10 @ a1
++ vrshr.s16 q1, q1, #3
++ vcge.s16 q14, q10, q11 @ test a1 >= a2
++ vabs.s16 q15, q1 @ a0
++ vshr.s16 q1, q1, #8 @ a0_sign
++ vbsl q14, q11, q10 @ a3
++ vcge.s16 q9, q15, q9 @ test a0 >= pq
++ vsub.i16 q1, q12, q1 @ clip_sign - a0_sign
++ vqsub.u16 q10, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q11, q14, q15 @ test a3 >= a0
++ vorr q9, q13, q9 @ test clip == 0 || a0 >= pq
++ vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
++ vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0
++ vmov.32 r2, d20[1] @ move to gp reg
++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ vmov.32 r3, d21[1] @ same for the second group of 4
++ vcge.s16 q10, q0, q8 @ test d >= clip
++ and r14, r2, r3
++ vbsl q10, q8, q0 @ FFMIN(d, clip)
++ tst r14, #1 @ bit 0 set only if the third pair of both groups is not filtered
++ bne 2f @ none of the 8 pixel pairs should be updated in this case
++ vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ vqmovun.s16 d1, q3
++ vqmovun.s16 d0, q2
++ tst r2, #1
++ bne 1f @ none of the first 4 pixel pairs should be updated if so
++ vst2.8 {d0[0], d1[0]}, [r0], r1
++ vst2.8 {d0[1], d1[1]}, [r0], r1
++ vst2.8 {d0[2], d1[2]}, [r0], r1
++ vst2.8 {d0[3], d1[3]}, [r0]
++1: tst r3, #1
++ bne 2f @ none of the second 4 pixel pairs should be updated if so
++ vst2.8 {d0[4], d1[4]}, [r12], r1
++ vst2.8 {d0[5], d1[5]}, [r12], r1
++ vst2.8 {d0[6], d1[6]}, [r12], r1
++ vst2.8 {d0[7], d1[7]}, [r12]
++2: pop {pc} @ return via the lr value pushed on entry
++endfunc
++
++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of lower block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter16_neon, export=1
++ vpush {d8-d15} @ q4-q7 are used as scratch below, so preserve d8-d15
++ sub r3, r0, r1, lsl #2 @ r3 -> row 4 strides above r0 (P1)
++ vldr d0, .Lcoeffs @ d0[0] and d0[1] supply the 2 and 5 multipliers used below
++ vld1.64 {q1}, [r0 :128], r1 @ P5
++ vld1.64 {q2}, [r3 :128], r1 @ P1
++ vld1.64 {q3}, [r3 :128], r1 @ P2
++ vld1.64 {q4}, [r0 :128], r1 @ P6
++ vld1.64 {q5}, [r3 :128], r1 @ P3
++ vld1.64 {q6}, [r0 :128], r1 @ P7
++ vshll.u8 q7, d2, #1 @ 2*P5[0..7]
++ vshll.u8 q8, d4, #1 @ 2*P1[0..7]
++ vld1.64 {q9}, [r3 :128] @ P4
++ vmovl.u8 q10, d6 @ P2[0..7]
++ vld1.64 {q11}, [r0 :128] @ P8
++ vmovl.u8 q12, d8 @ P6[0..7]
++ vdup.16 q13, r2 @ pq
++ vshll.u8 q2, d5, #1 @ 2*P1[8..15]
++ vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7]
++ vshll.u8 q10, d3, #1 @ 2*P5[8..15]
++ vmovl.u8 q3, d7 @ P2[8..15]
++ vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7]
++ vmovl.u8 q4, d9 @ P6[8..15]
++ vmovl.u8 q14, d10 @ P3[0..7]
++ vmovl.u8 q15, d12 @ P7[0..7]
++ vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15]
++ vshll.u8 q3, d10, #1 @ 2*P3[0..7]
++ vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]
++ vmovl.u8 q6, d13 @ P7[8..15]
++ vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++ vmovl.u8 q14, d18 @ P4[0..7]
++ vmovl.u8 q9, d19 @ P4[8..15]
++ vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++ vmovl.u8 q15, d11 @ P3[8..15]
++ vshll.u8 q5, d11, #1 @ 2*P3[8..15]
++ vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7]
++ vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++ vmovl.u8 q15, d22 @ P8[0..7]
++ vmovl.u8 q11, d23 @ P8[8..15]
++ vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++ vmovl.u8 q6, d2 @ P5[0..7]
++ vmovl.u8 q1, d3 @ P5[8..15]
++ vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15]
++ vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++ vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++ vsub.i16 q15, q14, q6 @ P4[0..7]-P5[0..7]
++ vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++ vrshr.s16 q8, q8, #3 @ rounding shift, result still signed
++ vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++ vrshr.s16 q7, q7, #3
++ vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++ vabs.s16 q11, q15 @ |P4[0..7]-P5[0..7]|
++ vabs.s16 q8, q8 @ a1[0..7]
++ vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++ vshr.s16 q15, q15, #8 @ clip_sign[0..7]
++ vrshr.s16 q2, q2, #3
++ vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++ vabs.s16 q7, q7 @ a2[0..7]
++ vrshr.s16 q10, q10, #3
++ vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15]
++ vshr.s16 q11, q11, #1 @ clip[0..7]
++ vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++ vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7]
++ vabs.s16 q2, q2 @ a1[8..15]
++ vrshr.s16 q3, q3, #3
++ vabs.s16 q10, q10 @ a2[8..15]
++ vbsl q4, q7, q8 @ a3[0..7]
++ vabs.s16 q7, q12 @ |P4[8..15]-P5[8..15]|
++ vshr.s16 q8, q12, #8 @ clip_sign[8..15]
++ vrshr.s16 q5, q5, #3
++ vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8..15]
++ vshr.s16 q7, q7, #1 @ clip[8..15]
++ vbsl q12, q10, q2 @ a3[8..15]
++ vabs.s16 q2, q3 @ a0[0..7]
++ vceq.i16 q10, q11, #0 @ test clip[0..7] == 0
++ vshr.s16 q3, q3, #8 @ a0_sign[0..7]
++ vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7]
++ vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq
++ vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq
++ vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7]
++ vabs.s16 q4, q5 @ a0[8..15]
++ vshr.s16 q5, q5, #8 @ a0_sign[8..15]
++ vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++ vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq
++ vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++ vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15]
++ vceq.i16 q8, q7, #0 @ test clip[8..15] == 0
++ vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++ vmov.32 r0, d4[1] @ move to gp reg
++ vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq
++ vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vmov.32 r2, d5[1] @ as above, for pixel pairs 4..7
++ vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15]
++ vshl.i64 q2, q2, #16 @ with the vshr.s64 #48 below: replicate the test result of the 3rd pair of each group of 4 across its whole group
++ vcge.s16 q12, q15, q11 @ test d[0..7] >= clip[0..7]
++ vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++ vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++ vshr.s64 q2, q2, #48 @ every lane of a group now holds the 3rd pair's test result
++ and r0, r0, r2 @ bit 0 set only if both groups in pairs 0..7 are disabled
++ vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7])
++ vshl.i64 q11, q4, #16 @ same replication for pairs 8..15 (completed by the vshr.s64 below)
++ vmov.32 r2, d8[1] @ move to gp reg, pixel pairs 8..11
++ vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++ vorr q2, q10, q2 @ clip[0..7] == 0 || a0[0..7] >= pq || group disabled by its 3rd pair
++ vmov.32 r12, d9[1] @ as above, for pixel pairs 12..15
++ vshr.s64 q4, q11, #48
++ vcge.s16 q10, q0, q7 @ test d[8..15] >= clip[8..15]
++ vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++ vorr q4, q8, q4 @ clip[8..15] == 0 || a0[8..15] >= pq || group disabled by its 3rd pair
++ and r2, r2, r12 @ bit 0 set only if both groups in pairs 8..15 are disabled
++ vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15])
++ vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
++ and r0, r0, r2 @ bit 0 set only if all four groups are disabled
++ vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++ tst r0, #1
++ bne 1f @ none of the 16 pixel pairs should be updated in this case
++ vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
++ vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
++ vqmovun.s16 d4, q14 @ narrow filtered P4[0..7] to bytes with unsigned saturation
++ vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
++ vqmovun.s16 d0, q6 @ filtered P5[0..7]
++ vqmovun.s16 d5, q9 @ filtered P4[8..15]
++ vqmovun.s16 d1, q1 @ filtered P5[8..15]
++ vst1.64 {q2}, [r3 :128], r1 @ write back the P4 row (r3 was left pointing at it by the P4 load)
++ vst1.64 {q0}, [r3 :128] @ write back the P5 row
++1: vpop {d8-d15} @ restore the registers saved on entry
++ bx lr
++endfunc
++
++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of right block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter16_neon, export=1
++ push {r4-r6,lr} @ r4-r6 are used as scratch below
++ vpush {d8-d15} @ q4-q7 are used as scratch below
++ sub r3, r0, #4 @ where to start reading
++ vldr d0, .Lcoeffs @ d0[0] and d0[1] supply the 2 and 5 multipliers used below
++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]...
++ sub r0, r0, #1 @ where to start writing
++ vld1.32 {d3}, [r3], r1
++ add r4, r0, r1, lsl #2 @ where to start writing rows 4..7
++ vld1.32 {d10}, [r3], r1
++ vld1.32 {d11}, [r3], r1
++ vld1.32 {d16}, [r3], r1
++ vld1.32 {d4}, [r3], r1
++ vld1.32 {d8}, [r3], r1
++ vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
++ vld1.32 {d14}, [r3], r1
++ vld1.32 {d5}, [r3], r1
++ vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
++ vld1.32 {d6}, [r3], r1
++ vld1.32 {d12}, [r3], r1
++ vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
++ vld1.32 {d13}, [r3], r1
++ vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
++ vld1.32 {d1}, [r3], r1
++ vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
++ vld1.32 {d7}, [r3], r1
++ vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
++ vld1.32 {d9}, [r3], r1
++ vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
++ vld1.32 {d15}, [r3]
++ vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
++ vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
++ vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
++ vdup.16 q9, r2 @ pq
++ vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
++ vtrn.32 d2, d16 @ P1[0..7], P5[0..7]
++ vtrn.16 d5, d12 @ P1[8], P1[9], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
++ vtrn.16 d6, d13 @ P2[8], P2[9], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
++ vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
++ vtrn.32 d3, d4 @ P2[0..7], P6[0..7]
++ vshll.u8 q10, d2, #1 @ 2*P1[0..7]
++ vtrn.32 d10, d8 @ P3[0..7], P7[0..7]
++ vshll.u8 q11, d16, #1 @ 2*P5[0..7]
++ vtrn.32 d11, d14 @ P4[0..7], P8[0..7]
++ vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
++ vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
++ vmovl.u8 q1, d3 @ P2[0..7]
++ vmovl.u8 q12, d4 @ P6[0..7]
++ vtrn.32 d5, d1 @ P1[8..15], P5[8..15]
++ vtrn.32 d6, d7 @ P2[8..15], P6[8..15]
++ vtrn.32 d12, d9 @ P3[8..15], P7[8..15]
++ vtrn.32 d13, d15 @ P4[8..15], P8[8..15]
++ vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]
++ vmovl.u8 q1, d10 @ P3[0..7]
++ vshll.u8 q2, d5, #1 @ 2*P1[8..15]
++ vshll.u8 q13, d1, #1 @ 2*P5[8..15]
++ vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7]
++ vmovl.u8 q14, d6 @ P2[8..15]
++ vmovl.u8 q3, d7 @ P6[8..15]
++ vmovl.u8 q15, d8 @ P7[0..7]
++ vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++ vmovl.u8 q1, d12 @ P3[8..15]
++ vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15]
++ vmovl.u8 q4, d9 @ P7[8..15]
++ vshll.u8 q14, d10, #1 @ 2*P3[0..7]
++ vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15]
++ vmovl.u8 q5, d11 @ P4[0..7]
++ vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++ vshll.u8 q15, d12, #1 @ 2*P3[8..15]
++ vmovl.u8 q6, d13 @ P4[8..15]
++ vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++ vmovl.u8 q1, d14 @ P8[0..7]
++ vmovl.u8 q7, d15 @ P8[8..15]
++ vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++ vmovl.u8 q4, d16 @ P5[0..7]
++ vmovl.u8 q8, d1 @ P5[8..15]
++ vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7]
++ vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15]
++ vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++ vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++ vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7]
++ vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++ vrshr.s16 q10, q10, #3 @ rounding shift, result still signed
++ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++ vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15]
++ vrshr.s16 q11, q11, #3
++ vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++ vrshr.s16 q2, q2, #3
++ vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++ vabs.s16 q10, q10 @ a1[0..7]
++ vrshr.s16 q13, q13, #3
++ vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++ vabs.s16 q3, q11 @ a2[0..7]
++ vabs.s16 q2, q2 @ a1[8..15]
++ vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++ vabs.s16 q11, q1 @ |P4[0..7]-P5[0..7]|
++ vabs.s16 q12, q13 @ a2[8..15]
++ vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7]
++ vshr.s16 q1, q1, #8 @ clip_sign[0..7]
++ vrshr.s16 q15, q15, #3
++ vshr.s16 q11, q11, #1 @ clip[0..7]
++ vrshr.s16 q14, q14, #3
++ vbsl q13, q3, q10 @ a3[0..7]
++ vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8..15]
++ vabs.s16 q10, q15 @ a0[8..15]
++ vshr.s16 q15, q15, #8 @ a0_sign[8..15]
++ vbsl q3, q12, q2 @ a3[8..15]
++ vabs.s16 q2, q14 @ a0[0..7]
++ vabs.s16 q12, q7 @ |P4[8..15]-P5[8..15]|
++ vshr.s16 q7, q7, #8 @ clip_sign[8..15]
++ vshr.s16 q14, q14, #8 @ a0_sign[0..7]
++ vshr.s16 q12, q12, #1 @ clip[8..15]
++ vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15]
++ vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15]
++ vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq
++ vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq
++ vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7]
++ vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7]
++ vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++ vceq.i16 q15, q11, #0 @ test clip[0..7] == 0
++ vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++ vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq
++ vceq.i16 q14, q12, #0 @ test clip[8..15] == 0
++ vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++ vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++ vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++ vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq
++ vcge.s16 q14, q13, q12 @ test d[8..15] >= clip[8..15]
++ vmov.32 r2, d4[1] @ move to gp reg
++ vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++ vmov.32 r3, d5[1] @ as above, for pixel pairs 4..7
++ vcge.s16 q2, q0, q11 @ test d[0..7] >= clip[0..7]
++ vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15])
++ vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7])
++ vmov.32 r5, d6[1] @ as above, for pixel pairs 8..11
++ vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmov.32 r6, d7[1] @ as above, for pixel pairs 12..15
++ and r12, r2, r3 @ bit 0 set only if both groups in pairs 0..7 are disabled
++ vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
++ vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
++ and r14, r5, r6 @ bit 0 set only if both groups in pairs 8..15 are disabled
++ vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
++ and r12, r12, r14 @ bit 0 set only if all four groups are disabled
++ vqmovun.s16 d4, q6 @ narrow filtered P4[8..15] to bytes with unsigned saturation
++ vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
++ tst r12, #1
++ bne 4f @ none of the 16 pixel pairs should be updated in this case
++ vqmovun.s16 d2, q5 @ filtered P4[0..7]
++ vqmovun.s16 d3, q4 @ filtered P5[0..7]
++ vqmovun.s16 d5, q8 @ filtered P5[8..15]
++ tst r2, #1
++ bne 1f @ none of pixel pairs 0..3 should be updated if so
++ vst2.8 {d2[0], d3[0]}, [r0], r1
++ vst2.8 {d2[1], d3[1]}, [r0], r1
++ vst2.8 {d2[2], d3[2]}, [r0], r1
++ vst2.8 {d2[3], d3[3]}, [r0]
++1: add r0, r4, r1, lsl #2 @ where to start writing rows 8..11
++ tst r3, #1
++ bne 2f @ none of pixel pairs 4..7 should be updated if so
++ vst2.8 {d2[4], d3[4]}, [r4], r1
++ vst2.8 {d2[5], d3[5]}, [r4], r1
++ vst2.8 {d2[6], d3[6]}, [r4], r1
++ vst2.8 {d2[7], d3[7]}, [r4]
++2: add r4, r0, r1, lsl #2 @ where to start writing rows 12..15
++ tst r5, #1
++ bne 3f @ none of pixel pairs 8..11 should be updated if so
++ vst2.8 {d4[0], d5[0]}, [r0], r1
++ vst2.8 {d4[1], d5[1]}, [r0], r1
++ vst2.8 {d4[2], d5[2]}, [r0], r1
++ vst2.8 {d4[3], d5[3]}, [r0]
++3: tst r6, #1
++ bne 4f @ none of pixel pairs 12..15 should be updated if so
++ vst2.8 {d4[4], d5[4]}, [r4], r1
++ vst2.8 {d4[5], d5[5]}, [r4], r1
++ vst2.8 {d4[6], d5[6]}, [r4], r1
++ vst2.8 {d4[7], d5[7]}, [r4]
++4: vpop {d8-d15} @ restore the registers saved on entry
++ pop {r4-r6,pc} @ restore r4-r6 and return
++endfunc
++
++@ Copy at most the specified number of bytes from source to destination buffer,
++@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
++@ On entry:
++@ r0 -> source buffer
++@ r1 = max number of bytes to copy
++@ r2 -> destination buffer, optimally 8-byte aligned
++@ On exit:
++@ r0 = number of bytes not copied
++function ff_vc1_unescape_buffer_helper_neon, export=1
++ @ Offset by 48 to screen out cases that are too short for us to handle,
++ @ and also make it easy to test for loop termination, or to determine
++ @ whether we need an odd number of half-iterations of the loop.
++ subs r1, r1, #48
++ bmi 90f @ fewer than 48 bytes: copy nothing
++
++ @ Set up useful constants
++ vmov.i32 q0, #0x3000000 @ don't-care bits: low 2 bits of the top byte of each 32-bit word
++ vmov.i32 q1, #0x30000 @ after masking, a word equal to this holds bytes 00 00 03 0x, x = 0..3 (assuming little-endian byte order)
++
++ tst r1, #16
++ bne 1f @ odd number of 16-byte half-iterations: start at the top of the loop
++
++ vld1.8 {q8, q9}, [r0]!
++ vbic q12, q8, q0 @ candidate words starting at byte offsets 0, 4, 8, 12 of q8
++ vext.8 q13, q8, q9, #1 @ candidate words starting at offsets 1, 5, 9, 13
++ vext.8 q14, q8, q9, #2 @ candidate words starting at offsets 2, 6, 10, 14
++ vext.8 q15, q8, q9, #3 @ candidate words starting at offsets 3, 7, 11, 15
++ veor q12, q12, q1
++ vbic q13, q13, q0
++ vbic q14, q14, q0
++ vbic q15, q15, q0
++ vceq.i32 q12, q12, #0 @ all-ones in each word that matched the pattern
++ veor q13, q13, q1
++ veor q14, q14, q1
++ veor q15, q15, q1
++ vceq.i32 q13, q13, #0
++ vceq.i32 q14, q14, #0
++ vceq.i32 q15, q15, #0
++ add r1, r1, #16 @ compensate for joining the loop at its second half (see 91 below)
++ b 3f
++
++1: vld1.8 {q10, q11}, [r0]!
++ vbic q12, q10, q0 @ same pattern search as above, on the 16 bytes in q10
++ vext.8 q13, q10, q11, #1
++ vext.8 q14, q10, q11, #2
++ vext.8 q15, q10, q11, #3
++ veor q12, q12, q1
++ vbic q13, q13, q0
++ vbic q14, q14, q0
++ vbic q15, q15, q0
++ vceq.i32 q12, q12, #0
++ veor q13, q13, q1
++ veor q14, q14, q1
++ veor q15, q15, q1
++ vceq.i32 q13, q13, #0
++ vceq.i32 q14, q14, #0
++ vceq.i32 q15, q15, #0
++ @ Drop through...
++2: vmov q8, q11
++ vld1.8 {q9}, [r0]!
++ vorr q13, q12, q13 @ start merging the match masks computed for q10
++ vorr q15, q14, q15
++ vbic q12, q8, q0 @ meanwhile begin the pattern search on q8
++ vorr q3, q13, q15
++ vext.8 q13, q8, q9, #1
++ vext.8 q14, q8, q9, #2
++ vext.8 q15, q8, q9, #3
++ veor q12, q12, q1
++ vorr d6, d6, d7
++ vbic q13, q13, q0
++ vbic q14, q14, q0
++ vbic q15, q15, q0
++ vceq.i32 q12, q12, #0
++ vmov r3, r12, d6 @ move merged match mask for q10 to gp regs
++ veor q13, q13, q1
++ veor q14, q14, q1
++ veor q15, q15, q1
++ vceq.i32 q13, q13, #0
++ vceq.i32 q14, q14, #0
++ vceq.i32 q15, q15, #0
++ orrs r3, r3, r12
++ bne 90f @ a match starts within q10's 16 bytes: stop before storing them
++ vst1.64 {q10}, [r2]!
++3: vmov q10, q9
++ vld1.8 {q11}, [r0]!
++ vorr q13, q12, q13 @ start merging the match masks computed for q8
++ vorr q15, q14, q15
++ vbic q12, q10, q0 @ meanwhile begin the pattern search on q10
++ vorr q3, q13, q15
++ vext.8 q13, q10, q11, #1
++ vext.8 q14, q10, q11, #2
++ vext.8 q15, q10, q11, #3
++ veor q12, q12, q1
++ vorr d6, d6, d7
++ vbic q13, q13, q0
++ vbic q14, q14, q0
++ vbic q15, q15, q0
++ vceq.i32 q12, q12, #0
++ vmov r3, r12, d6 @ move merged match mask for q8 to gp regs
++ veor q13, q13, q1
++ veor q14, q14, q1
++ veor q15, q15, q1
++ vceq.i32 q13, q13, #0
++ vceq.i32 q14, q14, #0
++ vceq.i32 q15, q15, #0
++ orrs r3, r3, r12
++ bne 91f @ a match starts within q8's 16 bytes: stop before storing them
++ vst1.64 {q8}, [r2]!
++ subs r1, r1, #32 @ two 16-byte half-iterations completed
++ bpl 2b
++
++90: add r0, r1, #48 @ undo the initial offset to give the number of bytes not copied
++ bx lr
++
++91: sub r1, r1, #16 @ r1 is only updated once per full iteration, so correct it by one half-iteration here
++ b 90b
++endfunc
+--- a/libavcodec/avcodec.h
++++ b/libavcodec/avcodec.h
+@@ -2567,6 +2567,17 @@ typedef struct AVHWAccel {
+ * that avctx->hwaccel_priv_data is invalid.
+ */
+ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++
++ /**
++ * Called if parsing fails
++ *
++ * An error has occurred, end_frame will not be called
++ * start_frame & decode_slice may or may not have been called
++ * Optional
++ *
++ * @param avctx the codec context
++ */
++ void (*abort_frame)(AVCodecContext *avctx);
+ } AVHWAccel;
+
+ /**
+--- a/libavcodec/cabac.h
++++ b/libavcodec/cabac.h
+@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_table
+ typedef struct CABACContext{
+ int low;
+ int range;
+- int outstanding_count;
++ union
++ {
++ int outstanding_count;
++ struct {
++ uint16_t bits;
++ uint16_t range;
++ } by22;
++ };
+ const uint8_t *bytestream_start;
+ const uint8_t *bytestream;
+ const uint8_t *bytestream_end;
+--- a/libavcodec/codec.h
++++ b/libavcodec/codec.h
+@@ -350,6 +350,17 @@ const AVCodec *av_codec_iterate(void **o
+ AVCodec *avcodec_find_decoder(enum AVCodecID id);
+
+ /**
++ * Find a registered decoder with a matching codec ID and pix_fmt.
++ * A decoder with pix_fmt set to NULL will match any fmt.
++ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmt NULL.
++ *
++ * @param id AVCodecID of the requested decoder
++ * @param fmt AVPixelFormat that must be supported by the decoder
++ * @return A decoder if one was found, NULL otherwise.
++ */
++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
++
++/**
+ * Find a registered decoder with the specified name.
+ *
+ * @param name name of the requested decoder
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v1.h
+@@ -0,0 +1,229 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++
++struct v4l2_ctrl_hevc_pps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ __u8 num_extra_slice_header_bits;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++
++ __u8 padding[4];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 rps;
++ __u8 field_pic;
++ __u16 pic_order_cnt[2];
++ __u8 padding[2];
++};
++
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 padding[6];
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_bit_offset;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __u16 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 num_active_dpb_entries;
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ __u8 num_rps_poc_st_curr_before;
++ __u8 num_rps_poc_st_curr_after;
++ __u8 num_rps_poc_lt_curr;
++
++ __u8 padding;
++
++ __u32 entry_point_offset_minus1[256];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u64 flags;
++};
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16];
++ __u8 scaling_list_8x8[6][64];
++ __u8 scaling_list_16x16[6][64];
++ __u8 scaling_list_32x32[2][64];
++ __u8 scaling_list_dc_coef_16x16[6];
++ __u8 scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v2.h
+@@ -0,0 +1,257 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
++
++struct v4l2_ctrl_hevc_pps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ __u8 num_extra_slice_header_bits;
++ __u8 num_ref_idx_l0_default_active_minus1;
++ __u8 num_ref_idx_l1_default_active_minus1;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++
++ __u8 padding[4];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 rps;
++ __u8 field_pic;
++ __u16 pic_order_cnt[2];
++ __u8 padding[2];
++};
++
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 padding[6];
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_bit_offset;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __u16 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ __u8 padding[5];
++
++ __u32 entry_point_offset_minus1[256];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
++
++struct v4l2_ctrl_hevc_decode_params {
++ __s32 pic_order_cnt_val;
++ __u8 num_active_dpb_entries;
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 num_poc_st_curr_before;
++ __u8 num_poc_st_curr_after;
++ __u8 num_poc_lt_curr;
++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u64 flags;
++};
++
++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
++/*
++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
++ * the number of bits to skip in the
++ * slice segment header.
++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
++ * to before syntax element "slice_temporal_mvp_enabled_flag".
++ * If IDR, the skipped bits are just "pic_output_flag"
++ * (separate_colour_plane_flag is not supported).
++ */
++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16];
++ __u8 scaling_list_8x8[6][64];
++ __u8 scaling_list_16x16[6][64];
++ __u8 scaling_list_32x32[2][64];
++ __u8 scaling_list_dc_coef_16x16[6];
++ __u8 scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v3.h
+@@ -0,0 +1,255 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
++
++struct v4l2_ctrl_hevc_pps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ __u8 num_extra_slice_header_bits;
++ __u8 num_ref_idx_l0_default_active_minus1;
++ __u8 num_ref_idx_l1_default_active_minus1;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++
++ __u8 padding[4];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 flags;
++ __u8 field_pic;
++ __u16 pic_order_cnt[2];
++ __u8 padding[2];
++};
++
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 padding[6];
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_bit_offset;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __u16 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ __u8 padding[5];
++
++ __u32 entry_point_offset_minus1[256];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
++
++struct v4l2_ctrl_hevc_decode_params {
++ __s32 pic_order_cnt_val;
++ __u8 num_active_dpb_entries;
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 num_poc_st_curr_before;
++ __u8 num_poc_st_curr_after;
++ __u8 num_poc_lt_curr;
++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u64 flags;
++};
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16];
++ __u8 scaling_list_8x8[6][64];
++ __u8 scaling_list_16x16[6][64];
++ __u8 scaling_list_32x32[2][64];
++ __u8 scaling_list_dc_coef_16x16[6];
++ __u8 scaling_list_dc_coef_32x32[2];
++};
++
++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
++/*
++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
++ * the number of bits to skip in the
++ * slice segment header.
++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
++ * to before syntax element "slice_temporal_mvp_enabled_flag".
++ * If IDR, the skipped bits are just "pic_output_flag"
++ * (separate_colour_plane_flag is not supported).
++ */
++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
++
++#endif
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v4.h
+@@ -0,0 +1,515 @@
++/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
++/*
++ * Video for Linux Two controls header file
++ *
++ * Copyright (C) 1999-2012 the contributors
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * Alternatively you can redistribute this file under the terms of the
++ * BSD license as stated below:
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in
++ * the documentation and/or other materials provided with the
++ * distribution.
++ * 3. The names of its contributors may not be used to endorse or promote
++ * products derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * The contents of this header was split off from videodev2.h. All control
++ * definitions should be added to this header, which is included by
++ * videodev2.h.
++ */
++
++#ifndef AVCODEC_HEVC_CTRLS_V4_H
++#define AVCODEC_HEVC_CTRLS_V4_H
++
++#include <linux/const.h>
++#include <linux/types.h>
++
++#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400)
++#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401)
++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402)
++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403)
++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404)
++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405)
++#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406)
++#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
++
++enum v4l2_stateless_hevc_decode_mode {
++ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_stateless_hevc_start_code {
++ V4L2_STATELESS_HEVC_START_CODE_NONE,
++ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/**
++ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
++ *
++ * @video_parameter_set_id: specifies the value of the
++ * vps_video_parameter_set_id of the active VPS
++ * @seq_parameter_set_id: provides an identifier for the SPS for
++ * reference by other syntax elements
++ * @pic_width_in_luma_samples: specifies the width of each decoded picture
++ * in units of luma samples
++ * @pic_height_in_luma_samples: specifies the height of each decoded picture
++ * in units of luma samples
++ * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the
++ * samples of the luma array
++ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
++ * samples of the chroma arrays
++ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of
++ * the variable MaxPicOrderCntLsb
++ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
++ * required size of the decoded picture
++ * buffer for the codec video sequence
++ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
++ * @sps_max_latency_increase_plus1: when not equal to 0, used to compute the
++ * value of SpsMaxLatencyPictures array
++ * @log2_min_luma_coding_block_size_minus3: this value plus 3 specifies the minimum
++ * luma coding block size
++ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
++ * the maximum and minimum luma
++ * coding block size
++ * @log2_min_luma_transform_block_size_minus2: this value plus 2 specifies the minimum luma
++ * transform block size
++ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
++ * the maximum and minimum luma
++ * transform block size
++ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
++ * depth for transform units of
++ * coding units coded in inter
++ * prediction mode
++ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
++ * depth for transform units of
++ * coding units coded in intra
++ * prediction mode
++ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
++ * bits used to represent each of PCM sample
++ * values of the luma component
++ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
++ * of bits used to represent each of PCM
++ * sample values of the chroma components
++ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
++ * minimum size of coding blocks
++ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
++ * the maximum and minimum size of
++ * coding blocks
++ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
++ * syntax structures included in the SPS
++ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term
++ * reference pictures that are specified in the SPS
++ * @chroma_format_idc: specifies the chroma sampling
++ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
++ * of temporal sub-layers
++ * @reserved: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_SPS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_sps {
++ __u8 video_parameter_set_id;
++ __u8 seq_parameter_set_id;
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u8 reserved[6];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
++
++/**
++ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
++ *
++ * @pic_parameter_set_id: identifies the PPS for reference by other
++ * syntax elements
++ * @num_extra_slice_header_bits: specifies the number of extra slice header
++ * bits that are present in the slice header RBSP
++ * for coded pictures referring to the PPS.
++ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the
++ * inferred value of num_ref_idx_l0_active_minus1
++ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the
++ * inferred value of num_ref_idx_l1_active_minus1
++ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for
++ * each slice referring to the PPS
++ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
++ * tree block size and the minimum luma coding block
++ * size of coding units that convey cu_qp_delta_abs
++ * and cu_qp_delta_sign_flag
++ * @pps_cb_qp_offset: specifies the offset to the luma quantization parameter used for deriving Qp'Cb
++ * @pps_cr_qp_offset: specifies the offset to the luma quantization parameter used for deriving Qp'Cr
++ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
++ * partitioning the picture
++ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning
++ * the picture
++ * @column_width_minus1: this value plus 1 specifies the width of each tile column in
++ * units of coding tree blocks
++ * @row_height_minus1: this value plus 1 specifies the height of each tile row in
++ * units of coding tree blocks
++ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
++ * beta divided by 2
++ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
++ * divided by 2
++ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
++ * the variable Log2ParMrgLevel
++ * @reserved: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_PPS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_pps {
++ __u8 pic_parameter_set_id;
++ __u8 num_extra_slice_header_bits;
++ __u8 num_ref_idx_l0_default_active_minus1;
++ __u8 num_ref_idx_l1_default_active_minus1;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++ __u8 reserved;
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
++
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++/**
++ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
++ *
++ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
++ * @flags: long term flag for the reference frame
++ * @field_pic: whether the reference is a field picture or a frame.
++ * @reserved: padding field. Should be zeroed by applications.
++ * @pic_order_cnt_val: the picture order count of the reference picture.
++ */
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 flags;
++ __u8 field_pic;
++ __u16 reserved;
++ __s32 pic_order_cnt_val;
++};
++
++/**
++ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
++ *
++ * @delta_luma_weight_l0: the difference of the weighting factor applied
++ * to the luma prediction value for list 0
++ * @luma_offset_l0: the additive offset applied to the luma prediction value
++ * for list 0
++ * @delta_chroma_weight_l0: the difference of the weighting factor applied
++ * to the chroma prediction values for list 0
++ * @chroma_offset_l0: the difference of the additive offset applied to
++ * the chroma prediction values for list 0
++ * @delta_luma_weight_l1: the difference of the weighting factor applied
++ * to the luma prediction value for list 1
++ * @luma_offset_l1: the additive offset applied to the luma prediction value
++ * for list 1
++ * @delta_chroma_weight_l1: the difference of the weighting factor applied
++ * to the chroma prediction values for list 1
++ * @chroma_offset_l1: the difference of the additive offset applied to
++ * the chroma prediction values for list 1
++ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
++ * all luma weighting factors
++ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
++ * of the denominator for all chroma
++ * weighting factors
++ */
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++/**
++ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
++ *
++ * This control is a dynamically sized 1-dimensional array,
++ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
++ *
++ * @bit_size: size (in bits) of the current slice data
++ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
++ * @num_entry_point_offsets: specifies the number of entry point offset syntax
++ * elements in the slice header.
++ * @nal_unit_type: specifies the type of the NAL unit carrying the slice (NAL unit header)
++ * @nuh_temporal_id_plus1: this value minus 1 specifies a temporal identifier for the NAL unit
++ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
++ * @colour_plane_id: specifies the colour plane associated with the current slice
++ * @slice_pic_order_cnt: specifies the picture order count
++ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum
++ * reference index for reference picture list 0
++ * that may be used to decode the slice
++ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum
++ * reference index for reference picture list 1
++ * that may be used to decode the slice
++ * @collocated_ref_idx: specifies the reference index of the collocated picture used
++ * for temporal motion vector prediction
++ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
++ * motion vector prediction candidates supported in
++ * the slice subtracted from 5
++ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
++ * blocks in the slice
++ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
++ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
++ * @slice_act_y_qp_offset: screen content extension parameters
++ * @slice_act_cb_qp_offset: screen content extension parameters
++ * @slice_act_cr_qp_offset: screen content extension parameters
++ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
++ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
++ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
++ * more fields
++ * @reserved0: padding field. Should be zeroed by applications.
++ * @slice_segment_addr: specifies the address of the first coding tree block in
++ * the slice segment
++ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
++ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
++ * @short_term_ref_pic_set_size: specifies the size of short-term reference
++ * pictures set included in the SPS
++ * @long_term_ref_pic_set_size: specifies the size of long-term reference
++ * pictures set included in the SPS
++ * @pred_weight_table: the prediction weight coefficients for inter-picture
++ * prediction
++ * @reserved1: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_byte_offset;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __s32 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ __u8 reserved0[3];
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u16 short_term_ref_pic_set_size;
++ __u16 long_term_ref_pic_set_size;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u8 reserved1[2];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
++
++/**
++ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
++ *
++ * @pic_order_cnt_val: picture order count
++ * @short_term_ref_pic_set_size: specifies the size of short-term reference
++ * pictures set included in the SPS of the first slice
++ * @long_term_ref_pic_set_size: specifies the size of long-term reference
++ * pictures set included in the SPS of the first slice
++ * @num_active_dpb_entries: the number of entries in dpb
++ * @num_poc_st_curr_before: the number of reference pictures in the short-term
++ * set that come before the current frame
++ * @num_poc_st_curr_after: the number of reference pictures in the short-term
++ * set that come after the current frame
++ * @num_poc_lt_curr: the number of reference pictures in the long-term set
++ * @poc_st_curr_before: provides the index of the short term before references
++ * in DPB array
++ * @poc_st_curr_after: provides the index of the short term after references
++ * in DPB array
++ * @poc_lt_curr: provides the index of the long term references in DPB array
++ * @reserved: padding field. Should be zeroed by applications.
++ * @dpb: the decoded picture buffer, for meta-data about reference frames
++ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_decode_params {
++ __s32 pic_order_cnt_val;
++ __u16 short_term_ref_pic_set_size;
++ __u16 long_term_ref_pic_set_size;
++ __u8 num_active_dpb_entries;
++ __u8 num_poc_st_curr_before;
++ __u8 num_poc_st_curr_after;
++ __u8 num_poc_lt_curr;
++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 reserved[4];
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u64 flags;
++};
++
++/**
++ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
++ *
++ * @scaling_list_4x4: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_8x8: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_16x16: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_32x32: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process
++ * for transform coefficients. The values on each
++ * scaling list are expected in raster scan order.
++ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process
++ * for transform coefficients. The values on each
++ * scaling list are expected in raster scan order.
++ */
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16];
++ __u8 scaling_list_8x8[6][64];
++ __u8 scaling_list_16x16[6][64];
++ __u8 scaling_list_32x32[2][64];
++ __u8 scaling_list_dc_coef_16x16[6];
++ __u8 scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+--- a/libavcodec/hevc_parser.c
++++ b/libavcodec/hevc_parser.c
+@@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCod
+ avctx->profile = ps->sps->ptl.general_ptl.profile_idc;
+ avctx->level = ps->sps->ptl.general_ptl.level_idc;
+
++ if (ps->sps->chroma_format_idc == 1) {
++ avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ?
++ ps->sps->vui.chroma_sample_loc_type_top_field + 1 :
++ AVCHROMA_LOC_LEFT;
++ }
++ else if (ps->sps->chroma_format_idc == 2 ||
++ ps->sps->chroma_format_idc == 3) {
++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
++ }
++ else {
++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
++ }
++
+ if (ps->vps->vps_timing_info_present_flag) {
+ num = ps->vps->vps_num_units_in_tick;
+ den = ps->vps->vps_time_scale;
+--- a/libavcodec/hevc_refs.c
++++ b/libavcodec/hevc_refs.c
+@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContex
+ if (!frame->rpl_buf)
+ goto fail;
+
+- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
+- if (!frame->tab_mvf_buf)
+- goto fail;
+- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++ if (s->tab_mvf_pool) {
++ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
++ if (!frame->tab_mvf_buf)
++ goto fail;
++ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++ }
+
+- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
+- if (!frame->rpl_tab_buf)
+- goto fail;
+- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
+- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
+- for (j = 0; j < frame->ctb_count; j++)
+- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++ if (s->rpl_tab_pool) {
++ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
++ if (!frame->rpl_tab_buf)
++ goto fail;
++ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
++ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
++ for (j = 0; j < frame->ctb_count; j++)
++ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++ }
+
+ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
+ frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
+@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s
+ int ctb_count = frame->ctb_count;
+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+ int i;
++ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
+
+ if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
+ return AVERROR_INVALIDDATA;
+
+- for (i = ctb_addr_ts; i < ctb_count; i++)
+- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
++ if (frame->rpl_tab) {
++ for (i = ctb_addr_ts; i < ctb_count; i++)
++ frame->rpl_tab[i] = tab;
++ }
+
+- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
++ frame->refPicList = tab->refPicList;
+
+ return 0;
+ }
+--- a/libavcodec/hevcdec.c
++++ b/libavcodec/hevcdec.c
+@@ -332,6 +332,19 @@ static void export_stream_params(HEVCCon
+
+ ff_set_sar(avctx, sps->vui.sar);
+
++ if (sps->chroma_format_idc == 1) {
++ avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ?
++ sps->vui.chroma_sample_loc_type_top_field + 1 :
++ AVCHROMA_LOC_LEFT;
++ }
++ else if (sps->chroma_format_idc == 2 ||
++ sps->chroma_format_idc == 3) {
++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
++ }
++ else {
++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
++ }
++
+ if (sps->vui.video_signal_type_present_flag)
+ avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
+ : AVCOL_RANGE_MPEG;
+@@ -372,14 +385,20 @@ static enum AVPixelFormat get_format(HEV
+ #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
+ CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
+ CONFIG_HEVC_NVDEC_HWACCEL + \
++ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \
+ CONFIG_HEVC_VAAPI_HWACCEL + \
+ CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
++ CONFIG_HEVC_RPI4_8_HWACCEL + \
++ CONFIG_HEVC_RPI4_10_HWACCEL + \
+ CONFIG_HEVC_VDPAU_HWACCEL)
+ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
+
+ switch (sps->pix_fmt) {
+ case AV_PIX_FMT_YUV420P:
+ case AV_PIX_FMT_YUVJ420P:
++#if CONFIG_HEVC_RPI4_8_HWACCEL
++ *fmt++ = AV_PIX_FMT_RPI4_8;
++#endif
+ #if CONFIG_HEVC_DXVA2_HWACCEL
+ *fmt++ = AV_PIX_FMT_DXVA2_VLD;
+ #endif
+@@ -399,8 +418,14 @@ static enum AVPixelFormat get_format(HEV
+ #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
+ *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
+ #endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++ *fmt++ = AV_PIX_FMT_DRM_PRIME;
++#endif
+ break;
+ case AV_PIX_FMT_YUV420P10:
++#if CONFIG_HEVC_RPI4_10_HWACCEL
++ *fmt++ = AV_PIX_FMT_RPI4_10;
++#endif
+ #if CONFIG_HEVC_DXVA2_HWACCEL
+ *fmt++ = AV_PIX_FMT_DXVA2_VLD;
+ #endif
+@@ -417,6 +442,9 @@ static enum AVPixelFormat get_format(HEV
+ #if CONFIG_HEVC_NVDEC_HWACCEL
+ *fmt++ = AV_PIX_FMT_CUDA;
+ #endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++ *fmt++ = AV_PIX_FMT_DRM_PRIME;
++#endif
+ break;
+ case AV_PIX_FMT_YUV444P:
+ #if CONFIG_HEVC_VDPAU_HWACCEL
+@@ -459,6 +487,16 @@ static int set_sps(HEVCContext *s, const
+ if (!sps)
+ return 0;
+
++ // If hwaccel then we don't need all the s/w decode helper arrays
++ if (s->avctx->hwaccel) {
++ export_stream_params(s, sps);
++
++ s->avctx->pix_fmt = pix_fmt;
++ s->ps.sps = sps;
++ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++ return 0;
++ }
++
+ ret = pic_arrays_init(s, sps);
+ if (ret < 0)
+ goto fail;
+@@ -2809,11 +2847,13 @@ static int hevc_frame_start(HEVCContext
+ ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
+ int ret;
+
+- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+- memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
+- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
+- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++ if (s->horizontal_bs) {
++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++ }
+
+ s->is_decoded = 0;
+ s->first_nal_type = s->nal_unit_type;
+@@ -3230,7 +3270,14 @@ static int hevc_decode_frame(AVCodecCont
+ s->ref = NULL;
+ ret = decode_nal_units(s, avpkt->data, avpkt->size);
+ if (ret < 0)
++ {
++ // Ensure that hwaccel knows this frame is over
++ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) {
++ s->avctx->hwaccel->abort_frame(s->avctx);
++ }
++
+ return ret;
++ }
+
+ if (avctx->hwaccel) {
+ if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
+@@ -3273,15 +3320,19 @@ static int hevc_ref_frame(HEVCContext *s
+ if (ret < 0)
+ return ret;
+
+- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
+- if (!dst->tab_mvf_buf)
+- goto fail;
+- dst->tab_mvf = src->tab_mvf;
++ if (src->tab_mvf_buf) {
++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
++ if (!dst->tab_mvf_buf)
++ goto fail;
++ dst->tab_mvf = src->tab_mvf;
++ }
+
+- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
+- if (!dst->rpl_tab_buf)
+- goto fail;
+- dst->rpl_tab = src->rpl_tab;
++ if (src->rpl_tab_buf) {
++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
++ if (!dst->rpl_tab_buf)
++ goto fail;
++ dst->rpl_tab = src->rpl_tab;
++ }
+
+ dst->rpl_buf = av_buffer_ref(src->rpl_buf);
+ if (!dst->rpl_buf)
+@@ -3585,6 +3636,15 @@ AVCodec ff_hevc_decoder = {
+ #if CONFIG_HEVC_NVDEC_HWACCEL
+ HWACCEL_NVDEC(hevc),
+ #endif
++#if CONFIG_HEVC_RPI4_8_HWACCEL
++ HWACCEL_RPI4_8(hevc),
++#endif
++#if CONFIG_HEVC_RPI4_10_HWACCEL
++ HWACCEL_RPI4_10(hevc),
++#endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++ HWACCEL_V4L2REQUEST(hevc),
++#endif
+ #if CONFIG_HEVC_VAAPI_HWACCEL
+ HWACCEL_VAAPI(hevc),
+ #endif
+--- a/libavcodec/hwaccels.h
++++ b/libavcodec/hwaccels.h
+@@ -34,6 +34,9 @@ extern const AVHWAccel ff_hevc_d3d11va_h
+ extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
+ extern const AVHWAccel ff_hevc_dxva2_hwaccel;
+ extern const AVHWAccel ff_hevc_nvdec_hwaccel;
++extern const AVHWAccel ff_hevc_rpi4_8_hwaccel;
++extern const AVHWAccel ff_hevc_rpi4_10_hwaccel;
++extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
+ extern const AVHWAccel ff_hevc_vaapi_hwaccel;
+ extern const AVHWAccel ff_hevc_vdpau_hwaccel;
+ extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
+--- a/libavcodec/hwconfig.h
++++ b/libavcodec/hwconfig.h
+@@ -24,6 +24,7 @@
+
+
+ #define HWACCEL_CAP_ASYNC_SAFE (1 << 0)
++#define HWACCEL_CAP_MT_SAFE (1 << 1)
+
+
+ typedef struct AVCodecHWConfigInternal {
+@@ -70,6 +71,12 @@ typedef struct AVCodecHWConfigInternal {
+ HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
+ #define HWACCEL_NVDEC(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel)
++#define HWACCEL_RPI4_8(codec) \
++ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8, NONE, ff_ ## codec ## _rpi4_8_hwaccel)
++#define HWACCEL_RPI4_10(codec) \
++ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10, NONE, ff_ ## codec ## _rpi4_10_hwaccel)
++#define HWACCEL_V4L2REQUEST(codec) \
++ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel)
+ #define HWACCEL_VAAPI(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel)
+ #define HWACCEL_VDPAU(codec) \
+--- a/libavcodec/mmaldec.c
++++ b/libavcodec/mmaldec.c
+@@ -24,6 +24,9 @@
+ * MMAL Video Decoder
+ */
+
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
+ #include <bcm_host.h>
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/mmal_parameters_video.h>
+@@ -31,6 +34,7 @@
+ #include <interface/mmal/util/mmal_util_params.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+ #include <interface/mmal/vc/mmal_vc_api.h>
++#pragma GCC diagnostic pop
+ #include <stdatomic.h>
+
+ #include "avcodec.h"
+--- a/libavcodec/pthread_frame.c
++++ b/libavcodec/pthread_frame.c
+@@ -191,7 +191,8 @@ static attribute_align_arg void *frame_w
+
+ /* if the previous thread uses hwaccel then we take the lock to ensure
+ * the threads don't run concurrently */
+- if (avctx->hwaccel) {
++ if (avctx->hwaccel &&
++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
+ pthread_mutex_lock(&p->parent->hwaccel_mutex);
+ p->hwaccel_serializing = 1;
+ }
+@@ -614,7 +615,9 @@ void ff_thread_finish_setup(AVCodecConte
+
+ if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
+
+- if (avctx->hwaccel && !p->hwaccel_serializing) {
++ if (avctx->hwaccel &&
++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
++ !p->hwaccel_serializing) {
+ pthread_mutex_lock(&p->parent->hwaccel_mutex);
+ p->hwaccel_serializing = 1;
+ }
+--- a/libavcodec/raw.c
++++ b/libavcodec/raw.c
+@@ -293,6 +293,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags
+ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */
+ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
+
++ /* RPI (Might as well define for everything) */
++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') },
++ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') },
++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') },
++ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') },
++
+ { AV_PIX_FMT_NONE, 0 },
+ };
+
+--- a/libavcodec/rawenc.c
++++ b/libavcodec/rawenc.c
+@@ -24,6 +24,7 @@
+ * Raw Video Encoder
+ */
+
++#include "config.h"
+ #include "avcodec.h"
+ #include "raw.h"
+ #include "internal.h"
+@@ -31,6 +32,10 @@
+ #include "libavutil/intreadwrite.h"
+ #include "libavutil/imgutils.h"
+ #include "libavutil/internal.h"
++#include "libavutil/avassert.h"
++#if CONFIG_SAND
++#include "libavutil/rpi_sand_fns.h"
++#endif
+
+ static av_cold int raw_encode_init(AVCodecContext *avctx)
+ {
+@@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS
+ return 0;
+ }
+
++#if CONFIG_SAND
++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++ const AVFrame *frame)
++{
++ const int width = av_frame_cropped_width(frame);
++ const int height = av_frame_cropped_height(frame);
++ const int x0 = frame->crop_left;
++ const int y0 = frame->crop_top;
++ const int size = width * height * 3 / 2;
++ uint8_t * dst;
++ int ret;
++
++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++ return ret;
++
++ dst = pkt->data;
++
++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
++ dst += width * height;
++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
++ return 0;
++}
++
++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++ const AVFrame *frame)
++{
++ const int width = av_frame_cropped_width(frame);
++ const int height = av_frame_cropped_height(frame);
++ const int x0 = frame->crop_left;
++ const int y0 = frame->crop_top;
++ const int size = width * height * 3;
++ uint8_t * dst;
++ int ret;
++
++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++ return ret;
++
++ dst = pkt->data;
++
++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
++ dst += width * height * 2;
++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
++ return 0;
++}
++
++static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++ const AVFrame *frame)
++{
++ const int width = av_frame_cropped_width(frame);
++ const int height = av_frame_cropped_height(frame);
++ const int x0 = frame->crop_left;
++ const int y0 = frame->crop_top;
++ const int size = width * height * 3;
++ uint8_t * dst;
++ int ret;
++
++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++ return ret;
++
++ dst = pkt->data;
++
++ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
++ dst += width * height * 2;
++ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width,
++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2);
++ return 0;
++}
++#endif
++
++
+ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
+- const AVFrame *frame, int *got_packet)
++ const AVFrame *src_frame, int *got_packet)
+ {
+- int ret = av_image_get_buffer_size(frame->format,
+- frame->width, frame->height, 1);
++ int ret;
++ AVFrame * frame = NULL;
+
+- if (ret < 0)
++#if CONFIG_SAND
++ if (av_rpi_is_sand_frame(src_frame)) {
++ ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) :
++ av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) :
++ av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1;
++ *got_packet = (ret == 0);
+ return ret;
++ }
++#endif
++
++ if ((frame = av_frame_clone(src_frame)) == NULL) {
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
++
++ if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0)
++ goto fail;
++
++ ret = av_image_get_buffer_size(frame->format,
++ frame->width, frame->height, 1);
++ if (ret < 0)
++ goto fail;
+
+ if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
+- return ret;
++ goto fail;
+ if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
+ (const uint8_t **)frame->data, frame->linesize,
+ frame->format,
+ frame->width, frame->height, 1)) < 0)
+- return ret;
++ goto fail;
+
+ if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
+ frame->format == AV_PIX_FMT_YUYV422) {
+@@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *av
+ }
+ }
+ pkt->flags |= AV_PKT_FLAG_KEY;
++ av_frame_free(&frame);
+ *got_packet = 1;
+ return 0;
++
++fail:
++ av_frame_free(&frame);
++ *got_packet = 0;
++ return ret;
+ }
+
+ AVCodec ff_rawvideo_encoder = {
+--- /dev/null
++++ b/libavcodec/rpi_hevc_cabac.c
+@@ -0,0 +1,2257 @@
++/*
++ * HEVC CABAC decoding
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#define UNCHECKED_BITSTREAM_READER 1
++
++#include "libavutil/attributes.h"
++#include "libavutil/common.h"
++
++#include "cabac_functions.h"
++#include "rpi_hevc_data.h"
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "rpi_hevc_cabac_fns.h"
++
++#include "libavutil/rpi_sand_fns.h"
++
++// BY22 is probably faster than simple bypass if the processor has
++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
++// x86 has fast int divide
++// Arm doesn't have divide or general fast 64 bit, but does have the multiply
++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
++// Use native divide if we have a fast one - otherwise use mpy 1/x
++// x86 has a fast integer divide - arm doesn't - unsure about other
++// architectures
++#define USE_BY22_DIV ARCH_X86
++
++// Special case blocks with a single significant coeff
++// Decreases the complexity of the code for a common case but increases the
++// code size.
++#define USE_N_END_1 1
++
++#if !USE_BY22_DIV
++// * 1/x @ 32 bits gets us 22 bits of accuracy
++#define CABAC_BY22_PEEK_BITS 22
++#else
++// A real 32-bit divide gets us another bit
++// If we have a 64 bit int & a unit time divider then we should get a lot
++// of bits (55) but that is untested and it is unclear if it would give
++// us a large advantage
++#define CABAC_BY22_PEEK_BITS 23
++#endif
++
++#define CABAC_MAX_BIN 31
++
++
++#if USE_BY22 && !USE_BY22_DIV
++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
++
++static const uint32_t cabac_by22_inv_range[256] = {
++ 0, I(257), I(258), I(259),
++ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
++ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
++ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
++ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
++ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
++ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
++ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
++ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
++ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
++ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
++ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
++ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
++ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
++ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
++ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
++ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
++ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
++ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
++ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
++ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
++ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
++ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
++ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
++ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
++ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
++ I(510), I(511)
++};
++#undef I
++#endif // USE_BY22
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_cabac.h"
++#endif
++
++/**
++ * Number of bins per SyntaxElement.
++ */
++static const int8_t num_bins_in_se[] = {
++ 1, // sao_merge_flag
++ 1, // sao_type_idx
++ 0, // sao_eo_class
++ 0, // sao_band_position
++ 0, // sao_offset_abs
++ 0, // sao_offset_sign
++ 0, // end_of_slice_flag
++ 3, // split_coding_unit_flag
++ 1, // cu_transquant_bypass_flag
++ 3, // skip_flag
++ 3, // cu_qp_delta
++ 1, // pred_mode
++ 4, // part_mode
++ 0, // pcm_flag
++ 1, // prev_intra_luma_pred_mode
++ 0, // mpm_idx
++ 0, // rem_intra_luma_pred_mode
++ 2, // intra_chroma_pred_mode
++ 1, // merge_flag
++ 1, // merge_idx
++ 5, // inter_pred_idc
++ 2, // ref_idx_l0
++ 2, // ref_idx_l1
++ 2, // abs_mvd_greater0_flag
++ 2, // abs_mvd_greater1_flag
++ 0, // abs_mvd_minus2
++ 0, // mvd_sign_flag
++ 1, // mvp_lx_flag
++ 1, // no_residual_data_flag
++ 3, // split_transform_flag
++ 2, // cbf_luma
++ 4, // cbf_cb, cbf_cr
++ 2, // transform_skip_flag[][]
++ 2, // explicit_rdpcm_flag[][]
++ 2, // explicit_rdpcm_dir_flag[][]
++ 18, // last_significant_coeff_x_prefix
++ 18, // last_significant_coeff_y_prefix
++ 0, // last_significant_coeff_x_suffix
++ 0, // last_significant_coeff_y_suffix
++ 4, // significant_coeff_group_flag
++ 44, // significant_coeff_flag
++ 24, // coeff_abs_level_greater1_flag
++ 6, // coeff_abs_level_greater2_flag
++ 0, // coeff_abs_level_remaining
++ 0, // coeff_sign_flag
++ 8, // log2_res_scale_abs
++ 2, // res_scale_sign_flag
++ 1, // cu_chroma_qp_offset_flag
++ 1, // cu_chroma_qp_offset_idx
++};
++
++/**
++ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
++ *
++ * Each entry is the running sum of the num_bins_in_se[] entries that precede
++ * it, so the two tables must be kept in step. The array is sized by the
++ * element count of num_bins_in_se[] (not its byte size) so that it stays
++ * correct even if the element type of that table is ever widened.
++ */
++ static const int elem_offset[sizeof(num_bins_in_se) / sizeof(num_bins_in_se[0])] = {
++ 0, // sao_merge_flag
++ 1, // sao_type_idx
++ 2, // sao_eo_class
++ 2, // sao_band_position
++ 2, // sao_offset_abs
++ 2, // sao_offset_sign
++ 2, // end_of_slice_flag
++ 2, // split_coding_unit_flag
++ 5, // cu_transquant_bypass_flag
++ 6, // skip_flag
++ 9, // cu_qp_delta
++ 12, // pred_mode
++ 13, // part_mode
++ 17, // pcm_flag
++ 17, // prev_intra_luma_pred_mode
++ 18, // mpm_idx
++ 18, // rem_intra_luma_pred_mode
++ 18, // intra_chroma_pred_mode
++ 20, // merge_flag
++ 21, // merge_idx
++ 22, // inter_pred_idc
++ 27, // ref_idx_l0
++ 29, // ref_idx_l1
++ 31, // abs_mvd_greater0_flag
++ 33, // abs_mvd_greater1_flag
++ 35, // abs_mvd_minus2
++ 35, // mvd_sign_flag
++ 35, // mvp_lx_flag
++ 36, // no_residual_data_flag
++ 37, // split_transform_flag
++ 40, // cbf_luma
++ 42, // cbf_cb, cbf_cr
++ 46, // transform_skip_flag[][]
++ 48, // explicit_rdpcm_flag[][]
++ 50, // explicit_rdpcm_dir_flag[][]
++ 52, // last_significant_coeff_x_prefix
++ 70, // last_significant_coeff_y_prefix
++ 88, // last_significant_coeff_x_suffix
++ 88, // last_significant_coeff_y_suffix
++ 88, // significant_coeff_group_flag
++ 92, // significant_coeff_flag
++ 136, // coeff_abs_level_greater1_flag
++ 160, // coeff_abs_level_greater2_flag
++ 166, // coeff_abs_level_remaining
++ 166, // coeff_sign_flag
++ 166, // log2_res_scale_abs
++ 174, // res_scale_sign_flag
++ 176, // cu_chroma_qp_offset_flag
++ 177, // cu_chroma_qp_offset_idx
++};
++
++// Filler initialisation value used for contexts in the first (init_type 0)
++// row that belong to inter-prediction syntax elements.
++// NOTE(review): presumably stands for "context not used" - confirm
++#define CNU 154
++/**
++ * Context initialisation values, indexed by init_type and then by context
++ * index (elem_offset[] of the syntax element plus the bin's context
++ * increment). Each group is labelled with the syntax element it belongs to;
++ * syntax elements that own no context-coded bins have no entries here.
++ */
++static const uint8_t init_values[3][HEVC_CONTEXTS] = {
++ { // sao_merge_flag
++ 153,
++ // sao_type_idx
++ 200,
++ // split_coding_unit_flag
++ 139, 141, 157,
++ // cu_transquant_bypass_flag
++ 154,
++ // skip_flag
++ CNU, CNU, CNU,
++ // cu_qp_delta
++ 154, 154, 154,
++ // pred_mode
++ CNU,
++ // part_mode
++ 184, CNU, CNU, CNU,
++ // prev_intra_luma_pred_mode
++ 184,
++ // intra_chroma_pred_mode
++ 63, 139,
++ // merge_flag
++ CNU,
++ // merge_idx
++ CNU,
++ // inter_pred_idc
++ CNU, CNU, CNU, CNU, CNU,
++ // ref_idx_l0
++ CNU, CNU,
++ // ref_idx_l1
++ CNU, CNU,
++ // abs_mvd_greater0_flag
++ CNU, CNU,
++ // abs_mvd_greater1_flag
++ CNU, CNU,
++ // mvp_lx_flag
++ CNU,
++ // no_residual_data_flag
++ CNU,
++ // split_transform_flag
++ 153, 138, 138,
++ // cbf_luma
++ 111, 141,
++ // cbf_cb, cbf_cr
++ 94, 138, 182, 154,
++ // transform_skip_flag
++ 139, 139,
++ // explicit_rdpcm_flag
++ 139, 139,
++ // explicit_rdpcm_dir_flag
++ 139, 139,
++ // last_significant_coeff_x_prefix
++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
++ 79, 108, 123, 63,
++ // last_significant_coeff_y_prefix
++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
++ 79, 108, 123, 63,
++ // significant_coeff_group_flag
++ 91, 171, 134, 141,
++ // significant_coeff_flag
++ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153,
++ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
++ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
++ 141, 111,
++ // coeff_abs_level_greater1_flag
++ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107,
++ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
++ // coeff_abs_level_greater2_flag
++ 138, 153, 136, 167, 152, 152,
++ // log2_res_scale_abs
++ 154, 154, 154, 154, 154, 154, 154, 154,
++ // res_scale_sign_flag
++ 154, 154,
++ // cu_chroma_qp_offset_flag
++ 154,
++ // cu_chroma_qp_offset_idx
++ 154,
++ },
++ { // sao_merge_flag
++ 153,
++ // sao_type_idx
++ 185,
++ // split_coding_unit_flag
++ 107, 139, 126,
++ // cu_transquant_bypass_flag
++ 154,
++ // skip_flag
++ 197, 185, 201,
++ // cu_qp_delta
++ 154, 154, 154,
++ // pred_mode
++ 149,
++ // part_mode
++ 154, 139, 154, 154,
++ // prev_intra_luma_pred_mode
++ 154,
++ // intra_chroma_pred_mode
++ 152, 139,
++ // merge_flag
++ 110,
++ // merge_idx
++ 122,
++ // inter_pred_idc
++ 95, 79, 63, 31, 31,
++ // ref_idx_l0
++ 153, 153,
++ // ref_idx_l1
++ 153, 153,
++ // abs_mvd_greater0_flag
++ 140, 198,
++ // abs_mvd_greater1_flag
++ 140, 198,
++ // mvp_lx_flag
++ 168,
++ // no_residual_data_flag
++ 79,
++ // split_transform_flag
++ 124, 138, 94,
++ // cbf_luma
++ 153, 111,
++ // cbf_cb, cbf_cr
++ 149, 107, 167, 154,
++ // transform_skip_flag
++ 139, 139,
++ // explicit_rdpcm_flag
++ 139, 139,
++ // explicit_rdpcm_dir_flag
++ 139, 139,
++ // last_significant_coeff_x_prefix
++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95,
++ 94, 108, 123, 108,
++ // last_significant_coeff_y_prefix
++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95,
++ 94, 108, 123, 108,
++ // significant_coeff_group_flag
++ 121, 140, 61, 154,
++ // significant_coeff_flag
++ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153,
++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
++ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
++ 140, 140,
++ // coeff_abs_level_greater1_flag
++ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
++ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
++ // coeff_abs_level_greater2_flag
++ 107, 167, 91, 122, 107, 167,
++ // log2_res_scale_abs
++ 154, 154, 154, 154, 154, 154, 154, 154,
++ // res_scale_sign_flag
++ 154, 154,
++ // cu_chroma_qp_offset_flag
++ 154,
++ // cu_chroma_qp_offset_idx
++ 154,
++ },
++ { // sao_merge_flag
++ 153,
++ // sao_type_idx
++ 160,
++ // split_coding_unit_flag
++ 107, 139, 126,
++ // cu_transquant_bypass_flag
++ 154,
++ // skip_flag
++ 197, 185, 201,
++ // cu_qp_delta
++ 154, 154, 154,
++ // pred_mode
++ 134,
++ // part_mode
++ 154, 139, 154, 154,
++ // prev_intra_luma_pred_mode
++ 183,
++ // intra_chroma_pred_mode
++ 152, 139,
++ // merge_flag
++ 154,
++ // merge_idx
++ 137,
++ // inter_pred_idc
++ 95, 79, 63, 31, 31,
++ // ref_idx_l0
++ 153, 153,
++ // ref_idx_l1
++ 153, 153,
++ // abs_mvd_greater0_flag
++ 169, 198,
++ // abs_mvd_greater1_flag
++ 169, 198,
++ // mvp_lx_flag
++ 168,
++ // no_residual_data_flag
++ 79,
++ // split_transform_flag
++ 224, 167, 122,
++ // cbf_luma
++ 153, 111,
++ // cbf_cb, cbf_cr
++ 149, 92, 167, 154,
++ // transform_skip_flag
++ 139, 139,
++ // explicit_rdpcm_flag
++ 139, 139,
++ // explicit_rdpcm_dir_flag
++ 139, 139,
++ // last_significant_coeff_x_prefix
++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111,
++ 79, 108, 123, 93,
++ // last_significant_coeff_y_prefix
++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111,
++ 79, 108, 123, 93,
++ // significant_coeff_group_flag
++ 121, 140, 61, 154,
++ // significant_coeff_flag
++ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153,
++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
++ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
++ 140, 140,
++ // coeff_abs_level_greater1_flag
++ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
++ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
++ // coeff_abs_level_greater2_flag
++ 107, 167, 91, 107, 107, 167,
++ // log2_res_scale_abs
++ 154, 154, 154, 154, 154, 154, 154, 154,
++ // res_scale_sign_flag
++ 154, 154,
++ // cu_chroma_qp_offset_flag
++ 154,
++ // cu_chroma_qp_offset_idx
++ 154,
++ },
++};
++
++// Scan-order lookup tables.
++// The *_x / *_y tables give the x and y coordinate of the n-th position of
++// a scan; the *_inv tables give the scan position of a coordinate.
++// NOTE(review): the _inv tables look to be indexed [y][x] - confirm against
++// the users of these tables.
++
++// Trivial scan of a single position
++static const uint8_t scan_1x1[1] = {
++ 0,
++};
++
++// Horizontal (row by row) scan of a 2x2 area: x coordinates
++static const uint8_t horiz_scan2x2_x[4] = {
++ 0, 1, 0, 1,
++};
++
++// Horizontal (row by row) scan of a 2x2 area: y coordinates
++static const uint8_t horiz_scan2x2_y[4] = {
++ 0, 0, 1, 1
++};
++
++// Horizontal (row by row) scan of a 4x4 area: x coordinates
++static const uint8_t horiz_scan4x4_x[16] = {
++ 0, 1, 2, 3,
++ 0, 1, 2, 3,
++ 0, 1, 2, 3,
++ 0, 1, 2, 3,
++};
++
++// Horizontal (row by row) scan of a 4x4 area: y coordinates
++static const uint8_t horiz_scan4x4_y[16] = {
++ 0, 0, 0, 0,
++ 1, 1, 1, 1,
++ 2, 2, 2, 2,
++ 3, 3, 3, 3,
++};
++
++// Scan position of each coordinate of an 8x8 area for the horizontal scan.
++// Positions are numbered one 4x4 quadrant at a time (0-15, 16-31, 32-47,
++// 48-63) with rows of four consecutive positions inside each quadrant.
++static const uint8_t horiz_scan8x8_inv[8][8] = {
++ { 0, 1, 2, 3, 16, 17, 18, 19, },
++ { 4, 5, 6, 7, 20, 21, 22, 23, },
++ { 8, 9, 10, 11, 24, 25, 26, 27, },
++ { 12, 13, 14, 15, 28, 29, 30, 31, },
++ { 32, 33, 34, 35, 48, 49, 50, 51, },
++ { 36, 37, 38, 39, 52, 53, 54, 55, },
++ { 40, 41, 42, 43, 56, 57, 58, 59, },
++ { 44, 45, 46, 47, 60, 61, 62, 63, },
++};
++
++// Diagonal scan of a 2x2 area: x coordinates
++static const uint8_t diag_scan2x2_x[4] = {
++ 0, 0, 1, 1,
++};
++
++// Diagonal scan of a 2x2 area: y coordinates
++static const uint8_t diag_scan2x2_y[4] = {
++ 0, 1, 0, 1,
++};
++
++// Scan position of each coordinate of a 2x2 area for the diagonal scan
++static const uint8_t diag_scan2x2_inv[2][2] = {
++ { 0, 2, },
++ { 1, 3, },
++};
++
++// Scan position of each coordinate of a 4x4 area for the diagonal scan
++static const uint8_t diag_scan4x4_inv[4][4] = {
++ { 0, 2, 5, 9, },
++ { 1, 4, 8, 12, },
++ { 3, 7, 11, 14, },
++ { 6, 10, 13, 15, },
++};
++
++// Scan position of each coordinate of an 8x8 area for the diagonal scan
++static const uint8_t diag_scan8x8_inv[8][8] = {
++ { 0, 2, 5, 9, 14, 20, 27, 35, },
++ { 1, 4, 8, 13, 19, 26, 34, 42, },
++ { 3, 7, 12, 18, 25, 33, 41, 48, },
++ { 6, 11, 17, 24, 32, 40, 47, 53, },
++ { 10, 16, 23, 31, 39, 46, 52, 57, },
++ { 15, 22, 30, 38, 45, 51, 56, 60, },
++ { 21, 29, 37, 44, 50, 55, 59, 62, },
++ { 28, 36, 43, 49, 54, 58, 61, 63, },
++};
++
++
++// A pair of precomputed array offsets for one (x, y) position of a 4x4 scan.
++typedef struct
++{
++ uint16_t coeff; // x + (y << t): offset in an array with a row pitch of 1 << t
++ uint16_t scale; // Same position reduced to a grid of at most 8x8 (see XYT_S)
++} xy_off_t;
++
++// Offset of (x, y) in an array whose row pitch is (1 << t)
++#define XYT_C(x,y,t) ((x) + ((y) << (t)))
++// log2 size of the scale grid: t limited to a maximum of 3
++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
++// Right shift needed to map a coordinate onto the scale grid (0 for t <= 3)
++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
++// Offset of (x, y) in the scale grid after both coordinates are shifted down
++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
++
++// Initialiser for one xy_off_t
++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
++
++// 16 xy_off_t initialisers for a 4x4 area visited along its diagonals
++#define OFF_DIAG(t) {\
++ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
++ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
++ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
++ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
++}
++
++// 16 xy_off_t initialisers for a 4x4 area visited row by row (x fastest)
++#define OFF_HORIZ(t) {\
++ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
++ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
++ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
++ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
++}
++
++// 16 xy_off_t initialisers for a 4x4 area visited column by column (y fastest)
++#define OFF_VERT(t) {\
++ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
++ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
++ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
++ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
++}
++
++// Offsets indexed by [scan order][t - 2][position in scan], where the scan
++// orders are diagonal, horizontal, vertical and t runs from 2 to 5.
++// NOTE(review): t is presumably log2 of the transform size - confirm
++static const xy_off_t off_xys[3][4][16] =
++{
++ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
++ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
++ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
++};
++
++
++// Helper fns
++#ifndef hevc_mem_bits32
++// Fetch 32 bits from buf starting at bit position `offset`.
++// A 32-bit word is read (with AV_RB32) from the byte that holds the requested
++// bit and shifted left by the bit position within that byte, so the low
++// (offset & 7) bits of the result are always zero.
++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
++{
++    const uint8_t * const src = (const uint8_t *)buf + (offset >> 3);
++    const unsigned int bit_in_byte = offset & 7;
++
++    return AV_RB32(src) << bit_in_byte;
++}
++#endif
++
++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
++#define hevc_clz32 hevc_clz32_builtin
++// Count the leading zero bits of a 32-bit value with the compiler builtin.
++// NOTE(review): GCC documents __builtin_clz(0) as undefined, so callers are
++// expected to pass a non-zero value - confirm
++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
++{
++    // __builtin_clz works on ints, so if int is wider than 32 bits the
++    // surplus leading bits have to be taken off the count
++    const unsigned int surplus_bits = sizeof(int) * 8 - 32;
++
++    return __builtin_clz(x) - surplus_bits;
++}
++#endif
++
++// It is unlikely that we will ever need this but include for completeness
++#ifndef hevc_clz32
++// Portable count of the leading zero bits of a 32-bit value.
++// Narrows down the position of the top set bit by testing the top 16, 8, 4
++// and 2 bits in turn and finally the top bit itself.
++// Returns 0 if bit 31 is set and 31 if only bit 0 is set; an input of 0 also
++// gives 31.
++static inline unsigned int hevc_clz32(unsigned int x)
++{
++    static const uint32_t top_mask[4] = { 0xffff0000, 0xff000000, 0xf0000000, 0xc0000000 };
++    static const uint8_t  width[4]    = { 16, 8, 4, 2 };
++    unsigned int zeros = 1;
++    unsigned int step;
++
++    for (step = 0; step != 4; ++step) {
++        if ((x & top_mask[step]) != 0)
++            continue;
++        // The top width[step] bits are clear: count them and shift them out
++        zeros += width[step];
++        x <<= width[step];
++    }
++    // The count started at 1 so take that back off if the top bit is set
++    return zeros - ((x >> 31) & 1);
++}
++#endif
++
++// Test whether the CABAC decoder has consumed too much input.
++// Returns 1 once the read pointer is 4 or more bytes beyond the end of the
++// bytestream and 0 otherwise. Asserts that the read pointer is not before
++// the start of the bytestream.
++static inline int cabac_overflow(const CABACContext * const cc)
++{
++    av_assert0(cc->bytestream >= cc->bytestream_start);
++
++    if (cc->bytestream < cc->bytestream_end + 4)
++        return 0;
++    return 1;
++}
++
++// Exported form of cabac_overflow() that operates on the CABAC decoder held
++// in the local context. Returns non-zero if the decoder has overrun its input.
++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc)
++{
++    const CABACContext * const cc = &lc->cc;
++
++    return cabac_overflow(cc);
++}
++
++#if !USE_BY22
++// If no by22 then _by22 functions will revert to normal and so _peek/_flush
++// will no longer be called but the setup calls will still exist and we want
++// to null them out
++// (both macros expand to nothing, so their argument is never evaluated)
++#define bypass_start(s)
++#define bypass_finish(s)
++#else
++// Use BY22 for residual bypass block
++
++// Enter / leave a bypass block: forward to the by22 start / finish functions
++#define bypass_start(cc) get_cabac_by22_start(cc)
++#define bypass_finish(cc) get_cabac_by22_finish(cc)
++
++// BY22 notes that bypass is simply a divide into the bitstream and so we
++// can peek out large quantities of bits at once and treat the result as if
++// it was VLC. In many cases this will lead to O(1) processing rather than
++// O(n) though the setup and teardown is sufficiently expensive that it is
++// only worth using if we expect to be dealing with more than a few bits
++// The definition of "a few bits" will vary from platform to platform but
++// tests on ARM show that it probably isn't worth it for a single coded
++// residual, but is for >1 - it also seems likely that if there are
++// more residuals then they are likely to be bigger and this will make the
++// O(1) nature of the code more worthwhile.
++
++
++// Bypass block start
++// Must be called before _by22_peek is used as it sets the CABAC environment
++// into the correct state. _by22_finish must be called to return to 'normal'
++// (i.e. non-bypass) cabac decoding
++//
++// On return:
++//   c->low        holds the realigned decode value merged with bits fetched
++//                 from the bytestream
++//   c->by22.bits  holds the number of trailing zero bits that c->low had on
++//                 entry
++//   c->bytestream has been moved back by CABAC_BITS / 8 bytes
++//   c->range      (only if !USE_BY22_DIV) has been saved in c->by22.range and
++//                 replaced by cabac_by22_inv_range[range & 0xff]
++#ifndef get_cabac_by22_start
++static inline void get_cabac_by22_start(CABACContext * const c)
++{
++ // NOTE(review): __builtin_ctz(0) is undefined so this relies on c->low
++ // being non-zero on entry - confirm
++ const unsigned int bits = __builtin_ctz(c->low);
++ // Next 32 bits of the bytestream at the current read position
++ const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
++ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
++#if !USE_BY22_DIV
++ // Table value that stands in for the range while in the bypass block
++ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
++#endif
++
++ c->bytestream -= (CABAC_BITS / 8);
++ c->by22.bits = bits;
++#if !USE_BY22_DIV
++ // Keep the real range so that _by22_finish can restore it
++ c->by22.range = c->range;
++ c->range = inv;
++#endif
++ c->low = x;
++}
++#endif
++
++// Bypass block finish
++// Must be called at the end of the bypass block to return to normal operation
++//
++// Advances c->bytestream by the whole CABAC_BITS-sized units counted in
++// c->by22.bits plus CABAC_BITS / 8 bytes, rebuilds c->low from the by22 form
++// and (if !USE_BY22_DIV) restores c->range from c->by22.range.
++static inline void get_cabac_by22_finish(CABACContext * const c)
++{
++ unsigned int used = c->by22.bits;
++ // Whole CABAC_BITS-sized units consumed, expressed in bytes
++ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
++ // Bits consumed beyond the last whole unit
++ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
++
++ c->bytestream += bytes_used + (CABAC_BITS / 8);
++ // Shift low back down, drop the partially consumed bits and set a single
++ // 1 bit directly below the bits that are kept
++ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
++#if !USE_BY22_DIV
++ c->range = c->by22.range;
++#endif
++}
++
++// Peek bypass bits
++// _by22_start must be called before _by22_peek is called and _by22_flush
++// must be called afterwards to flush any used bits
++// The actual number of valid bits returned is
++// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
++// will be at least 22 which should be long enough for any prefix or suffix
++// though probably not long enough for the worst case combination
++//
++// The peeked bits are returned left-justified (first bit in bit 31).
++// Does not modify the context.
++#ifndef get_cabac_by22_peek
++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
++{
++#if USE_BY22_DIV
++ // Direct form: divide low by the range
++ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
++#else
++ uint32_t x = c->low & ~1U;
++ // c->range holds the inverse-range table value put there by _by22_start
++ const uint32_t inv = c->range;
++
++ // Multiply and keep the top 32 bits of the 64-bit product; a table value
++ // of 0 means x is used as it stands
++ if (inv != 0)
++ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
++
++ return x << 1;
++#endif
++}
++#endif
++
++// Flush bypass bits peeked by _by22_peek
++// Flush n bypass bits. n must be >= 1 to guarantee correct operation
++// (with n == 0 the shift by (32 - n) below would be a shift by the full
++// width of val)
++// val is an unmodified copy of whatever _by22_peek returned
++#ifndef get_cabac_by22_flush
++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
++{
++ // Subtract the bits used & reshift up to the top of the word
++ // (val >> (32 - n)) is the value of the n bits being flushed; the range
++ // used is the real one, which lives in c->by22.range unless USE_BY22_DIV
++#if USE_BY22_DIV
++ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
++#else
++ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
++#endif
++
++ // and refill lower bits
++ // We will probably OR over some existing bits but that doesn't matter
++ c->by22.bits += n;
++ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
++}
++#endif
++
++#endif // USE_BY22
++
++
++// Copy the local context's CABAC context states (HEVC_CONTEXTS bytes) and
++// Rice statistics (4 bytes) into the save area pointed to by s->cabac_save.
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc)
++{
++    memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS);
++    memcpy(s->cabac_save->rice, lc->stat_coeff, 4);
++}
++
++// Inverse of ff_hevc_rpi_save_states(): restore the local context's CABAC
++// context states (HEVC_CONTEXTS bytes) and Rice statistics (4 bytes) from the
++// save area pointed to by s->cabac_save.
++static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS);
++    memcpy(lc->stat_coeff, s->cabac_save->rice, 4);
++}
++
++// Start the CABAC decoder on the data that follows the current position of
++// the local context's bit reader.
++// One bit is skipped and the reader is then aligned before the decoder is
++// pointed at the remaining bytes. Returns whatever ff_init_cabac_decoder()
++// returns.
++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc)
++{
++    GetBitContext * const gb = &lc->gb;
++    const uint8_t *payload;
++    int payload_bytes;
++
++    skip_bits(gb, 1);
++    align_get_bits(gb);
++
++    payload       = gb->buffer + get_bits_count(gb) / 8;
++    payload_bytes = (get_bits_left(gb) + 7) / 8;   // round up to whole bytes
++
++    return ff_init_cabac_decoder(&lc->cc, payload, payload_bytes);
++}
++
++// Set every CABAC context state in lc from init_values[] and the slice QP,
++// and zero the four Rice statistics.
++// The init_values row is 2 - slice_type, with rows 1 and 2 exchanged when
++// cabac_init_flag is set on a non-I slice.
++static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    const int qp = av_clip(s->sh.slice_qp, 0, 51);
++    int init_type = 2 - s->sh.slice_type;
++    int ctx;
++
++    if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I)
++        init_type ^= 3;
++
++    for (ctx = 0; ctx < HEVC_CONTEXTS; ctx++) {
++        const int init_value = init_values[init_type][ctx];
++        // The top nibble gives the slope, the bottom nibble the offset
++        const int slope  = (init_value >> 4) * 5 - 45;
++        const int offset = ((init_value & 15) << 3) - 16;
++        int pre = 2 * (((slope * qp) >> 4) + offset) - 127;
++
++        // Ones-complement negative values, then limit to a maximum of 125
++        pre ^= pre >> 31;
++        if (pre > 124)
++            pre = 124 + (pre & 1);
++        lc->cabac_state[ctx] = pre;
++    }
++
++    for (ctx = 0; ctx < 4; ctx++)
++        lc->stat_coeff[ctx] = 0;
++}
++
++// Prepare the CABAC context states for the next CTB.
++//  * If an init was requested (lc->cabac_init_req == 1) or ctb_flags has
++//    CTB_TS_FLAGS_CIREQ set then the states are initialised from scratch.
++//  * Otherwise, if ctb_flags has CTB_TS_FLAGS_CLOAD set then the saved states
++//    are reloaded.
++// In both of those cases qPy_pred is reset to the slice QP. The init request
++// is always cleared.
++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags)
++{
++    const int full_init = lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0;
++    const int reload    = !full_init && (ctb_flags & CTB_TS_FLAGS_CLOAD) != 0;
++
++    if (full_init || reload) {
++        lc->qPy_pred = s->sh.slice_qp;
++        if (full_init)
++            cabac_init_state(s, lc);
++        else
++            load_states(s, lc);
++    }
++    lc->cabac_init_req = 0;
++}
++
++// Decode one context-coded bin using context `ctx` of the local context.
++// Relies on a variable called lc being in scope at the point of use.
++#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx))
++
++// Out-of-line entry point for decoding a single context-coded bin.
++// Returns the result of get_cabac_inline() for the given context state.
++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state)
++{
++    const int bin = get_cabac_inline(c, state);
++
++    return bin;
++}
++
++// Out-of-line entry point for decoding a terminate bin.
++// Returns the result of get_cabac_terminate() unchanged.
++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c)
++{
++    const int rv = get_cabac_terminate(c);
++
++    return rv;
++}
++
++// Decode sao_type_idx: one context-coded bin followed, if that bin is set, by
++// one bypass bin.
++// Returns 0 if the first bin is clear, otherwise SAO_EDGE if the bypass bin
++// is set and SAO_BAND if it is not.
++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc)
++{
++    if (GET_CABAC_LC(elem_offset[SAO_TYPE_IDX]) == 0)
++        return 0;
++
++    return get_cabac_bypass(&lc->cc) ? SAO_EDGE : SAO_BAND;
++}
++
++// Decode sao_band_position: a 5-bit value read as bypass bins, most
++// significant bit first.
++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc)
++{
++    int value = 0;
++    int bits_left = 5;
++
++    while (bits_left-- > 0)
++        value = (value << 1) | get_cabac_bypass(&lc->cc);
++    return value;
++}
++
++// Decode sao_offset_abs: a unary count of set bypass bins, cut short once
++// the maximum of (1 << (min(bit_depth, 10) - 5)) - 1 has been reached.
++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    const int max_abs = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
++    int count;
++
++    for (count = 0; count < max_abs; count++) {
++        if (!get_cabac_bypass(&lc->cc))
++            break;
++    }
++    return count;
++}
++
++// Decode sao_offset_sign: a single bypass bin, returned as read.
++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc)
++{
++    const int sign = get_cabac_bypass(&lc->cc);
++
++    return sign;
++}
++
++// Decode sao_eo_class: a 2-bit value read as bypass bins, most significant
++// bit first.
++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc)
++{
++    const int hi = get_cabac_bypass(&lc->cc);
++    const int lo = get_cabac_bypass(&lc->cc);
++
++    return (hi << 1) | lo;
++}
++
++// Decode a signed CU QP delta.
++// The magnitude is coded as up to 5 context-coded bins (the first with its
++// own context, the rest sharing the next context). If all 5 are set it is
++// extended by a bypass-coded part: a run of set bins, each adding 1 << k,
++// followed by as many plain bits as the run was long. A final bypass bin
++// gives the sign. Returns 0 straight away if the first bin is clear.
++// Hitting the CABAC_MAX_BIN limit on the run length is not reported.
++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc)
++{
++    uint8_t * const ctx = lc->cabac_state + HEVC_BIN_CU_QP_DELTA;
++    int magnitude = 1;
++
++    if (get_cabac(&lc->cc, ctx) == 0)
++        return 0;
++
++    while (magnitude < 5 && get_cabac(&lc->cc, ctx + 1) != 0)
++        magnitude++;
++
++    if (magnitude >= 5) {
++        unsigned int k;
++
++        // Run of set bins
++        for (k = 0; k < CABAC_MAX_BIN; k++) {
++            if (!get_cabac_bypass(&lc->cc))
++                break;
++            magnitude += 1 << k;
++        }
++
++        // k plain bits, most significant first
++        while (k--)
++            magnitude += get_cabac_bypass(&lc->cc) << k;
++    }
++    return get_cabac_bypass(&lc->cc) ? -magnitude : magnitude;
++}
++
++// Decode cu_chroma_qp_offset_idx: a truncated unary code of context-coded
++// bins that all use the same context.
++//
++// The largest codable value (cMax) is chroma_qp_offset_list_len_minus1, so
++// no more than that many bins may be read: once cMax set bins have been seen
++// the code is complete without a terminating clear bin. The limit is
++// additionally held to 5, the largest value that
++// chroma_qp_offset_list_len_minus1 is allowed to take.
++// (The limit used to be FFMAX(5, ...), which is always 5 for a valid PPS and
++// so could read bins that were never coded when the list is shorter.)
++//
++// Returns the decoded index, in the range 0..cMax.
++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    const int c_max = FFMIN(5, s->ps.pps->chroma_qp_offset_list_len_minus1);
++    int i = 0;
++
++    while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
++        i++;
++
++    return i;
++}
++
++// Decode part_mode for a coding block of size (1 << log2_cb_size).
++// The trailing comments give the bin string that leads to each result. Which
++// strings are possible depends on whether the block is of the minimum CB
++// size, on the prediction mode and on whether AMP is enabled.
++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size)
++{
++    const int ctx = elem_offset[PART_MODE];
++
++    if (GET_CABAC_LC(ctx))                                           // 1
++        return PART_2Nx2N;
++
++    if (log2_cb_size == s->ps.sps->log2_min_cb_size) {
++        if (lc->cu.pred_mode == MODE_INTRA)                          // 0
++            return PART_NxN;
++        if (GET_CABAC_LC(ctx + 1))                                   // 01
++            return PART_2NxN;
++        if (log2_cb_size == 3)                                       // 00
++            return PART_Nx2N;
++        return GET_CABAC_LC(ctx + 2) ? PART_Nx2N : PART_NxN;         // 001 : 000
++    }
++
++    if (!s->ps.sps->amp_enabled_flag)
++        return GET_CABAC_LC(ctx + 1) ? PART_2NxN : PART_Nx2N;        // 01 : 00
++
++    if (GET_CABAC_LC(ctx + 1)) {                                     // 01X, 01XX
++        if (GET_CABAC_LC(ctx + 3))                                   // 011
++            return PART_2NxN;
++        return get_cabac_bypass(&lc->cc) ? PART_2NxnD : PART_2NxnU;  // 0101 : 0100
++    }
++
++    if (GET_CABAC_LC(ctx + 3))                                       // 001
++        return PART_Nx2N;
++    return get_cabac_bypass(&lc->cc) ? PART_nRx2N : PART_nLx2N;      // 0001 : 0000
++}
++
++// Decode mpm_idx: a unary count of set bypass bins with a maximum of 2.
++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc)
++{
++    int idx;
++
++    for (idx = 0; idx < 2; idx++) {
++        if (!get_cabac_bypass(&lc->cc))
++            break;
++    }
++    return idx;
++}
++
++// Decode rem_intra_luma_pred_mode: a 5-bit value read as bypass bins, most
++// significant bit first.
++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc)
++{
++    int value = 0;
++    int bits_left = 5;
++
++    while (bits_left-- > 0)
++        value = (value << 1) | get_cabac_bypass(&lc->cc);
++    return value;
++}
++
++// Decode intra_chroma_pred_mode.
++// Returns 4 if the leading context-coded bin is clear, otherwise the 2-bit
++// value (0..3) given by the two bypass bins that follow, most significant
++// bit first.
++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc)
++{
++    int hi, lo;
++
++    if (!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE]))
++        return 4;
++
++    hi = get_cabac_bypass(&lc->cc);
++    lo = get_cabac_bypass(&lc->cc);
++    return (hi << 1) | lo;
++}
++
++// Decode merge_idx: a unary code whose first bin is context-coded and whose
++// remaining bins are bypass-coded, limited to max_num_merge_cand - 1.
++// Note that the first bin is read even if that limit is 0.
++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    int idx = GET_CABAC_LC(elem_offset[MERGE_IDX]);
++
++    if (idx == 0)
++        return 0;
++
++    while (idx < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc))
++        idx++;
++    return idx;
++}
++
++// Decode inter_pred_idc for a prediction block of nPbW x nPbH.
++// Unless nPbW + nPbH == 12, a first bin (context selected by lc->ct_depth)
++// is read and PRED_BI returned if it is set. In all other cases the result
++// is the value of a bin decoded with the fifth inter_pred_idc context.
++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH)
++{
++    const int ctx = elem_offset[INTER_PRED_IDC];
++
++    if (nPbW + nPbH != 12 && GET_CABAC_LC(ctx + lc->ct_depth))
++        return PRED_BI;
++
++    return GET_CABAC_LC(ctx + 4);
++}
++
++// Decode a reference index: a unary code limited to num_ref_idx_lx - 1.
++// The first two bins are context-coded (one context each); any further bins
++// are bypass-coded. The REF_IDX_L0 contexts are used whichever list the
++// index is for.
++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx)
++{
++    const int last = num_ref_idx_lx - 1;
++    const int ctx_limit = FFMIN(last, 2);
++    int idx = 0;
++
++    while (idx < ctx_limit && GET_CABAC_LC(elem_offset[REF_IDX_L0] + idx))
++        idx++;
++
++    // Only carry on into the bypass part if both context bins were set
++    if (idx != 2)
++        return idx;
++
++    while (idx < last && get_cabac_bypass(&lc->cc))
++        idx++;
++    return idx;
++}
++
++// Decode abs_mvd_greater0_flag using the first context at its offset.
++static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc)
++{
++    const int ctx = elem_offset[ABS_MVD_GREATER0_FLAG];
++
++    return GET_CABAC_LC(ctx);
++}
++
++// Decode abs_mvd_greater1_flag using the second context at its offset.
++static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc)
++{
++    const int ctx = elem_offset[ABS_MVD_GREATER1_FLAG] + 1;
++
++    return GET_CABAC_LC(ctx);
++}
++
++#if !USE_BY22
++// Decode the bypass-coded remainder of a motion vector difference together
++// with its sign.
++// Starting from a value of 2 and k = 1: a run of set bins each adds 1 << k,
++// then k plain bits are added and finally the result of
++// get_cabac_bypass_sign() applied to the negated value is returned.
++// Logs an error and returns 0 if the run reaches CABAC_MAX_BIN.
++static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc)
++{
++    int magnitude = 2;
++    int k;
++
++    for (k = 1; k < CABAC_MAX_BIN; k++) {
++        if (!get_cabac_bypass(&lc->cc))
++            break;
++        magnitude += 1U << k;
++    }
++    if (k == CABAC_MAX_BIN) {
++        av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
++        return 0;
++    }
++
++    while (k--)
++        magnitude += get_cabac_bypass(&lc->cc) << k;
++    return get_cabac_bypass_sign(&lc->cc, -magnitude);
++}
++#endif
++
++// Decode mvd_sign_flag: the result of get_cabac_bypass_sign() applied to -1.
++static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc)
++{
++    const int signed_one = get_cabac_bypass_sign(&lc->cc, -1);
++
++    return signed_one;
++}
++
++// Decode transform_skip_flag; c_idx_nz picks one of the element's two
++// contexts.
++static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
++{
++    const int ctx = elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz;
++
++    return GET_CABAC_LC(ctx);
++}
++
++// Decode explicit_rdpcm_flag; c_idx_nz picks one of the element's two
++// contexts.
++static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
++{
++    const int ctx = elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz;
++
++    return GET_CABAC_LC(ctx);
++}
++
++// Decode explicit_rdpcm_dir_flag; c_idx_nz picks one of the element's two
++// contexts.
++static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
++{
++    const int ctx = elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz;
++
++    return GET_CABAC_LC(ctx);
++}
++
++
++// Decode log2_res_scale_abs: a unary count of set context-coded bins with a
++// maximum of 4. Each bin position has its own context and idx selects which
++// group of 4 contexts is used.
++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) {
++    const int ctx = elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx;
++    int n;
++
++    for (n = 0; n < 4; n++) {
++        if (!GET_CABAC_LC(ctx + n))
++            break;
++    }
++
++    return n;
++}
++
++// Decode last_significant_coeff_x_prefix followed by
++// last_significant_coeff_y_prefix.
++// Both are unary counts of set context-coded bins limited to
++// (log2_size << 1) - 1. The context for bin n is
++// (n >> ctx_shift) + ctx_offset, where the shift and offset depend on
++// log2_size and on whether c_idx_nz is set.
++// The results are written to *last_scx_prefix and *last_scy_prefix.
++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz,
++    int log2_size, int *last_scx_prefix, int *last_scy_prefix)
++{
++    const int max = (log2_size << 1) - 1;
++    const int ctx_offset = c_idx_nz ? 15 : 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
++    const int ctx_shift  = c_idx_nz ? log2_size - 2 : (log2_size + 1) >> 2;
++    int n;
++
++    for (n = 0; n < max; n++) {
++        if (!GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (n >> ctx_shift) + ctx_offset))
++            break;
++    }
++    *last_scx_prefix = n;
++
++    for (n = 0; n < max; n++) {
++        if (!GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (n >> ctx_shift) + ctx_offset))
++            break;
++    }
++    *last_scy_prefix = n;
++}
++
++// Decode a last_significant_coeff suffix: (prefix >> 1) - 1 bypass bins read
++// most significant bit first.
++// The first bin is read unconditionally, so at least one bin is always
++// consumed whatever the prefix.
++static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc,
++    int last_significant_coeff_prefix)
++{
++    const int length = (last_significant_coeff_prefix >> 1) - 1;
++    int value = get_cabac_bypass(&lc->cc);
++    int remaining = length - 1;
++
++    while (remaining-- > 0)
++        value = (value << 1) | get_cabac_bypass(&lc->cc);
++    return value;
++}
++
++// Decode significant_coeff_group_flag.
++// The context increment is 1 if ctx_cg is non-zero (else 0) plus
++// c_idx_nz << 1.
++static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg)
++{
++    const int inc = (ctx_cg != 0) + (c_idx_nz << 1);
++
++    return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
++}
++
++// Decode one significant_coeff_flag with the context at the given offset
++// from the start of the significant_coeff_flag contexts.
++static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset)
++{
++    const int ctx = elem_offset[SIGNIFICANT_COEFF_FLAG] + offset;
++
++    return GET_CABAC_LC(ctx);
++}
++
++#if !USE_BY22
++// Without by22 there is no separate bypass-block decoder, so the _bypass
++// name maps straight onto the ordinary bit-at-a-time decoder
++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
++#endif
++
++
++#ifndef coeff_abs_level_remaining_decode_bypass
++// Decode coeff_abs_level_remaining from within a by22 bypass block by
++// peeking a word of bypass bits and treating it as a variable length code.
++//
++// The prefix is the run of leading 1 bits in the peeked word.
++//   prefix < 3 : the value is (prefix << rice_param) plus rice_param suffix
++//                bits
++//   otherwise  : the suffix has (prefix - 3 + rice_param) bits. If prefix
++//                and suffix both fit in one peek they are taken together;
++//                if not then the prefix is flushed and the suffix is taken
++//                from a second peek
++// Returns the decoded value. All bits used have been flushed on return.
++static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param)
++{
++ uint32_t y;
++ unsigned int prefix;
++ unsigned int last_coeff_abs_level_remaining;
++ unsigned int n; // Number of bits to flush at the end
++
++ y = get_cabac_by22_peek(c);
++ // Leading ones of y == leading zeros of ~y
++ prefix = hevc_clz32(~y);
++ // y << prefix will always have top bit 0
++
++ if (prefix < 3) {
++ const unsigned int suffix = (y << prefix) >> (31 - rice_param);
++ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
++ n = prefix + 1 + rice_param;
++ }
++ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
++ {
++ // Setting the top bit (which holds the 0 that ended the prefix) supplies
++ // the implicit leading 1 of the suffix
++ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
++
++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
++ n = prefix * 2 + rice_param - 2;
++ }
++ else {
++ unsigned int suffix;
++
++ // Too long for a single peek: drop the prefix bits and peek again
++ get_cabac_by22_flush(c, prefix, y);
++ y = get_cabac_by22_peek(c);
++
++ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
++ n = prefix + rice_param - 2;
++ }
++
++ get_cabac_by22_flush(c, n, y);
++
++ return last_coeff_abs_level_remaining;
++}
++#endif
++
++// Decode coeff_abs_level_remaining one bypass bin at a time.
++// The prefix is a unary count of set bins.
++//   prefix < 3 : the value is (prefix << rc_rice_param) plus rc_rice_param
++//                suffix bits
++//   otherwise  : the value is (((1 << (prefix - 3)) + 2) << rc_rice_param)
++//                plus (prefix - 3 + rc_rice_param) suffix bits
++// Returns 0, without reporting an error, if the prefix reaches CABAC_MAX_BIN.
++static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param)
++{
++    int prefix = 0;
++    int suffix = 0;
++    int suffix_bits;
++    int base;
++
++    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
++        prefix++;
++    if (prefix == CABAC_MAX_BIN)
++        return 0;
++
++    if (prefix < 3) {
++        suffix_bits = rc_rice_param;
++        base = prefix << rc_rice_param;
++    } else {
++        const int prefix_minus3 = prefix - 3;
++
++        suffix_bits = prefix_minus3 + rc_rice_param;
++        base = ((1 << prefix_minus3) + 3 - 1) << rc_rice_param;
++    }
++
++    while (suffix_bits-- > 0)
++        suffix = (suffix << 1) | get_cabac_bypass(c);
++
++    return base + suffix;
++}
++
++#if !USE_BY22
++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
++// Read nb sign flags as bypass bins and return them left-justified: the
++// first flag read ends up in bit 31.
++// nb must be >= 1 as the final shift is by (32 - nb).
++static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb)
++{
++    uint32_t flags = 0;
++    unsigned int remaining = nb;
++
++    while (remaining != 0) {
++        flags = (flags << 1) | get_cabac_bypass(c);
++        --remaining;
++    }
++
++    return flags << (32 - nb);
++}
++#endif
++
++#ifndef coeff_sign_flag_decode_bypass
++// Read nb sign flags from within a by22 bypass block.
++// The flags are returned left-justified (first flag in bit 31) with all the
++// bits below them cleared. The nb bits are flushed from the decoder.
++static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb)
++{
++    const uint32_t peeked = get_cabac_by22_peek(c);
++    const uint32_t keep_mask = ~(0xffffffffU >> nb);
++
++    get_cabac_by22_flush(c, nb, peeked);
++    return peeked & keep_mask;
++}
++#endif
++
++
++#ifndef get_cabac_greater1_bits
++// Decode n context-coded bins and return them packed into a word with the
++// first bin decoded in the most significant position (bit n - 1).
++// The context used for each bin, relative to state0, is 0 once any earlier
++// bin has been set; until then it is the bin's position plus 1, limited to
++// a maximum of 3.
++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
++    uint8_t * const state0)
++{
++    unsigned int bits = 0;
++    unsigned int i;
++
++    for (i = 0; i != n; ++i) {
++        unsigned int ctx_inc;
++
++        if (bits != 0)
++            ctx_inc = 0;
++        else if (i < 3)
++            ctx_inc = i + 1;
++        else
++            ctx_inc = 3;
++
++        bits = (bits << 1) | get_cabac(c, state0 + ctx_inc);
++    }
++    return bits;
++}
++#endif
++
++
++// Decode the greater1 flags (and a single greater2 flag) for up to the
++// first 8 of n_end coefficients and set up their base levels.
++//
++// N.B. levels returned are the values assuming coeff_abs_level_remaining
++// is uncoded, so 1 must be added if it is coded. sum_abs also reflects
++// this version of events.
++//
++// lc                  Local context (CABAC decoder and context states)
++// n_end               Number of coefficients being processed
++// levels              Out: level of each coefficient. Entries 0..7 are
++//                     always written; entries 8..n_end-1 are set to 0
++// pprev_subset_coded  Out: 1 if any greater1 flag was set, otherwise 0
++// psum                Out: min(n_end, 8), plus 1 if any greater1 flag was set
++// idx0_gt1            Index in cabac_state of the first greater1 context
++// idx_gt2             Index in cabac_state of the greater2 context
++//
++// Returns a left-justified bitmap (first coefficient in bit 31) with a bit
++// set for every coefficient that the caller should treat as having a
++// coeff_abs_level_remaining: greater1 flags that were set (other than the
++// first if its greater2 flag was clear) plus every coefficient beyond the
++// first 8.
++static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels,
++ int * const pprev_subset_coded, int * const psum,
++ const unsigned int idx0_gt1, const unsigned int idx_gt2)
++{
++ CABACContext * const c = &lc->cc;
++ uint8_t * const state0 = lc->cabac_state + idx0_gt1;
++ uint8_t * const state_gt2 = lc->cabac_state + idx_gt2;
++ unsigned int rv;
++ unsigned int i;
++ // Only the first 8 coefficients have greater1 flags
++ const unsigned int n = FFMIN(n_end, 8);
++
++ // Really this is i != n but the simple unconditional loop is cheaper
++ // and faster
++ for (i = 0; i != 8; ++i)
++ levels[i] = 1;
++
++ rv = get_cabac_greater1_bits(c, n, state0);
++
++ *pprev_subset_coded = 0;
++ *psum = n;
++
++ // Left-justify the flags so that the first coefficient is in bit 31
++ // NOTE(review): assumes n >= 1, otherwise this is a shift by 32 - confirm
++ rv <<= (32 - n);
++ if (rv != 0)
++ {
++ *pprev_subset_coded = 1;
++ *psum = n + 1;
++ // Index of the first coefficient with its greater1 flag set - the only
++ // one that gets a greater2 flag
++ i = hevc_clz32(rv);
++ levels[i] = 2;
++ if (get_cabac(c, state_gt2) == 0)
++ {
++ // Unset first coded bit
++ rv &= ~(0x80000000U >> i);
++ }
++ }
++
++ if (n_end > 8) {
++ // Coefficients beyond the first 8: level 0 and bitmap bit set
++ const unsigned int g8 = n_end - 8;
++ rv |= ((1 << g8) - 1) << (24 - g8);
++ for (i = 0; i != g8; ++i) {
++ levels[i + 8] = 0;
++ }
++ }
++
++ return rv;
++}
++
++// extended_precision_processing_flag must be false given we are
++// putting the result into a 16-bit array
++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
++// scale_m is uint8_t
++//
++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12)
++// or it can be 2 (if we have transquant_bypass)
++// shift is set to one less than we really want but would normally be
++// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
++// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
++// to achieve it
++
++#ifndef trans_scale_sat
++// Scale a coefficient level and saturate it to 16 bits.
++// Computes level * scale * scale_m, shifts right by `shift`, then rounds with
++// a final (x + 1) >> 1 (so the total shift is shift + 1) and clips the
++// result to the int16_t range.
++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{
++    const int multiplier = (int)(scale * scale_m);
++    const int scaled = (level * multiplier) >> shift;
++
++    return av_clip_int16((scaled + 1) >> 1);
++}
++#endif
++
++
++#ifndef update_rice
++// Adjust the Rice statistic *stat_coeff from the last decoded remainder:
++// with x = (last_coeff_abs_level_remaining * 2) >> c_rice_param,
++// x >= 6 increments the statistic and x == 0 decrements it (never below 0).
++// Only compiled when no update_rice macro has been defined beforehand.
++static inline void update_rice(uint8_t * const stat_coeff,
++    const unsigned int last_coeff_abs_level_remaining,
++    const unsigned int c_rice_param)
++{
++    const unsigned int scaled = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
++
++    if (scaled >= 6) {
++        ++*stat_coeff;
++        return;
++    }
++
++    if (scaled == 0 && *stat_coeff != 0)
++        --*stat_coeff;
++}
++#endif
++
++
++// Decode one flag per scan position, from position n down to position 1,
++// and record the positions whose flag decoded as non-zero.
++//
++// c        CABAC decoder context
++// state0   base of the context states; ctx_map[pos] is added to it to pick
++//          the state used for position pos
++// n        first (highest) position to decode. n must be > 0 on entry: the
++//          loop tests --n != 0 only after decoding, so n == 0 would wrap
++// ctx_map  per-position context offsets (read-only)
++// p        output cursor; each recorded position is stored as one byte
++//
++// Returns the output cursor advanced past the last stored position.
++// Position 0 is never decoded here.
++// Only compiled when no get_cabac_sig_coeff_flag_idxs macro has been
++// defined beforehand.
++#ifndef get_cabac_sig_coeff_flag_idxs
++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * ctx_map,
++    uint8_t * p)
++{
++    do {
++        if (get_cabac(c, state0 + ctx_map[n]))
++            *p++ = n;
++    } while (--n != 0);
++    return p;
++}
++#endif
++
++
++// Wrapper round get_cabac_sig_coeff_flag_idxs() that converts the returned
++// output cursor into a count: the number of positions written to flag_idx.
++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * ctx_map, // const ptr here but not in asm
++    uint8_t * const flag_idx)
++{
++    uint8_t * const flag_end = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx);
++
++    return flag_end - flag_idx;
++}
++
++// Initialiser helpers for 16-entry tables. Each takes the 16 values of a
++// 4x4 block in the same argument order and emits them as a brace-enclosed
++// initialiser list in a different order.
++
++// H4x4: emit the arguments unchanged (x0, x1, x2, ... x15)
++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++ x0, x1, x2, x3,\
++ x4, x5, x6, x7,\
++ x8, x9, x10, x11,\
++ x12, x13, x14, x15}
++
++// V4x4: emit the arguments transposed (x0, x4, x8, x12, x1, ...)
++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++ x0, x4, x8, x12,\
++ x1, x5, x9, x13,\
++ x2, x6, x10, x14,\
++ x3, x7, x11, x15}
++
++// D4x4: emit the arguments along successive anti-diagonals
++// (x0, x4, x1, x8, x5, x2, x12, x9, x6, x3, ...)
++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++ x0, x4, x1, x8,\
++ x5, x2, x12, x9,\
++ x6, x3, x13, x10,\
++ x7, x14, x11, x15}
++
++
++// Step backwards from subset i to the next subset that has to be decoded.
++//
++// Starting at i - 1, each candidate subset's group flag is decoded with
++// significant_coeff_group_flag_decode(); subset 0 is always accepted without
++// decoding a flag. For the accepted subset its bit is set in
++// significant_coeff_group_flag[] and *pPrev_sig receives the neighbour
++// pattern (Right * 2 + Down) of the groups to the right of and below it.
++//
++// Returns the index of the accepted subset, or a negative value when there
++// are no more subsets (*pPrev_sig is then left untouched).
++static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz,
++    uint8_t * const significant_coeff_group_flag,
++    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
++    int * const pPrev_sig)
++{
++    for (--i; i >= 0; --i) {
++        uint8_t * const row_flags = significant_coeff_group_flag + scan_y_cg[i];
++        const unsigned int col = scan_x_cg[i];
++
++        // The flag decode itself only cares about zero / non-zero, but the
++        // full Right * 2 + Down value is needed later for the significant
++        // coeff flag contexts, so it is built here.
++        //
++        // The group flag array has one spare trailing entry, so reading
++        // row_flags[1] needs no limit check on the row.
++        const unsigned int right = (row_flags[0] >> col) & 2;
++        const unsigned int down = (row_flags[1] >> col) & 1;
++        const unsigned int neighbours = right | down;
++
++        // Subset 0 is accepted without a flag being decoded
++        if (i != 0 && !significant_coeff_group_flag_decode(lc, c_idx_nz, neighbours))
++            continue;
++
++        row_flags[0] |= (1 << col);
++        *pPrev_sig = neighbours;
++        return i;
++    }
++
++    return i;
++}
++
++// Queue an "add residual" command for one transform block in the job's
++// intra command list (jb->intra.cmds / jb->intra.n).
++//
++// When this call is for c_idx == 2 and the previous command is the U-plane
++// command for the same destination, that command is rewritten in place
++// instead of a new one being appended:
++// previous RPI_PRED_ADD_RESIDUAL_U -> RPI_PRED_ADD_RESIDUAL_C
++// previous RPI_PRED_ADD_DC_U -> RPI_PRED_ADD_RESIDUAL_V, keeping the old
++// DC value (low 16 bits) in ta.dc
++// Otherwise a new command is appended and jb->intra.n is incremented.
++//
++// x0, y0 are shifted by ctx_hshift()/ctx_vshift() for the plane before the
++// destination address is looked up.
++static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
++ const unsigned int log2_trafo_size, const unsigned int c_idx,
++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
++{
++ const AVFrame * const frame = s->frame;
++ const unsigned int stride = frame_stride1(s->frame, c_idx);
++ const unsigned int x = x0 >> ctx_hshift(s, c_idx);
++ const unsigned int y = y0 >> ctx_vshift(s, c_idx);
++ // Hard-wired to 1, so the !is_sliced alternatives below are never taken
++ const int is_sliced = 1; // av_rpi_is_sand_frame(frame);
++ uint8_t * const dst = !is_sliced ?
++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(frame, x, y) :
++ av_rpi_sand_frame_pos_c(frame, x, y);
++
++ const unsigned int i = jb->intra.n;
++ // Previous command; only dereferenced when i != 0
++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
++
++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
++ pc->ta.dst == dst)
++ {
++ // NOTE(review): pc->c_idx is only read by these assertions; nothing
++ // in this function writes it - confirm where it is set
++ av_assert1(pc->size == log2_trafo_size &&
++ pc->c_idx == 1 &&
++ pc->ta.stride == stride);
++
++ pc->type = RPI_PRED_ADD_RESIDUAL_C;
++ }
++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
++ pc->dc.dst == dst)
++ {
++ // Must be read before any ta.* field is written as ta and dc are
++ // different members of the same union
++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits
++ av_assert1(pc->size == log2_trafo_size &&
++ pc->c_idx == 1 &&
++ pc->dc.stride == stride);
++
++ // Rewrite as add residual - must rewrite all fields as different union member
++ pc->type = RPI_PRED_ADD_RESIDUAL_V;
++ pc->ta.buf = coeffs;
++ pc->ta.dst = dst;
++ pc->ta.stride = stride;
++ pc->ta.dc = dc;
++ }
++ else
++ {
++ // Nothing to merge with - append a new command after the previous one
++ HEVCPredCmd * const cmd = pc + 1;
++ jb->intra.n = i + 1;
++
++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
++ cmd->size = log2_trafo_size;
++ cmd->ta.buf = coeffs;
++ cmd->ta.dst = dst;
++ cmd->ta.stride = stride;
++ cmd->ta.dc = 0;
++ }
++}
++
++
++// Queue an "add DC" command for a transform block whose only coefficient is
++// coeffs[0], using the job's intra command list (jb->intra.cmds / jb->intra.n).
++//
++// The DC value is derived from coeffs[0] with a rounding shift that depends
++// on the SPS bit depth.
++//
++// When this call is for c_idx == 2 and the previous command is the U-plane
++// command for the same destination, the value is merged into that command:
++// previous RPI_PRED_ADD_RESIDUAL_U: stored in ta.dc
++// previous RPI_PRED_ADD_DC_U: ORed into the top 16 bits of dc.dc
++// Otherwise a new RPI_PRED_ADD_DC + c_idx command is appended and
++// jb->intra.n is incremented.
++static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const unsigned int log2_trafo_size, const unsigned int c_idx,
++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
++{
++ const AVFrame * const frame = s->frame;
++ const unsigned int stride = frame_stride1(s->frame, c_idx);
++ const unsigned int x = x0 >> ctx_hshift(s, c_idx);
++ const unsigned int y = y0 >> ctx_vshift(s, c_idx);
++ // Hard-wired to 1, so the !is_sliced alternative below is never taken
++ const int is_sliced = 1;
++ uint8_t * const dst = !is_sliced ?
++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(frame, x, y) :
++ av_rpi_sand_frame_pos_c(frame, x, y);
++
++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0);
++ // Add (1 | (1 << shift)) and then shift right by (shift + 1)
++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1);
++
++ const unsigned int i = jb->intra.n;
++ // Previous command; only dereferenced when i != 0
++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
++
++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
++ pc->ta.dst == dst)
++ {
++ // NOTE(review): pc->c_idx is only read by these assertions; nothing
++ // in this function writes it - confirm where it is set
++ av_assert1(pc->size == log2_trafo_size &&
++ pc->c_idx == 1 &&
++ pc->ta.stride == stride);
++
++ pc->ta.dc = (int16_t)coeff;
++ }
++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
++ pc->dc.dst == dst)
++ {
++ av_assert1(pc->size == log2_trafo_size &&
++ pc->c_idx == 1 &&
++ pc->dc.stride == stride &&
++ (pc->dc.dc & ~0xffff) == 0);
++
++ // The existing value stays in the low 16 bits, this one goes on top
++ pc->dc.dc |= (coeff << 16);
++ }
++ else
++ {
++ // Nothing to merge with - append a new command after the previous one
++ HEVCPredCmd * const cmd = pc + 1;
++ jb->intra.n = i + 1;
++
++ cmd->type = RPI_PRED_ADD_DC + c_idx;
++ cmd->size = log2_trafo_size;
++ cmd->dc.dst = dst;
++ cmd->dc.stride = stride;
++ // c_idx 0: full value; c_idx 2: top 16 bits; otherwise: low 16 bits
++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff;
++ }
++}
++
++
++// Decode the residual coefficients of one transform block and queue the
++// result to be added to the frame.
++//
++// s, lc decoder and per-thread local contexts
++// x0, y0 block position as passed on to rpi_add_dc()/rpi_add_residual()
++// log2_trafo_size log2 of the block width/height
++// scan_idx coefficient scan order (diagonal / horizontal / vertical)
++// c_idx colour component index (0 is treated as luma here)
++//
++// Outline:
++// 1. Derive scale, shift and the scaling matrix (or unit scaling for
++// transquant bypass) and decode the transform-skip / explicit rdpcm flags.
++// 2. Decode the last significant coefficient position and pick the
++// coefficient-group scan tables.
++// 3. Choose a coefficient buffer: a small local dummy buffer for the DC-only
++// case, otherwise one from rpi_alloc_coeff_buf().
++// 4. Walk the coefficient groups from the last one back to group 0, decoding
++// significance, level and sign and storing the scaled values.
++// 5. Apply rdpcm / transform-skip dequant / 4x4 luma transform / idct as
++// required (no transform is run here when use_vpu is set), then queue the
++// add with rpi_add_dc() or rpi_add_residual().
++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0,
++ const int log2_trafo_size, const enum ScanType scan_idx,
++ const int c_idx)
++{
++ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
++
++ int last_significant_coeff_x, last_significant_coeff_y;
++ int num_coeff = 0;
++ int prev_subset_coded = 0;
++
++ int num_last_subset;
++ int x_cg_last_sig, y_cg_last_sig;
++
++ const uint8_t *scan_x_cg, *scan_y_cg;
++ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
++
++ int use_vpu;
++#if RPI_COMPRESS_COEFFS
++ int num_nonzero = 0;
++ int use_compress = 0;
++ int *coeffs32;
++#endif
++ int use_dc = 0;
++ int16_t *coeffs;
++ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero
++ int explicit_rdpcm_flag = 0;
++ int explicit_rdpcm_dir_flag;
++
++ int i;
++ int shift,scale;
++ const uint8_t *scale_matrix = NULL;
++ uint8_t dc_scale;
++ const int c_idx_nz = (c_idx != 0);
++ const int pred_mode_intra = c_idx_nz ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
++ int prev_sig = 0;
++ int may_hide_sign;
++
++ // Scratch space used as the coefficient buffer in the DC-only case
++ int16_t dummy_coeffs[16];
++
++ // Derive QP for dequant
++ if (!lc->cu.cu_transquant_bypass_flag) {
++ may_hide_sign = s->ps.pps->sign_data_hiding_flag;
++
++ if (s->ps.pps->transform_skip_enabled_flag &&
++ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
++ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz);
++ if (transform_skip_flag) {
++ trans_skip_or_bypass = 1;
++ if (lc->cu.pred_mode == MODE_INTRA &&
++ s->ps.sps->implicit_rdpcm_enabled_flag &&
++ (pred_mode_intra == 10 || pred_mode_intra == 26)) {
++ may_hide_sign = 0;
++ }
++ }
++ }
++
++ {
++ static const uint8_t level_scale[8] = {
++ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8
++ };
++ // The table entry holds the remainder in its low 3 bits and the
++ // quotient above them (see the & 7 and >> 3 below)
++ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y];
++
++ // Shift is set to one less than will actually occur as the scale
++ // and saturate step adds 1 and then shifts right again
++ scale = level_scale[qp6 & 7];
++// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3);
++ shift = log2_trafo_size - (qp6 >> 3);
++
++ // A negative shift is folded into the scale instead
++ if (shift < 0) {
++ scale <<= -shift;
++ shift = 0;
++ }
++ }
++
++ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
++ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ?
++ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
++ const unsigned int matrix_id =
++ lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx;
++
++ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
++ dc_scale = scale_matrix[0];
++ if (log2_trafo_size >= 4)
++ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
++ }
++ else
++ {
++ // No scaling list in use: flat matrix of 16
++ static const uint8_t sixteen_scale[64] = {
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16
++ };
++ scale_matrix = sixteen_scale;
++ dc_scale = 16;
++ }
++ } else {
++ // Transquant bypass: unit matrix so coefficients pass through unscaled
++ static const uint8_t unit_scale[64] = {
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ };
++ scale_matrix = unit_scale;
++ shift = 0;
++ scale = 2; // We will shift right to kill this
++ dc_scale = 1;
++
++ may_hide_sign = 0;
++ }
++
++
++
++
++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
++ trans_skip_or_bypass) {
++ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz);
++ if (explicit_rdpcm_flag) {
++ may_hide_sign = 0;
++ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz);
++ }
++ }
++
++ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size,
++ &last_significant_coeff_x, &last_significant_coeff_y);
++
++ // Prefix values above 3 are followed by a suffix
++ if (last_significant_coeff_x > 3) {
++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x);
++ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
++ (2 + (last_significant_coeff_x & 1)) +
++ suffix;
++ }
++
++ if (last_significant_coeff_y > 3) {
++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y);
++ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
++ (2 + (last_significant_coeff_y & 1)) +
++ suffix;
++ }
++
++ if (scan_idx == SCAN_VERT)
++ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
++
++ // Coefficient group (4x4) containing the last significant coefficient
++ x_cg_last_sig = last_significant_coeff_x >> 2;
++ y_cg_last_sig = last_significant_coeff_y >> 2;
++
++ // Select the group scan tables and convert the last position into a scan index
++ switch (scan_idx) {
++ case SCAN_DIAG: {
++ int last_x_c = last_significant_coeff_x & 3;
++ int last_y_c = last_significant_coeff_y & 3;
++
++ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
++
++ switch (log2_trafo_size) {
++ case 2:
++ scan_x_cg = scan_1x1;
++ scan_y_cg = scan_1x1;
++ break;
++ case 3:
++ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++ scan_x_cg = diag_scan2x2_x;
++ scan_y_cg = diag_scan2x2_y;
++ break;
++ case 4:
++ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x;
++ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y;
++ break;
++ case 5:
++ default:
++ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x;
++ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y;
++ break;
++ }
++ break;
++ }
++ case SCAN_HORIZ:
++ scan_x_cg = horiz_scan2x2_x;
++ scan_y_cg = horiz_scan2x2_y;
++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
++ break;
++ default: //SCAN_VERT
++ scan_x_cg = horiz_scan2x2_y;
++ scan_y_cg = horiz_scan2x2_x;
++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
++ break;
++ }
++ // Convert the scan index of the last coefficient into a count; each
++ // subset holds 16 coefficients
++ num_coeff++;
++ num_last_subset = (num_coeff - 1) >> 4;
++
++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
++
++ {
++ const unsigned int ccount = 1 << (log2_trafo_size * 2);
++ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing
++ use_vpu = 0;
++ use_dc = (num_coeff == 1) && !special &&
++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
++
++ if (use_dc) {
++ // Just need a little empty space
++ coeffs = dummy_coeffs;
++ // No need to clear
++ }
++ else
++ {
++ use_vpu = !special && log2_trafo_size >= 4;
++#if RPI_COMPRESS_COEFFS
++ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed;
++#endif
++ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
++#if RPI_COMPRESS_COEFFS
++ coeffs32 = (int*)coeffs;
++ // The clear below is skipped when the packed form is in use
++ if (!use_compress)
++#endif
++#if HAVE_NEON
++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
++#else
++ memset(coeffs, 0, ccount * sizeof(int16_t));
++#endif
++ }
++ }
++
++ // Walk the subsets from the one holding the last significant coefficient
++ // back towards subset 0; next_subset() picks each following subset
++ i = num_last_subset;
++ do {
++ int implicit_non_zero_coeff = 0;
++ int n_end;
++
++ uint8_t significant_coeff_flag_idx[16];
++ unsigned int nb_significant_coeff_flag = 0;
++
++ if (i == num_last_subset) {
++ // First time through
++ int last_scan_pos = num_coeff - (i << 4) - 1;
++ n_end = last_scan_pos - 1;
++ significant_coeff_flag_idx[0] = last_scan_pos;
++ nb_significant_coeff_flag = 1;
++ } else {
++ n_end = 15;
++ implicit_non_zero_coeff = (i != 0);
++ }
++
++ if (n_end >= 0) {
++ static const uint8_t ctx_idx_maps_ts2[3][16] = {
++ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
++ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
++ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
++ };
++ // N.B. prev_sig = Right * 2 + Down
++ static const uint8_t ctx_idx_maps[3][4][16] = {
++ {
++ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
++ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
++ },
++ {
++ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
++ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
++ },
++ {
++ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
++ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
++ }
++ };
++ const uint8_t *ctx_idx_map_p;
++ int scf_offset = 0;
++
++ // Pick the per-position context map and the base context offset
++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
++ ctx_idx_map_p = ctx_idx_maps[0][3];
++ scf_offset = 40 + c_idx_nz;
++ } else {
++ if (c_idx_nz != 0)
++ scf_offset = 27;
++
++ if (log2_trafo_size == 2) {
++ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
++ } else {
++ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
++ if (!c_idx_nz) {
++ if (i != 0)
++ scf_offset += 3;
++
++ if (log2_trafo_size == 3) {
++ scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
++ } else {
++ scf_offset += 21;
++ }
++ } else {
++ if (log2_trafo_size == 3)
++ scf_offset += 9;
++ else
++ scf_offset += 12;
++ }
++ }
++ }
++
++ // Positions n_end..1; position 0 is handled separately below
++ if (n_end > 0) {
++ int cnt = get_sig_coeff_flag_idxs(&lc->cc,
++ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
++ n_end, ctx_idx_map_p,
++ significant_coeff_flag_idx + nb_significant_coeff_flag);
++
++ nb_significant_coeff_flag += cnt;
++ if (cnt != 0) {
++ implicit_non_zero_coeff = 0;
++ }
++ }
++
++ // Position 0: decoded unless it is implied to be significant
++ if (implicit_non_zero_coeff == 0) {
++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
++ scf_offset = 42 + c_idx_nz;
++ } else {
++ if (i == 0) {
++ scf_offset = c_idx_nz ? 27 : 0;
++ } else {
++ scf_offset = 2 + scf_offset;
++ }
++ }
++ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) {
++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
++ nb_significant_coeff_flag++;
++ }
++ } else {
++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
++ nb_significant_coeff_flag++;
++ }
++ }
++#if RPI_COMPRESS_COEFFS
++ // If the packed list could reach half the block size, give up on
++ // packing for this buffer: copy the packed entries out, clear the
++ // buffer and expand the entries back into the plain int16 layout.
++ // Each packed entry is (value << 16) + coefficient offset.
++ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full!
++ int16_t temp[32*32];
++ const unsigned int ccount = 1 << (log2_trafo_size * 2);
++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0;
++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer
++ memcpy(temp, coeffs, sizeof(int)*num_nonzero);
++ coeffs32 = (int *)temp;
++ memset(coeffs, 0, ccount * sizeof(int16_t));
++ num_nonzero--;
++ while (num_nonzero >= 0) {
++ const unsigned int res = coeffs32[num_nonzero];
++ const unsigned int offset = res & 0xffff;
++ coeffs[ offset ] = res >> 16;
++ num_nonzero--;
++ }
++ use_compress = 0;
++ }
++#endif
++
++ if (nb_significant_coeff_flag != 0) {
++ const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
++ ((i != 0 && !c_idx_nz) ? 2 : 0) |
++ prev_subset_coded;
++ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
++ (gt1_idx_delta << 2);
++ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
++ gt1_idx_delta;
++
++ const unsigned int x_cg = scan_x_cg[i];
++ const unsigned int y_cg = scan_y_cg[i];
++ int16_t * const blk_coeffs = coeffs +
++ ((x_cg + (y_cg << log2_trafo_size)) << 2);
++ // This calculation is 'wrong' for log2_trafo_size == 2
++ // but that doesn't matter as in this case x_cg & y_cg
++ // are always 0 so result is correct (0) anyway
++ const uint8_t * const blk_scale = scale_matrix +
++ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
++
++ // * The following code block doesn't deal with these flags:
++ // (nor did the one it replaces)
++ //
++ // cabac_bypass_alignment_enabled_flag
++ // This should be easy but I can't find a test case
++ // extended_precision_processing_flag
++ // This can extend the required precision past 16bits
++ // so is probably tricky - also no example found yet
++
++#if USE_N_END_1
++ if (nb_significant_coeff_flag == 1) {
++ // There is a small gain to be had from special casing the single
++ // transform coefficient case. The reduction in complexity
++ // makes up for the code duplication.
++
++ int trans_coeff_level = 1;
++ int coeff_sign_flag;
++ int coded_val = 0;
++
++ // initialize first elem of coeff_abs_level_greater1_flag
++ prev_subset_coded = 0;
++
++ if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) {
++ trans_coeff_level = 2;
++ prev_subset_coded = 1;
++ coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2);
++ }
++
++ // Probably not worth the overhead of starting by22 for just one value
++ coeff_sign_flag = get_cabac_bypass(&lc->cc);
++
++ if (coded_val)
++ {
++ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
++ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0);
++ } else {
++ uint8_t * const stat_coeff =
++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
++ const unsigned int c_rice_param = *stat_coeff >> 2;
++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param);
++
++ trans_coeff_level = 3 + last_coeff_abs_level_remaining;
++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
++ }
++ }
++
++ {
++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
++ // k is 0 or -1 (all ones) depending on the sign flag
++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
++ const unsigned int scale_m = blk_scale[xy_off->scale];
++ const int res = trans_scale_sat(
++ (trans_coeff_level ^ k) - k, // Apply sign
++ scale,
++ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
++ shift);
++#if RPI_COMPRESS_COEFFS
++ if (use_compress)
++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
++ else
++#endif
++ blk_coeffs[xy_off->coeff] = res;
++ }
++ }
++ else
++#endif
++ {
++ int sign_hidden = may_hide_sign;
++ int levels[16]; // Should be able to get away with int16_t but that fails some tests
++ uint32_t coeff_sign_flags;
++ uint32_t coded_vals = 0;
++ // Sum(abs(level[]))
++ // In fact we only need the bottom bit and in some future
++ // version that may be all we calculate
++ unsigned int sum_abs;
++
++ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels,
++ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
++
++ // No sign hiding when first and last significant positions are close
++ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
++ sign_hidden = 0;
++
++ // -- Start bypass block
++
++ bypass_start(&lc->cc);
++
++ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden);
++
++ if (coded_vals != 0)
++ {
++ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
++ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
++ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
++ int * level = levels - 1;
++
++ // Each set bit in coded_vals (top bit first) marks a level
++ // that has a coeff_abs_level_remaining value to decode
++ do {
++ {
++ const unsigned int z = hevc_clz32(coded_vals) + 1;
++ level += z;
++ coded_vals <<= z;
++ }
++
++ {
++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param);
++ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
++
++ sum_abs += last_coeff_abs_level_remaining + 1;
++ *level = trans_coeff_level;
++
++ // The statistic is only updated for the first value in the subset
++ if (stat_coeff != NULL)
++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
++ stat_coeff = NULL;
++
++ if (trans_coeff_level > (3 << c_rice_param) &&
++ (c_rice_param < 4 || rice_adaptation_enabled))
++ ++c_rice_param;
++ }
++ } while (coded_vals != 0);
++ }
++
++ // sign_hidden = 0 or 1 so we can combine the tests
++ if ((sign_hidden & sum_abs) != 0) {
++ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
++ }
++
++ bypass_finish(&lc->cc);
++
++ // -- Finish bypass block
++
++ // Scale loop
++ {
++ int m = nb_significant_coeff_flag - 1;
++
++ // Deal with DC component (if any) first
++ if (i == 0 && significant_coeff_flag_idx[m] == 0)
++ {
++ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
++ const int res = trans_scale_sat(
++ (levels[m] ^ k) - k, scale, dc_scale, shift);
++#if RPI_COMPRESS_COEFFS
++ if (use_compress)
++ {
++ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs);
++ }
++ else
++#endif
++ {
++ blk_coeffs[0] = res;
++ }
++ --m;
++ }
++
++#if !USE_N_END_1
++ // If N_END_1 set then m was at least 1 initially
++ if (m >= 0)
++#endif
++ {
++ do {
++ const xy_off_t * const xy_off = scan_xy_off +
++ significant_coeff_flag_idx[m];
++ // Sign bit for entry m is bit (31 - m) of coeff_sign_flags
++ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
++ const int res = trans_scale_sat(
++ (levels[m] ^ k) - k,
++ scale,
++ blk_scale[xy_off->scale],
++ shift);
++#if RPI_COMPRESS_COEFFS
++ if (use_compress) {
++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
++ } else
++#endif
++ blk_coeffs[xy_off->coeff] = res;
++ } while (--m >= 0);
++ }
++ }
++
++ }
++ }
++ } while ((i = next_subset(lc, i, c_idx_nz,
++ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 &&
++ !cabac_overflow(&lc->cc));
++
++ // Post-processing of the decoded coefficients
++ if (lc->cu.cu_transquant_bypass_flag) {
++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
++ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
++
++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++ }
++ } else {
++ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
++ int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
++ log2_trafo_size == 2 &&
++ lc->cu.pred_mode == MODE_INTRA;
++ if (rot) {
++ // Reverse the order of the 16 coefficients
++ for (i = 0; i < 8; i++)
++ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
++ }
++
++ s->hevcdsp.dequant(coeffs, log2_trafo_size);
++
++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++ lc->cu.pred_mode == MODE_INTRA &&
++ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
++ int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
++
++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++ }
++ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
++ s->hevcdsp.transform_4x4_luma(coeffs);
++ }
++ else if (!use_vpu)
++ {
++ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
++ if (max_xy == 0)
++ {
++ if (use_dc)
++ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
++ else
++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
++ }
++ else {
++ // Limit passed to the idct, derived from the last significant position
++ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
++ if (max_xy < 4)
++ col_limit = FFMIN(4, col_limit);
++ else if (max_xy < 8)
++ col_limit = FFMIN(8, col_limit);
++ else if (max_xy < 12)
++ col_limit = FFMIN(24, col_limit);
++ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
++ }
++ }
++ }
++
++#if 0
++ // Mildly rotted - we support no mode where cross is valid
++ if (lc->tu.cross_pf) {
++ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer;
++ const int ccount = 1 << (log2_trafo_size * 2);
++
++ for (i = 0; i < ccount; i++) {
++ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++ }
++ }
++#endif
++
++ if (!use_dc) {
++#if RPI_COMPRESS_COEFFS
++ // A zero entry is written after the last packed coefficient
++ if (use_compress) {
++ coeffs32[num_nonzero] = 0;
++ }
++#endif
++ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
++ }
++}
++
++// Two implementations of ff_hevc_rpi_hls_mvd_coding(), selected by USE_BY22.
++// Both decode a motion vector difference and return it packed with MV_XY().
++#if !USE_BY22
++// Variant that decodes every element through the per-element decode helpers
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
++{
++ int x = abs_mvd_greater0_flag_decode(lc);
++ int y = abs_mvd_greater0_flag_decode(lc);
++
++ // x and y become 0, 1 or 2: the number of "greater" flags that were set
++ if (x)
++ x += abs_mvd_greater1_flag_decode(lc);
++ if (y)
++ y += abs_mvd_greater1_flag_decode(lc);
++
++ switch (x) {
++ case 2: x = mvd_decode(lc); break;
++ case 1: x = mvd_sign_flag_decode(lc); break;
++ case 0: x = 0; break;
++ }
++
++ switch (y) {
++ case 2: y = mvd_decode(lc); break;
++ case 1: y = mvd_sign_flag_decode(lc); break;
++ case 0: y = 0; break;
++ }
++ return MV_XY(x,y);
++}
++#else
++// Variant that reads the bypass-coded part (prefix, suffix and sign bits)
++// from a peeked bit window via the get_cabac_by22_*() helpers
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
++{
++ int x = abs_mvd_greater0_flag_decode(lc);
++ int y = abs_mvd_greater0_flag_decode(lc);
++
++ // Both components zero - nothing more to decode
++ if ((x | y) == 0)
++ return 0;
++
++ if (x != 0)
++ x += abs_mvd_greater1_flag_decode(lc);
++ if (y != 0)
++ y += abs_mvd_greater1_flag_decode(lc);
++
++ if ((x | y) == 1)
++ {
++ // Not worth starting BY22
++ if (x != 0)
++ x = mvd_sign_flag_decode(lc);
++ if (y != 0)
++ y = mvd_sign_flag_decode(lc);
++ }
++ else
++ {
++ CABACContext * const cc = &lc->cc;
++ // val is the window as peeked, b is the same window shifted left as
++ // bits are used, n counts the bits used from val so far
++ uint32_t val;
++ uint32_t b;
++ unsigned int n = 0;
++
++ bypass_start(cc);
++ b = val = get_cabac_by22_peek(cc);
++
++ if (x == 1) {
++ // Magnitude 1: only a sign bit; top bit set gives -1, clear gives +1
++ x = ((int32_t)b >> 31) | 1;
++ n = 1;
++ b <<= 1;
++ }
++ else if (x == 2) {
++ // EG1 so we have (leading one bits + 1) of suffix
++ // This makes prefix & suffix lengths the same
++ const unsigned int k = hevc_clz32(~b) + 1;
++ int s;
++
++ av_assert2(k <= 15);
++
++ b <<= k;
++ n = 2 * k + 1; // Includes suffix & sign
++
++ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked
++ // if we are going to do this without a flush
++ if (k > CABAC_BY22_PEEK_BITS / 2 - 1)
++ {
++ // Need too many bits - flush
++ // n = k
++ get_cabac_by22_flush(cc, k, val);
++ b = val = get_cabac_by22_peek(cc);
++ n = k + 1;
++ }
++
++ // Magnitude is the k suffix bits plus (1 << k); the sign bit follows
++ x = (b >> (32 - k)) + (1 << k);
++ b <<= k;
++ s = (int32_t)b >> 31;
++ x = (x ^ s) - s;
++ b <<= 1;
++
++ // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits)
++ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15)
++ {
++ get_cabac_by22_flush(cc, n, val);
++ b = val = get_cabac_by22_peek(cc);
++ n = 0;
++ }
++ }
++
++ if (y == 1) {
++ y = ((int32_t)b >> 31) | 1;
++ ++n;
++ // don't care about b anymore
++ }
++ else if (y == 2) {
++ const unsigned int k = hevc_clz32(~b) + 1;
++ int s;
++
++ av_assert2(k <= 15);
++
++ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked
++ // if we are going to do this without a flush
++ b <<= k;
++ n += 2 * k + 1;
++
++ if (n > CABAC_BY22_PEEK_BITS)
++ {
++ // Need too many bits - flush
++ get_cabac_by22_flush(cc, n - (k + 1), val);
++ b = val = get_cabac_by22_peek(cc);
++ n = k + 1;
++ }
++
++ y = (b >> (32 - k)) + (1 << k);
++ s = (int32_t)(b << k) >> 31;
++ y = (y ^ s) - s;
++ // don't care about b anymore
++ }
++
++ // Flush the n bits used from the last peeked window
++ get_cabac_by22_flush(cc, n, val);
++ bypass_finish(cc);
++ }
++
++ return MV_XY(x, y);
++}
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_hevc_cabac_fns.h
+@@ -0,0 +1,217 @@
++/*
++ * HEVC CABAC decoding
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2018 John Cox
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H
++#define AVCODEC_RPI_HEVC_CABAC_FNS_H
++
++#include "config.h"
++#include "rpi_hevcdec.h"
++
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc);
++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags);
++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size);
++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH);
++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx);
++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx);
++
++//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0,
++ const int log2_trafo_size, const enum ScanType scan_idx,
++ const int c_idx);
++
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc);
++// Offsets into lc->cabac_state[] (used as lc->cabac_state + HEVC_BIN_x by the decoders below). NOTE(review): elements sharing a value presumably own no context of their own - confirm
++#define HEVC_BIN_SAO_MERGE_FLAG 0
++#define HEVC_BIN_SAO_TYPE_IDX 1
++#define HEVC_BIN_SAO_EO_CLASS 2
++#define HEVC_BIN_SAO_BAND_POSITION 2
++#define HEVC_BIN_SAO_OFFSET_ABS 2
++#define HEVC_BIN_SAO_OFFSET_SIGN 2
++#define HEVC_BIN_END_OF_SLICE_FLAG 2
++#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2
++#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5
++#define HEVC_BIN_SKIP_FLAG 6
++#define HEVC_BIN_CU_QP_DELTA 9
++#define HEVC_BIN_PRED_MODE 12
++#define HEVC_BIN_PART_MODE 13
++#define HEVC_BIN_PCM_FLAG 17
++#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17
++#define HEVC_BIN_MPM_IDX 18
++#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18
++#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18
++#define HEVC_BIN_MERGE_FLAG 20
++#define HEVC_BIN_MERGE_IDX 21
++#define HEVC_BIN_INTER_PRED_IDC 22
++#define HEVC_BIN_REF_IDX_L0 27
++#define HEVC_BIN_REF_IDX_L1 29
++#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31
++#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33
++#define HEVC_BIN_ABS_MVD_MINUS2 35
++#define HEVC_BIN_MVD_SIGN_FLAG 35
++#define HEVC_BIN_MVP_LX_FLAG 35
++#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36
++#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37
++#define HEVC_BIN_CBF_LUMA 40
++#define HEVC_BIN_CBF_CB_CR 42
++#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46
++#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48
++#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88
++#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88
++#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92
++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136
++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160
++#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166
++#define HEVC_BIN_COEFF_SIGN_FLAG 166
++#define HEVC_BIN_LOG2_RES_SCALE_ABS 166
++#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174
++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176
++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177
++
++
++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state);
++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c);
++// Restart the CABAC decoder n bytes past the current read position; returns a pointer to the first skipped byte, or NULL if fewer than n bytes remain or re-init fails
++static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) {
++ const uint8_t *ptr = c->bytestream;
++ // Step ptr back according to the low bits of c->low: once if bit 0 is set, and once more (16-bit CABAC only) if any of the low 9 bits are set
++ if (c->low & 0x1)
++ ptr--;
++#if CABAC_BITS == 16
++ if (c->low & 0x1FF)
++ ptr--;
++#endif
++ if ((int) (c->bytestream_end - ptr) < n) // not enough data left to skip n bytes
++ return NULL;
++ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0)
++ return NULL;
++
++ return ptr;
++}
++// Decode one context-coded bin using the context at cabac_state[HEVC_BIN_SAO_MERGE_FLAG]
++static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG);
++}
++// Decode one context-coded bin using the context at cabac_state[HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG]
++static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG);
++}
++// Decode one context-coded bin using the context at cabac_state[HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG]
++static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG);
++}
++// Decode the split flag; the context index (0..2) counts how many of the left/up stash entries, shifted right by 1, exceed ct_depth
++static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int ct_depth,
++ const unsigned int x0, const unsigned int y0)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG +
++ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + // stash is indexed per 8 pels; NOTE(review): bits 1+ presumably hold the neighbour's depth - confirm
++ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth));
++}
++// Decode the skip flag; the context index (0..2) is the sum of bit 0 of the left and up stash entries. x_cb and y_cb are not used
++static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0, const int x_cb, const int y_cb)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG +
++ (s->cabac_stash_left[y0 >> 3] & 1) + // NOTE(review): bit 0 presumably records that the neighbour was skipped - confirm
++ (s->cabac_stash_up[x0 >> 3] & 1));
++}
++// Decode one context-coded bin using the context at cabac_state[HEVC_BIN_PRED_MODE]
++static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE);
++}
++// Decode pcm_flag as a terminate bin: no context from cabac_state is used
++static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac_terminate(&lc->cc);
++}
++// Decode one context-coded bin using the context at cabac_state[HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE]
++static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE);
++}
++// Decode one context-coded bin using the context at cabac_state[HEVC_BIN_MERGE_FLAG]
++static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG);
++}
++// Decode one context-coded bin using the context at cabac_state[HEVC_BIN_MVP_LX_FLAG]
++static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG);
++}
++// Decode one context-coded bin using the context at cabac_state[HEVC_BIN_NO_RESIDUAL_DATA_FLAG]
++static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG);
++}
++// Decode a chroma cbf bin; the context is selected directly by trafo_depth (offset from HEVC_BIN_CBF_CB_CR)
++static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth);
++}
++// Decode the luma cbf bin; context offset is 1 when trafo_depth is 0 and 0 for any other depth
++static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth);
++}
++// Decode the transform split flag; context offset is 5 - log2_trafo_size, so larger transforms use lower contexts
++static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size);
++}
++// Decode one context-coded bin; the context is selected by idx (offset from HEVC_BIN_RES_SCALE_SIGN_FLAG)
++static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx);
++}
++
++
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_data.c
+@@ -0,0 +1,75 @@
++/*
++ * HEVC shared tables
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "rpi_hevc_data.h"
++// x coordinate of the i-th position of the 4x4 diagonal scan (each anti-diagonal is walked with x increasing, y decreasing)
++const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = {
++ 0, 0, 1, 0,
++ 1, 2, 0, 1,
++ 2, 3, 1, 2,
++ 3, 2, 3, 3,
++};
++// y coordinate of the i-th position of the 4x4 diagonal scan; pairs with ff_hevc_rpi_diag_scan4x4_x[i]
++const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = {
++ 0, 1, 0, 2,
++ 1, 0, 3, 2,
++ 1, 0, 3, 2,
++ 1, 3, 2, 3,
++};
++// x coordinate of the i-th position of the 8x8 diagonal scan (each anti-diagonal is walked with x increasing, y decreasing)
++const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = {
++ 0, 0, 1, 0,
++ 1, 2, 0, 1,
++ 2, 3, 0, 1,
++ 2, 3, 4, 0,
++ 1, 2, 3, 4,
++ 5, 0, 1, 2,
++ 3, 4, 5, 6,
++ 0, 1, 2, 3,
++ 4, 5, 6, 7,
++ 1, 2, 3, 4,
++ 5, 6, 7, 2,
++ 3, 4, 5, 6,
++ 7, 3, 4, 5,
++ 6, 7, 4, 5,
++ 6, 7, 5, 6,
++ 7, 6, 7, 7,
++};
++// y coordinate of the i-th position of the 8x8 diagonal scan; pairs with ff_hevc_rpi_diag_scan8x8_x[i]
++const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = {
++ 0, 1, 0, 2,
++ 1, 0, 3, 2,
++ 1, 0, 4, 3,
++ 2, 1, 0, 5,
++ 4, 3, 2, 1,
++ 0, 6, 5, 4,
++ 3, 2, 1, 0,
++ 7, 6, 5, 4,
++ 3, 2, 1, 0,
++ 7, 6, 5, 4,
++ 3, 2, 1, 7,
++ 6, 5, 4, 3,
++ 2, 7, 6, 5,
++ 4, 3, 7, 6,
++ 5, 4, 7, 6,
++ 5, 7, 6, 7,
++};
+--- /dev/null
++++ b/libavcodec/rpi_hevc_data.h
+@@ -0,0 +1,31 @@
++/*
++ * HEVC shared data tables
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_DATA_H
++#define AVCODEC_RPI_HEVC_DATA_H
++
++#include <stdint.h>
++
++extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16];
++extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16];
++extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64];
++extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64];
++
++#endif /* AVCODEC_RPI_HEVC_DATA_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_filter.c
+@@ -0,0 +1,1210 @@
++/*
++ * HEVC video decoder
++ *
++ * Originally by:
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 Seppo Tomperi
++ * Copyright (C) 2013 Wassim Hamidouche
++ *
++ * Substantially rewritten:
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++//#define DISABLE_SAO
++//#define DISABLE_DEBLOCK
++//#define DISABLE_STRENGTHS
++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
++//#define DISABLE_DEBLOCK_NONREF
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++
++#include "rpi_hevcdec.h"
++
++#include "bit_depth_template.c"
++
++#include "rpi_qpu.h"
++#include "rpi_zc.h"
++#include "libavutil/rpi_sand_fns.h"
++// Plane indices. NOTE(review): presumably the same numbering as the c_idx values used below (0 = luma) - confirm
++#define LUMA 0
++#define CB 1
++#define CR 2
++
++// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2
++// so -12,65 overall (12 + 51 + 2); the table is padded further below -12 for negative quant
++static const uint8_t tctablex[] = {
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18
++ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37
++ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53
++ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..65
++};
++#define tctable (tctablex + 12 + 6*8) // tctable[0] is the QP 0 entry: skips the 6*8 padding entries and the 12 entries for -12..-1
++// beta counterpart of tctablex with the same padding layout; values cover index -12..63
++static const uint8_t betatablex[] = {
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18
++ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37
++ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51
++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..63
++};
++#define betatable (betatablex + 12 + 6*8) // betatable[0] is the QP 0 entry: skips the 6*8 padding entries and the 12 entries for -12..-1
++// tc for a chroma edge: map qp_y through the PPS qp_dblk_x[c_idx] table, then look up tctable at that value + tc_offset + 2
++static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y,
++ const int c_idx, const int tc_offset)
++{
++ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2]; // NOTE(review): the + 2 is presumably the (bs-1)*2 term for bs == 2 - confirm
++}
++// Predicted luma QP for the quantisation group containing (xBase, yBase): rounded mean of the left and above QPs
++static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int xBase, const unsigned int yBase)
++{
++ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1;
++ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size;
++ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask;
++ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask;
++ const unsigned int min_cb_width = s->ps.sps->min_cb_width;
++ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size;
++ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size;
++ const int qPy_pred = lc->qPy_pred;
++ // When the group sits on the CTB's left (or top) edge, lc->qPy_pred stands in for the left (or above) entry of qp_y_tab
++ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred :
++ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) +
++ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred :
++ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1;
++}
++// Set lc->qp_y for the block at (xBase, yBase): the predicted QP, adjusted by lc->tu.cu_qp_delta when that is non-zero
++// * Only called from bitstream decode in foreground
++// so should be safe
++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase)
++{
++ const int qp_y = get_qPy_pred(s, lc, xBase, yBase);
++ // With a delta the sum is wrapped modulo (52 + off) and then biased by -off (assuming FFUMOD is an unsigned modulo, this gives -off..51)
++ if (lc->tu.cu_qp_delta != 0) {
++ // ?? I suspect that the -bd_offset here leads to us adding it elsewhere
++ int off = s->ps.sps->qp_bd_offset;
++ lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off,
++ 52 + off) - off;
++ } else
++ lc->qp_y = qp_y;
++}
++// Shift for plane c_idx: the SPS pixel_shift for c_idx 0, one more for any other plane. NOTE(review): presumably because U and V are interleaved - confirm
++static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx)
++{
++ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
++}
++// Copy one element from src to dst: 4 bytes for pixel_shift 2, 2 bytes for pixel_shift 1, a single byte for any other value
++// "DSP" these?
++static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
++{
++ switch (pixel_shift)
++ {
++ case 2:
++ *(uint32_t *)dst = *(uint32_t *)src; // accessed as a uint32_t through a cast pointer
++ break;
++ case 1:
++ *(uint16_t *)dst = *(uint16_t *)src;
++ break;
++ default:
++ *dst = *src;
++ break;
++ }
++}
++// Save the first/last rows of a width x height block at src into sao_pixel_buffer_h and its first/last columns into sao_pixel_buffer_v
++static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src,
++ ptrdiff_t stride_src, int x, int y, int width, int height,
++ int c_idx, int x_ctb, int y_ctb)
++{
++ const unsigned int sh = pixel_shift(s, c_idx);
++ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx);
++ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx);
++ // The h buffer holds two picture-wide rows per CTB row: 2 * y_ctb (top row) and 2 * y_ctb + 1 (bottom row)
++ /* copy horizontal edges */
++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
++ src, width << sh);
++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
++ src + stride_src * (height - 1), width << sh);
++ // The v buffer holds two picture-high columns per CTB column: 2 * x_ctb (left) and 2 * x_ctb + 1 (right)
++ /* copy vertical edges */
++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
++
++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
++}
++// Copy every block flagged in s->is_pcm from dst1 back over src1; does nothing unless transquant bypass or pcm loop-filter-disable is enabled
++// N.B. Src & dst are swapped as this is a restore!
++// x0 & y0 are in luma coords
++// Width & height are in Y/C pels as appropriate
++// * Clear scope for optimisation here but not used enough to be worth it
++static void restore_tqb_pixels(const HEVCRpiContext * const s,
++ uint8_t *src1, const uint8_t *dst1,
++ const ptrdiff_t stride_src, const ptrdiff_t stride_dst,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int width, const int height,
++ const int c_idx)
++{
++ if (s->ps.pps->transquant_bypass_enable_flag ||
++ s->ps.sps->pcm.loop_filter_disable_flag)
++ {
++ const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width; // one flag byte per 64 luma pels across, one row per 8 luma lines
++ int blks_y = height >> (c_idx == 0 ? 3 : 2);
++ const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand
++ const unsigned int bheight = (c_idx == 0) ? 8 : 4;
++ const unsigned int sh = ((x0 >> 3) & 7); // bit position of the block containing x0 within its flag byte
++ const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1;
++
++ do {
++ unsigned int m = (*pcm >> sh) & mask;
++ uint8_t * bd = src1;
++ const uint8_t * bs = dst1;
++ while (m != 0) { // one bit per block, lowest bit is the leftmost block
++ if ((m & 1) != 0) {
++ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight);
++ }
++ m >>= 1;
++ bs += bwidth;
++ bd += bwidth;
++ }
++ src1 += stride_src * bheight;
++ dst1 += stride_dst * bheight;
++ pcm += s->ps.sps->pcm_width;
++ } while (--blks_y > 0);
++ }
++}
++// Element for CTB (x, y) of a per-CTB table stored in raster order; expects a variable named s to be in scope at the use site
++#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)])
++// Apply SAO (band or edge offset) in place to the CTB whose top-left luma sample is (x, y), saving its pre-filter border pixels for use by neighbouring CTBs
++static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y)
++{
++#if SAO_FILTER_N == 5
++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
++#elif SAO_FILTER_N == 6
++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
++#else
++#error Confused by size of sao fn array
++#endif
++ int c_idx;
++ int edges[4]; // 0 left 1 top 2 right 3 bottom
++ int x_ctb = x >> s->ps.sps->log2_ctb_size;
++ int y_ctb = y >> s->ps.sps->log2_ctb_size;
++ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb;
++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
++ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb);
++ // flags indicating unfilterable edges
++ uint8_t vert_edge[] = { 0, 0 };
++ uint8_t horiz_edge[] = { 0, 0 };
++ uint8_t diag_edge[] = { 0, 0, 0, 0 };
++ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb);
++ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag &&
++ !s->ps.pps->loop_filter_across_tiles_enabled_flag;
++ uint8_t restore = no_tile_filter || !lfase;
++ uint8_t left_tile_edge = 0;
++ uint8_t right_tile_edge = 0;
++ uint8_t up_tile_edge = 0;
++ uint8_t bottom_tile_edge = 0;
++ const int sliced = 1; // constant here, so only the sliced branches of the conditionals below are taken
++ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1);
++ // edges[i] is set when this CTB lies on that side of the picture
++ edges[0] = x_ctb == 0;
++ edges[1] = y_ctb == 0;
++ edges[2] = x_ctb == s->ps.sps->ctb_width - 1;
++ edges[3] = y_ctb == s->ps.sps->ctb_height - 1;
++
++#ifdef DISABLE_SAO
++ return;
++#endif
++
++ if (restore) { // work out which neighbouring CTBs lie across a slice or tile boundary that must not be filtered across
++ if (!edges[0]) {
++ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
++ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
++ }
++ if (!edges[2]) {
++ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
++ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
++ }
++ if (!edges[1]) {
++ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
++ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
++ }
++ if (!edges[3]) {
++ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
++ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
++ }
++ if (!edges[0] && !edges[1]) {
++ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
++ }
++ if (!edges[1] && !edges[2]) {
++ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
++ }
++ if (!edges[2] && !edges[3]) {
++ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
++ }
++ if (!edges[0] && !edges[3]) {
++ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
++ }
++ }
++ // plane_count is 2 here: c_idx 0 is luma and c_idx 1 filters U and V together (offset_val[1] and offset_val[2])
++ for (c_idx = 0; c_idx < plane_count; c_idx++) {
++ const unsigned int vshift = ctx_vshift(s, c_idx);
++ const unsigned int hshift = ctx_hshift(s, c_idx);
++ const int x0 = x >> hshift;
++ const int y0 = y >> vshift;
++ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx);
++ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift;
++ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift;
++ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0);
++ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0);
++ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; // selects the filter-function variant by block width rounded up to a multiple of 8
++ ptrdiff_t stride_dst;
++ uint8_t *dst;
++
++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0);
++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
++ uint8_t * const src = !sliced ?
++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) :
++ av_rpi_sand_frame_pos_c(s->frame, x0, y0);
++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
++ !sliced ? src - (1 << sh) :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) :
++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0);
++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
++ !sliced ? src + (width << sh) :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) :
++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0);
++
++ if (sliced && c_idx > 1) { // cannot trigger while plane_count is 2
++ break;
++ }
++
++// if (c_idx == 1)
++// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr);
++
++ switch (sao->type_idx[c_idx]) {
++ case SAO_BAND:
++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++ x_ctb, y_ctb);
++ if (s->ps.pps->transquant_bypass_enable_flag ||
++ s->ps.sps->pcm.loop_filter_disable_flag)
++ {
++ // Can't use the edge buffer here as it may be in use by the foreground
++ DECLARE_ALIGNED(64, uint8_t, dstbuf)
++ [2*MAX_PB_SIZE*MAX_PB_SIZE];
++ dst = dstbuf;
++ stride_dst = 2*MAX_PB_SIZE;
++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); // keep an unfiltered copy so flagged blocks can be restored below
++ if (sliced && c_idx != 0)
++ {
++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
++ sao->offset_val[1], sao->band_position[1],
++ sao->offset_val[2], sao->band_position[2],
++ width, height);
++ }
++ else
++ {
++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
++ sao->offset_val[c_idx], sao->band_position[c_idx],
++ width, height);
++ }
++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++ x, y, width, height, c_idx);
++ } else {
++ if (sliced && c_idx != 0)
++ {
++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
++ sao->offset_val[1], sao->band_position[1],
++ sao->offset_val[2], sao->band_position[2],
++ width, height);
++ }
++ else
++ {
++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
++ sao->offset_val[c_idx], sao->band_position[c_idx],
++ width, height);
++ }
++ }
++ sao->type_idx[c_idx] = SAO_APPLIED;
++ break;
++ case SAO_EDGE:
++ {
++ const int w = s->ps.sps->width >> hshift;
++ const int h = s->ps.sps->height >> vshift;
++ int top_edge = edges[1];
++ int bottom_edge = edges[3];
++ // Can't use the edge buffer here as it may be in use by the foreground
++ DECLARE_ALIGNED(64, uint8_t, dstbuf)
++ [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64];
++ // dst starts one row and 32 bytes into dstbuf, leaving room for the border row/column copied in below
++ stride_dst = RPI_HEVC_SAO_BUF_STRIDE;
++ dst = dstbuf + stride_dst + 32;
++
++ if (!top_edge) { // row above: taken from the saved h buffer if that neighbour already had SAO applied, else from the frame
++ uint8_t *dst1;
++ int src_idx;
++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
++
++ dst1 = dst - stride_dst;
++
++ if (src_l != NULL) {
++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
++ }
++
++ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
++
++ if (src_r != NULL) {
++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
++ }
++ }
++ if (!bottom_edge) { // row below: same choice between the saved h buffer and the frame
++ uint8_t * const dst1 = dst + height * stride_dst;
++ int src_idx;
++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
++ const unsigned int hoff = height * stride_src;
++
++ if (src_l != NULL) {
++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
++ }
++
++ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
++
++ if (src_r != NULL) {
++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
++ }
++ }
++ if (src_l != NULL) { // left column: from the saved v buffer if the left neighbour already had SAO applied, else from the frame
++ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++ ff_hevc_rpi_copy_vert(dst - (1 << sh),
++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
++ sh, height, stride_dst, 1 << sh);
++ } else {
++ ff_hevc_rpi_copy_vert(dst - (1 << sh),
++ src_l,
++ sh, height, stride_dst, stride_src);
++ }
++ }
++ if (src_r != NULL) { // right column: same choice between the saved v buffer and the frame
++ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++ ff_hevc_rpi_copy_vert(dst + (width << sh),
++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
++ sh, height, stride_dst, 1 << sh);
++ } else {
++ ff_hevc_rpi_copy_vert(dst + (width << sh),
++ src_r,
++ sh, height, stride_dst, stride_src);
++ }
++ }
++ // Borders are in place: copy the CTB body into dst, save its unfiltered edges, then filter (the frame at src is passed first, the dst copy second)
++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
++
++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++ x_ctb, y_ctb);
++ if (sliced && c_idx != 0)
++ {
++ // Class always the same for both U & V (which is just as well :-))
++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
++ width, height);
++ s->hevcdsp.sao_edge_restore_c[restore](src, dst,
++ stride_src, stride_dst,
++ sao,
++ edges, width,
++ height, c_idx,
++ vert_edge,
++ horiz_edge,
++ diag_edge);
++ }
++ else
++ {
++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
++ sao->eo_class[c_idx], width, height);
++ s->hevcdsp.sao_edge_restore[restore](src, dst,
++ stride_src, stride_dst,
++ sao,
++ edges, width,
++ height, c_idx,
++ vert_edge,
++ horiz_edge,
++ diag_edge);
++ }
++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++ x, y, width, height, c_idx);
++ sao->type_idx[c_idx] = SAO_APPLIED;
++ break;
++ }
++ }
++ }
++ // Optional: once a 256-pel-wide column (or the right picture edge) is finished, convert it with av_rpi_sand16_to_sand8 into the extra buffer
++#if RPI_ZC_SAND_8_IN_10_BUF
++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL &&
++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2]))
++ {
++ const unsigned int stride1 = frame_stride1(s->frame, 1);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame);
++ const unsigned int xoff = (x >> 8) * stride2 * stride1;
++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size);
++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1;
++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; // NOTE(review): buf[4] is hard-coded while the guard above tests buf[RPI_ZC_SAND_8_IN_10_BUF] - confirm they agree
++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1;
++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1;
++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255);
++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y;
++
++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size);
++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3);
++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3);
++ }
++#endif
++}
++
++// When bits are delivered to deblock we want them
++//#define TL 1
++//#define TR 2
++//#define BL 4
++//#define BR 8
++
++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br
++// so we need to rearrange before passing on
++// Gather the is_pcm flag bytes for (x, y), its right neighbour byte and the same two on the next flag row, shifted so bit 0 is the block containing x; bytes are widened to uint32_t so << 24 cannot overflow a signed int
++static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
++{
++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
++ return ((uint32_t)pcm[0] |
++ ((uint32_t)pcm[1] << 8) |
++ ((uint32_t)pcm[s->ps.sps->pcm_width] << 16) |
++ ((uint32_t)pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7);
++}
++// Single-row version of pcm4: the flag byte for (x, y) and the next byte, shifted so bit 0 is the block containing x
++static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
++{
++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
++ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7);
++}
++// Address of the 32-bit word of block-strength data covering (x, y): rows are 1 << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT bytes apart, column strips are stride2 bytes apart
++// We cast away const here as we want this to work for both get and set
++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
++{
++ return (uint32_t *)(bs +
++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
++#warning Unexpected masks
++ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes
++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
++ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) +
++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
++#error Stride1 < return size
++#endif
++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
++}
++// Byte-granular version of bs_ptr32: address of the byte of block-strength data covering (x, y); const is cast away on return here too
++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
++{
++ return (uint8_t *)(bs +
++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
++ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) +
++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
++}
++
++// Returns the strength bits for pels [xl, xr) on row y, right-aligned; 0 if the range is empty. The span is rounded up to a multiple of 8 pels, 4 bits per 8 pels
++// Get block strength
++// Given how we call we will always get within the 32bit boundaries
++static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2,
++ unsigned int xl, unsigned int xr, const unsigned int y)
++{
++ if (xr <= xl) {
++ return 0;
++ }
++ else
++ {
++#if HAVE_ARMV6T2_INLINE
++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
++#error This case not yet handled in bs_get32
++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
++#error Stride1 < return size
++#endif
++ uint32_t tmp; // the asm below follows the same steps as the C fallback in the #else branch
++ __asm__ (
++ "lsr %[tmp], %[xl], %[xl_shift] \n\t"
++ "rsb %[xr], %[xl], %[xr] \n\t"
++ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t"
++ "add %[xr], %[xr], #7 \n\t"
++ "lsr %[bs], %[y], %[y_shift1] \n\t"
++ "bic %[xr], %[xr], #7 \n\t"
++ "ubfx %[xl], %[xl], #1, #5 \n\t"
++ "lsr %[xr], %[xr], #1 \n\t"
++ "cmp %[xr], #32 \n\t"
++ "mvn %[tmp], #0 \n\t"
++ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t"
++ "lsl %[tmp], %[tmp], %[xr] \n\t"
++ "lsr %[xl], %[bs], %[xl] \n\t"
++ "it ne \n\t"
++ "bicne %[bs], %[xl], %[tmp] \n\t"
++ : // Outputs
++ [bs]"+r"(bs),
++ [stride2]"+r"(stride2),
++ [xl]"+r"(xl),
++ [xr]"+r"(xr),
++ [tmp]"=&r"(tmp)
++ : // Inputs
++ [y]"r"(y),
++ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT),
++ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR),
++ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++ : // Clobbers
++ "cc"
++ );
++ return (uint32_t) bs;
++#else
++ const uint32_t a = *bs_ptr32(bs, stride2, xl, y);
++ const unsigned int n = ((xr - xl + 7) & ~7) >> 1; // number of result bits
++ // A full 32-bit span returns the word unshifted; otherwise shift down to xl's bit position and keep the low n bits
++ return n == 32 ? a :
++ (a >> ((xl >> 1) & 31)) & ~(~0U << n);
++#endif
++ }
++}
++// bs_get32 on s->bs_horizontal; the assert (av_assert2) checks that xl and xr - 1 lie in the same CTB column
++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++{
++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
++ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y);
++}
++
++ // Fetch the strength bits for pels [xl, xr) on row y from s->bs_vertical
++ static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++ {
++     // First and last requested pel must fall in the same CTB column
++     av_assert2((xl >> s->ps.sps->log2_ctb_size) == ((xr - 1) >> s->ps.sps->log2_ctb_size));
++
++     {
++         const uint32_t strengths = bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y);
++         return strengths;
++     }
++ }
++
++
++ // Luma deblock of the area described by bounds.
++ //
++ // Works across the block in CTB-wide columns and down each column in
++ // 8-row steps, applying hevc_v_loop_filter_luma2 then
++ // hevc_h_loop_filter_luma2 wherever the stored strengths are non-zero.
++ //
++ // end_x / end_y  non-zero when bounds reaches the right / bottom picture
++ //                edge; otherwise the right limit is pulled in by 1 pel
++ //                and the bottom limit by 8 rows
++ static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
++ {
++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ const unsigned int ctb_size = (1 << log2_ctb_size);
++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 1);
++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
++ const DBParams * cb_dbp = s->deblock + ctb_n;
++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);
++
++ unsigned int cb_x;
++
++ // Do in CTB-shaped blocks
++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp)
++ {
++ // bv_*: x range for vertical edges (never the picture's left edge)
++ // bh_*: x range for horizontal edges, offset 8 pels to the left
++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
++ const unsigned int bv_l = FFMAX(cb_x, 8);
++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9;
++ const unsigned int bh_l = bv_l - 8;
++ unsigned int y;
++
++ // Main body
++ // Starts one 8-row strip above bounds (unless at the picture top)
++ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8)
++ {
++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y);
++
++ // Rows above bounds.y use the deblock params of the CTB row above
++ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp;
++ // QP rows for the row above (qta) and at (qtb) the horizontal edge
++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
++
++ if (vbs != 0)
++ {
++ const uint8_t * const tcv = tctable + dbp->tc_offset;
++ const uint8_t * const betav = betatable + dbp->beta_offset;
++ // NOTE(review): pcm2/pcm4 presumably return per-8-pel "do not
++ // filter" flags - their definitions are not visible here
++ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
++ unsigned int x;
++
++ // 4 strength bits are consumed per 8-pel step
++ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1)
++ {
++ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3)
++ {
++ // Rounded mean of the QPs either side of the vertical edge
++ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ // tc for the first strength pair goes in the low 16 bits,
++ // tc for the second pair in the high 16 bits
++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betav[qp],
++ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) |
++ (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16),
++ pcmfa & 3,
++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
++ }
++ }
++ }
++
++ // No horizontal edge at the top of the picture
++ if (y != 0)
++ {
++ uint32_t hbs;
++
++ // H left - mostly separated out so we only need a uint32_t hbs
++ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0)
++ {
++ const unsigned int x = bh_l;
++ const unsigned int pcmfa = pcm4(s, bh_l, y - 1);
++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ // This stub lies in the CTB to the left so uses its params
++ const DBParams * const dbph = dbp - 1;
++ const uint8_t * const tc = tctable + dbph->tc_offset + qp;
++
++ av_assert2(cb_x - bh_l == 8);
++
++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betatable[qp + dbph->beta_offset],
++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
++ }
++
++ // H
++ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop
++ {
++ unsigned int x;
++ unsigned int pcmfa = pcm4(s, cb_x, y - 1);
++
++ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1)
++ {
++ // Skip when both flag bits (b0 and b16) are set
++ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0)
++ {
++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const uint8_t * const tc = tctable + dbp->tc_offset + qp;
++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betatable[qp + dbp->beta_offset],
++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
++ }
++ }
++ }
++ }
++
++ }
++ }
++}
++
++ // Rounded mean of the two qp_y_tab entries either side of the vertical
++ // edge at (x, y), i.e. the entries for x - 1 and x on the row holding y
++ static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
++ {
++     const unsigned int cb_shift = s->ps.sps->log2_min_cb_size;
++     const int8_t * const qp_row = s->qp_y_tab + (y >> cb_shift) * s->ps.sps->min_cb_width;
++     const int qp_left = qp_row[(x - 1) >> cb_shift];
++     const int qp_right = qp_row[x >> cb_shift];
++
++     return (qp_left + qp_right + 1) >> 1;
++ }
++
++ // Chroma deblock of the area described by bounds.
++ //
++ // Works across the block in CTB-wide columns. Only the b1 strength bit of
++ // each entry is used (masks 0x02020202 / 0x22222222) and edges are taken
++ // on a 16 luma-pel grid. Chroma positions passed to the filters are the
++ // luma coordinates shifted right by 1.
++ //
++ // end_x / end_y  non-zero when bounds reaches the right / bottom picture
++ //                edge; otherwise the right and bottom limits are pulled
++ //                in by 8
++ static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
++ {
++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ const unsigned int ctb_size = (1 << log2_ctb_size);
++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 8);
++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
++ const DBParams * dbp = s->deblock + ctb_n;
++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);
++ // Per-plane lookup tables applied to the averaged luma QP before the
++ // tc lookup (index 1 for U, 2 for V)
++ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1];
++ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2];
++
++ unsigned int cb_x;
++
++ // bounds must start CTB-aligned and be at most one CTB high
++ av_assert1((bounds.x & (ctb_size - 1)) == 0);
++ av_assert1((bounds.y & (ctb_size - 1)) == 0);
++ av_assert1(bounds.h <= ctb_size);
++
++ // Do in CTB-shaped blocks
++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) {
++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
++ const unsigned int bv_l = FFMAX(cb_x, 16);
++ unsigned int y;
++
++ // V above
++ if (bounds.y != 0) {
++ // Deblock V up 8
++ // CTB above current
++ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm
++ const unsigned int y = bounds.y - 8;
++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U;
++
++ if (vbs != 0)
++ {
++ // NOTE(review): pcm2/pcm4 presumably return "do not filter"
++ // flags - their definitions are not visible here
++ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
++ // Uses the tc_offset of the CTB row above
++ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset;
++ unsigned int x;
++
++ // 8 strength bits are consumed per 16-pel step
++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
++ {
++ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0)
++ {
++ const int qp0 = q2h(s, x, y);
++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
++ pcmfa & 3);
++ }
++ }
++ }
++ }
++
++ for (y = bounds.y; y < b_b; y += 16)
++ {
++ // Strengths for rows y and y + 8 are merged: the second row lands
++ // 4 bits up, and is dropped when it would pass the bottom limit
++ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) |
++ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4);
++
++ // V
++ if (vbs != 0)
++ {
++ unsigned int x;
++ // When only the upper row is valid the high half is forced to
++ // all-ones so the lower half is never filtered
++ unsigned int pcmfa =
++ (y + 16 > b_b ?
++ pcm2(s, bv_l - 1, y) | 0xffff0000 :
++ pcm4(s, bv_l - 1, y));
++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
++
++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
++ {
++ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
++ {
++ const int qp0 = q2h(s, x, y);
++ const int qp1 = q2h(s, x, y + 8);
++ // tc bytes, low to high: U row 0, V row 0, U row 1, V row 1
++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++ }
++ }
++ }
++
++ // H
++ if (y != 0)
++ {
++ uint32_t hbs;
++ const unsigned int bh_l = bv_l - 16;
++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
++ // QP rows above (qta) and at (qtb) the horizontal edge
++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
++
++ // H left - mostly separated out so we only need a uint32_t hbs
++ // Stub is width 8 to the left of bounds, but width 16 internally
++ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0)
++ {
++ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
++
++ // Chop off bits we don't want...
++ if (bh_l < bounds.x) {
++ pcmfa |= 0x10001; // TL|BL pre rearrangement
++ hbs &= ~3; // Make BS 0
++ }
++
++ // Double check we still want this
++ if (hbs != 0 && (~pcmfa & 0x30003) != 0)
++ {
++ const unsigned int x = bh_l;
++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++ // Stub lies in the CTB to the left so uses its tc_offset
++ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset;
++
++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++ }
++ }
++
++ // H main
++ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
++ {
++ unsigned int x;
++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it
++
++ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
++ {
++ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
++ {
++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
++
++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++ }
++ }
++ }
++ }
++ }
++ }
++}
++
++ // Offset of x within its 2^log2_n aligned block: 0 when x is on a boundary
++ static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n)
++ {
++     const unsigned int low_bits = (1U << log2_n) - 1;
++
++     return x & low_bits;
++ }
++
++ // OR the strength bits (bsf & mask) into s->bs_horizontal at pel (x, y).
++ // y must be a multiple of 8.
++ static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++ {
++     av_assert2((y & 7) == 0);
++
++     bsf &= mask;
++     if (bsf == 0)
++         return;
++
++     // Unlike bsf_stash there is no simultaneous-update hazard here (other
++     // threads will be working on a different y) so a plain 32-bit
++     // read-modify-write is fine
++     {
++         const unsigned int bit_pos = (x >> 1) & 31;
++         *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << bit_pos;
++     }
++ }
++
++
++ // OR the strength bits (bsf & mask) into s->bs_vertical starting at pel
++ // (x, y). x must be a multiple of 8.
++ //
++ // The layout is slightly odd but matches what the deblock code wants to
++ // read, and it is easier to do the contortions here than there:
++ //
++ // Arrange (LE) {x0y0, x0y4, x8y0, x8,y4}, {x16y0, x16y4, x24y0, x24y4},...
++ static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++ {
++     uint8_t * dst;
++     unsigned int shift;
++
++     av_assert2((x & 7) == 0);
++
++     bsf &= mask;
++     if (bsf == 0)
++         return;
++
++     dst = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
++     shift = ((x & 8) | (y & 4)) >> 1;
++
++     // Up to 4 bits: everything lands in a single byte
++     if (mask <= 0xf) {
++         *dst |= (bsf << shift);
++         return;
++     }
++
++     // Otherwise one nibble per HEVC_RPI_BS_STRIDE1_BYTES step until no
++     // set bits remain
++     for (; bsf != 0; bsf >>= 4, dst += HEVC_RPI_BS_STRIDE1_BYTES)
++         *dst |= (bsf & 0xf) << shift;
++ }
++
++ // Thin wrapper round the hevc_deblocking_boundary_strengths DSP entry:
++ // unpacks both lists of the P and Q side RefPicLists and converts the
++ // two mvf strides from elements to bytes.
++ static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
++     const unsigned int rep, const unsigned int dup,
++     const unsigned int mvf_stride0,
++     const unsigned int mvf_stride1,
++     const RefPicList * const rpl_p, const RefPicList * const rpl_q,
++     const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
++ {
++     const RefPicList * const p_l0 = rpl_p + 0;
++     const RefPicList * const p_l1 = rpl_p + 1;
++     const RefPicList * const q_l0 = rpl_q + 0;
++     const RefPicList * const q_l1 = rpl_q + 1;
++
++     return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
++         mvf_p, mvf_q,
++         p_l0->list, p_l1->list, q_l0->list, q_l1->list,
++         sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1);
++ }
++
++
++ // Compute and store deblock boundary strengths for the top and left
++ // edges (and any internal prediction split edges) of one square block.
++ //
++ // x0, y0           top-left of the block; asserted aligned to its size
++ // log2_trafo_size  log2 of the block size, asserted to be 2..6
++ // is_coded_block   non-zero if the block is coded; ignored when the CU
++ //                  is intra (all strength bits are then set)
++ //
++ // Strengths are held as 2 bits per 4 pels: bsf_mask covers the block
++ // width and bsf_cbf is the b0 of each pair. Results go to
++ // s->bs_horizontal / s->bs_vertical via hbs_set / vbs_set, and the
++ // per-block flags are left in s->bsf_stash_up / s->bsf_stash_left.
++ void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s,
++ const HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int log2_trafo_size,
++ const int is_coded_block)
++ {
++ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0);
++ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
++ const RefPicList * const rpl = s->refPicList;
++ // Rep count for bsf_mv when running with min_pu chunks
++ const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
++ const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
++ const unsigned int trafo_size = (1U << log2_trafo_size);
++ // trafo_size / 2 bits; size 64 needs all 32 so avoid the 1U << 32
++ const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1;
++ const uint32_t bsf_cbf = (bsf_mask & 0x55555555);
++
++ // Do we cover a pred split line?
++ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split;
++ const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split;
++
++ uint32_t bsf_h;
++ uint32_t bsf_v;
++
++#ifdef DISABLE_STRENGTHS
++ return;
++#endif
++
++ // We are always on a size boundary
++ av_assert2((x0 & (trafo_size - 1)) == 0);
++ av_assert2((y0 & (trafo_size - 1)) == 0);
++ // log2_trafo_size not really a transform size; we can have to deal
++ // with size 2^6 blocks
++ av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6);
++
++ // Retrieve and update coded (b0), intra (b1) bs flags
++ //
++ // Store on min width (rather than uint32_t) to avoid possible issues
++ // with another thread on another core running wpp using the same
++ // memory (min CTB = 16 pels = 4 bsf els = 8 bits)
++ //
++ // In bsf BS=2 is represented by 3 as it is much easier to test & set
++ // and the actual deblock code tests for 0 and b1 set/not-set so 2 and
++ // 3 will work the same
++ {
++ // Given where we are called from is_cbf_luma & is_intra will be constant over the block
++ const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? bsf_cbf : 0;
++ // One stash byte per 16 pels
++ uint8_t *const p = s->bsf_stash_up + (x0 >> 4);
++ uint8_t *const q = s->bsf_stash_left + (y0 >> 4);
++
++ // Access width is chosen to match the block size: part of a byte
++ // for sizes 4 and 8, then 1, 2 or 4 whole bytes
++ switch (log2_trafo_size)
++ {
++ case 2:
++ case 3:
++ {
++ // Bit position of this block within its stash byte
++ const unsigned int sh_h = (x0 >> 1) & 7;
++ const unsigned int sh_v = (y0 >> 1) & 7;
++ bsf_h = *p;
++ bsf_v = *q;
++ *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h);
++ *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v);
++ bsf_h >>= sh_h;
++ bsf_v >>= sh_v;
++ break;
++ }
++ case 4:
++ bsf_h = *p;
++ bsf_v = *q;
++ *p = bsf0;
++ *q = bsf0;
++ break;
++ case 5:
++ bsf_h = *(uint16_t *)p;
++ bsf_v = *(uint16_t *)q;
++ *(uint16_t *)p = bsf0;
++ *(uint16_t *)q = bsf0;
++ break;
++ case 6:
++ default:
++ bsf_h = *(uint32_t *)p;
++ bsf_v = *(uint32_t *)q;
++ *(uint32_t *)p = bsf0;
++ *(uint32_t *)q = bsf0;
++ break;
++ }
++
++ // Edge strength = previously stashed flags OR'd with this block's
++ bsf_h |= bsf0;
++ bsf_v |= bsf0;
++ }
++
++ // Do Horizontal
++ if ((y0 & 7) == 0)
++ {
++ // Boundary upper
++ // Skipped at the picture top, and on a CTB boundary when the
++ // masked boundary flags mark the upper slice/tile edge
++ if (y0 != 0 &&
++ (off_boundary(y0, s->ps.sps->log2_ctb_size) ||
++ (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0))
++ {
++ // Look at MVs (BS=1) if we don't already have a full set of bs bits
++ if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split))
++ {
++ // If we aren't on the top boundary we must be in the middle
++ // and in that case we know where mvf can change
++ const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
++ // On a CTB boundary the block above uses the saved rpl_up list
++ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
++ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
++ rpl;
++
++ bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++ trafo_size >> (log2_min_pu_size + log2_rep),
++ trafo_size >> (log2_min_pu_size + log2_rep),
++ rpl, rpl_top,
++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
++ }
++
++ // Finally put the results into bs
++ hbs_set(s, x0, y0, bsf_mask, bsf_h);
++ }
++
++ // Max of 1 pu internal split - ignore if not on 8pel boundary
++ if (has_y_split && !off_boundary(lc->cu.y_split, 3))
++ {
++ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
++ // If we have the x split as well then it must be in the middle
++ const unsigned int log2_rep = has_x_split ? 1 : 0;
++
++ // Compare against the stash row directly above the split
++ hbs_set(s, x0, lc->cu.y_split, bsf_mask,
++ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++ trafo_size >> (log2_min_pu_size + log2_rep),
++ trafo_size >> (log2_min_pu_size + log2_rep),
++ rpl, rpl,
++ mvf, mvf - MVF_STASH_WIDTH_PU));
++ }
++ }
++
++ // And again for vertical - same logic as horizontal just in the other direction
++ if ((x0 & 7) == 0)
++ {
++ // Boundary left
++ if (x0 != 0 &&
++ (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
++ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
++ {
++ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
++ {
++ const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
++ // On a CTB boundary the block to the left uses the saved rpl_left list
++ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
++ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] :
++ rpl;
++
++ // Strides here step down rows; the left neighbour's stride
++ // comes from mvf_left_stride
++ bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++ rpl, rpl_left,
++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0));
++ }
++
++ vbs_set(s, x0, y0, bsf_mask, bsf_v);
++ }
++
++ if (has_x_split && !off_boundary(lc->cu.x_split, 3))
++ {
++ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0);
++ const unsigned int log2_rep = has_y_split ? 1 : 0;
++
++ // Compare against the stash entry directly left of the split
++ vbs_set(s, lc->cu.x_split, y0, bsf_mask,
++ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++ rpl, rpl,
++ mvf, mvf - 1));
++ }
++ }
++}
++
++#undef LUMA
++#undef CB
++#undef CR
++
++ // Unsigned saturating subtract: a - b, clamped at 0 instead of wrapping
++ static inline unsigned int ussub(const unsigned int a, const unsigned int b)
++ {
++     if (a < b)
++         return 0;
++
++     return a - b;
++ }
++
++ // Returns non-zero iff x, once shifted right by the frame's xshl, is a
++ // multiple of 64, i.e. it sits exactly on a 64-unit line boundary.
++ static inline int cache_boundry(const AVFrame * const frame, const unsigned int x)
++ {
++     // Test the low 6 bits (the offset within the line). Masking with ~63
++     // would instead test "value < 64", which is false for nearly every
++     // right-hand bound and would make the caller's aligned-flush path dead.
++     return ((x >> av_rpi_sand_frame_xshl(frame)) & 63) == 0;
++ }
++
++ // Run the in-loop filters over one block: luma deblock, chroma deblock,
++ // SAO (if enabled in the SPS) and finally a cache writeback+invalidate of
++ // the area that is now complete.
++ //
++ // bounds  area to filter
++ // eot     non-zero at end of tile; forces the right-hand column to be
++ //         flushed down to the bottom of bounds
++ //
++ // Returns 0 unless bounds reaches the right picture edge and a flush was
++ // issued; then INT_MAX if it also reaches the bottom edge, otherwise the
++ // bottom row limit of the flushed area.
++ int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot)
++ {
++ const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
++ int x, y;
++
++ // Right and bottom limits (exclusive) of bounds
++ const unsigned int br = bounds.x + bounds.w;
++ const unsigned int bb = bounds.y + bounds.h;
++
++ const int x_end = (br >= s->ps.sps->width);
++ const int y_end = (bb >= s->ps.sps->height);
++
++ // Deblock may not touch the edges of the bound as they are still needed
++ // for Intra pred
++ //
++ // Deblock is disabled with a per-slice flag
++ // Given that bounds may cover multiple slices & we deblock outside bounds
++ // anyway we can't avoid deblock using that flag - about the only thing we
++ // could do is have a "no deblock seen yet" flag but it doesn't really
++ // seem worth the effort
++
++ deblock_y_blk(s, bounds, x_end, y_end);
++ deblock_uv_blk(s, bounds, x_end, y_end);
++
++ // SAO needs
++ // (a) CTB alignment
++ // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel
++ {
++ // xo, yo: distance back from bounds to the CTB boundary at or before
++ // (bounds - 16); the SAO area is bounds shifted up/left by this much,
++ // clamped at 0 and extended to the picture edge on the last row/column
++ const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1));
++ const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1));
++ const unsigned int yt = ussub(bounds.y, yo);
++ const unsigned int yb = y_end ? bb : ussub(bb, yo);
++ const unsigned int xl = ussub(bounds.x, xo);
++ const unsigned int xr = x_end ? br : ussub(br, xo);
++
++ if (s->ps.sps->sao_enabled)
++ {
++ for (y = yt; y < yb; y += ctb_size) {
++ for (x = xl; x < xr; x += ctb_size) {
++ sao_filter_CTB(s, x, y);
++ }
++ }
++ }
++
++ // Cache invalidate
++ // y doubles as the return value from here on
++ y = 0;
++ if (xr != 0 && yb != 0)
++ {
++ // llen: stride1 converted from bytes by the frame's xshl; the
++ // mask arithmetic assumes it is a power of 2
++ const unsigned int llen =
++ (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame));
++ const unsigned int mask = ~(llen - 1);
++ // il/ir, it/ib: left/right, top/bottom limits of the flush
++ const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask;
++ const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask;
++ const unsigned int it = ussub(yt, 1);
++ const unsigned int ib = y_end ? bb : yb - 1;
++
++ if (il < ir) {
++ rpi_cache_buf_t cbuf;
++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf);
++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ il, it, ir - il, ib - it,
++ ctx_vshift(s, 1), 1, 1);
++
++ // If we have to commit the right hand tile boundary due to
++ // cache boundary considerations then at EoTile we must commit
++ // that boundary to bottom of tile (bounds)
++ if (ib != bb && ir == br && eot) {
++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ br - 1, ib, 1, bb - ib,
++ ctx_vshift(s, 1), 1, 1);
++ }
++
++ rpi_cache_flush_finish(rfe);
++
++ // Only report progress once the full picture width is done
++ if (x_end)
++ y = y_end ? INT_MAX : ib;
++
++// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1);
++ }
++ }
++ }
++
++ return y;
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_mv.h
+@@ -0,0 +1,71 @@
++#ifndef AVCODEC_RPI_HEVC_MV_H
++#define AVCODEC_RPI_HEVC_MV_H
++
++#include "config.h"
++
++ // A motion vector packed into 32 bits; see MV_X / MV_Y / MV_XY for the
++ // field layout (x in the low 16 bits, y in the high 16 bits)
++ typedef int32_t MvXY;
++
++ // Motion information for one prediction unit
++ typedef struct HEVCRpiMvField {
++ MvXY xy[2]; // Packed mv, one per reference list
++ int8_t ref_idx[2]; // Reference index, one per reference list
++ int8_t pred_flag; // PF_xxx flags saying which lists are in use
++ int8_t dummy; // To 12 bytes
++} HEVCRpiMvField;
++
++
++ // Packed motion vector accessors: x is the signed low 16 bits of the
++ // word, y the signed high 16 bits.
++ #define MV_X(xy) (((xy) << 16) >> 16)
++ #define MV_Y(xy) ((xy) >> 16)
++ // Both arguments are parenthesised so that expression arguments using
++ // operators of lower precedence than & (|, ^, ?:, ...) expand correctly
++ #define MV_XY(x, y) (((x) & 0xffff) | ((y) << 16))
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_mv_arm.h"
++#endif
++
++ #ifndef mvxy_add
++ // Generic component-wise add of two packed mvs; only built when the
++ // arch header has not already supplied its own mvxy_add.
++ // The x sum is truncated to 16 bits when repacked.
++ static inline MvXY mvxy_add(const MvXY a, const MvXY b)
++ {
++     const int sum_x = MV_X(a) + MV_X(b);
++     const int sum_y = MV_Y(a) + MV_Y(b);
++
++     return MV_XY(sum_x, sum_y);
++ }
++ #endif
++
++
++ #ifndef mv_scale_xy
++ // Generic temporal scaling of a packed mv; only built when the arch
++ // header has not already supplied its own mv_scale_xy.
++ //
++ // src  packed mv to scale
++ // td   POC distance the mv was coded over; clipped to int8, and 0 is
++ //      treated as 1 to avoid a divide by zero
++ // tb   POC distance wanted; clipped to int8
++ //
++ // Returns the scaled mv with each component clipped to int16.
++ static inline MvXY mv_scale_xy(const MvXY src, int td, int tb)
++ {
++     int tx, scale_factor;
++     int scaled_x, scaled_y;
++
++     td = td == 0 ? 1 : av_clip_int8(td);
++     tb = av_clip_int8(tb);
++     tx = (0x4000 + (abs(td) >> 1)) / td;
++     scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
++
++     scaled_x = scale_factor * MV_X(src);
++     scaled_y = scale_factor * MV_Y(src);
++
++     // + 127 + (v < 0) then >> 8 rounds the magnitude to nearest
++     return MV_XY(
++         av_clip_int16((scaled_x + 127 + (scaled_x < 0)) >> 8),
++         av_clip_int16((scaled_y + 127 + (scaled_y < 0)) >> 8));
++ }
++ #endif
++
++ // 8.3.1 states that the bitstream may not contain poc diffs that do not
++ // fit in 16 bits, so given that we don't care about the high bits we only
++ // store the low 16 + LT & Inter flags
++ //
++ // Layout of a stored col poc word:
++ //   b0..b15  low 16 bits of the poc
++ //   b16      inter flag
++ //   b17      long-term flag
++ // A word of 0 (COL_POC_INTRA) means no inter prediction for that list
++
++ #define COL_POC_INTRA 0
++ #define COL_POC_INTER (1 << 16)
++ #define COL_POC_LT (1 << 17)
++ // Signed 16-bit difference x - y (the flag bits fall out in the cast)
++ #define COL_POC_DIFF(x,y) ((int16_t)((x) - (y)))
++ // Build a stored word for an inter entry; lt selects the long-term flag
++ #define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
++ // Non-zero iff the long-term flag is set in stored word x
++ #define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
++
++ // Collocated motion info for a single reference list
++ typedef struct ColMv_s {
++ int32_t poc; // Stored poc word in COL_POC_xxx format
++ int32_t xy; // Packed motion vector
++} ColMv;
++
++ // Collocated motion info for both reference lists (L[0] and L[1])
++ typedef struct ColMvField_s {
++ ColMv L[2];
++} ColMvField;
++
++
++
++#endif // AVCODEC_RPI_HEVC_MV_H
+--- /dev/null
++++ b/libavcodec/rpi_hevc_mvs.c
+@@ -0,0 +1,487 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 Anand Meher Kotra
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++
++ // Non-zero iff (xN, yN) and (xP, yP) lie in the same square region of
++ // size 2^plevel (the parallel merge level)
++ static av_always_inline int
++ is_eq_mer(const unsigned int plevel,
++     const unsigned int xN, const unsigned int yN,
++     const unsigned int xP, const unsigned int yP)
++ {
++     const int same_col = (xN >> plevel) == (xP >> plevel);
++     const int same_row = (yN >> plevel) == (yP >> plevel);
++
++     return same_col && same_row;
++ }
++
++ // Check if the mvs and ref indices are the same between a and b.
++ // Returns 1 when both have the same pred_flag and, for every list that
++ // pred_flag marks as in use, the same ref_idx and packed mv; else 0.
++ static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
++ {
++     return a->pred_flag == b->pred_flag &&
++         ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) &&
++         ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1]));
++ }
++
++/*
++ * 8.5.3.1.7 temporal luma motion vector prediction
++ */
++static int temporal_luma_motion_vector(const HEVCRpiContext * const s,
++ const HEVCRpiLocalContext * const lc, const int x0, const int y0,
++ const int nPbW, const int nPbH, const int refIdxLx,
++ MvXY * const mvLXCol, const int X)
++{
++ int x, y;
++ const ColMv * cmv = NULL;
++
++ HEVCRpiFrame * const col_ref = s->ref->collocated_ref;
++ const RefPicList * const refPicList = s->refPicList + X;
++ const int cur_lt = refPicList->isLongTerm[refIdxLx];
++
++ *mvLXCol = 0;
++ // Unlikely but we might have a col_ref IDR frame!
++ if (col_ref->col_mvf == NULL)
++ return 0;
++
++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH);
++
++ //bottom right collocated motion vector
++ x = x0 + nPbW;
++ y = y0 + nPbH;
++
++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
++ y < s->ps.sps->height &&
++ x < s->ps.sps->width)
++ {
++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
++ (y >> 4) * s->col_mvf_stride;
++
++ if (col->L[0].poc != COL_POC_INTRA &&
++ (col->L[1].poc == COL_POC_INTRA ||
++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
++ {
++ cmv = col->L + 0;
++ }
++ else if (col->L[1].poc != COL_POC_INTRA)
++ {
++ cmv = col->L + 1;
++ }
++ }
++
++ // derive center collocated motion vector
++ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt)
++ {
++ cmv = NULL;
++ x = x0 + (nPbW >> 1);
++ y = y0 + (nPbH >> 1);
++
++ {
++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
++ (y >> 4) * s->col_mvf_stride;
++
++ if (col->L[0].poc != COL_POC_INTRA &&
++ (col->L[1].poc == COL_POC_INTRA ||
++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
++ {
++ cmv = col->L + 0;
++ }
++ else if (col->L[1].poc != COL_POC_INTRA)
++ {
++ cmv = col->L + 1;
++ }
++ }
++ }
++
++ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc))
++ return 0;
++
++ {
++ const int col_poc = col_ref->poc;
++ const int ref_poc = refPicList->list[refIdxLx];
++
++ *mvLXCol = (cur_lt ||
++ cmv->poc == col_poc ||
++ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ?
++ cmv->xy :
++ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc);
++ }
++
++ return cmv != NULL;
++}
++
++ // NULL-tolerant motion compare: 0 if b is NULL, otherwise whether a and
++ // b carry identical motion (a must be non-NULL)
++ static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
++ {
++     if (b == NULL)
++         return 0;
++
++     return compare_mv_ref_idx(a, b) != 0;
++ }
++
++
++
++/*
++ * 8.5.3.1.2 Derivation process for spatial merging candidates
++ */
++static inline const HEVCRpiMvField *
++derive_spatial_merge_candidates(
++ const HEVCRpiContext * const s,
++ const HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int nPbW, const unsigned int nPbH,
++ const unsigned int avail,
++ const unsigned int part_idx,
++ const unsigned int merge_idx,
++ HEVCRpiMvField * const mvf_t)
++{
++ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N);
++ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD);
++
++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
++ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level;
++ const unsigned int part_mode = lc->cu.part_mode;
++
++ const HEVCRpiMvField * perm[4];
++ unsigned int nb_merge_cand = 0;
++
++ // singleMCLFlag => part_idx == 0 so no need to test for it
++ if ((avail & AVAIL_L) == 0 ||
++ (part_idx == 1 &&
++ ((parts_a1 >> part_mode) & 1) != 0 ||
++ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) ||
++ mvf_a1->pred_flag == PF_INTRA)
++ {
++ mvf_a1 = NULL;
++ }
++ else
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_a1;
++ perm[nb_merge_cand++] = mvf_a1;
++ }
++
++ if ((avail & AVAIL_U) == 0 ||
++ (part_idx == 1 &&
++ ((parts_b1 >> part_mode) & 1) != 0 ||
++ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) ||
++ mvf_b1->pred_flag == PF_INTRA)
++ {
++ mvf_b1 = NULL;
++ }
++ else if (!mvf_eq(mvf_b1, mvf_a1))
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_b1;
++ perm[nb_merge_cand++] = mvf_b1;
++ }
++
++ // above right spatial merge candidate
++ // Never need mvf_b0 again so don't bother zeroing if navail
++ if ((avail & AVAIL_UR) != 0 &&
++ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) &&
++ mvf_b0->pred_flag != PF_INTRA &&
++ !mvf_eq(mvf_b0, mvf_b1))
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_b0;
++ perm[nb_merge_cand++] = mvf_b0;
++ }
++
++ // left bottom spatial merge candidate
++ // Never need mvf_a0 again so don't bother zeroing if navail
++ if ((avail & AVAIL_DL) != 0 &&
++ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) &&
++ mvf_a0->pred_flag != PF_INTRA &&
++ !mvf_eq(mvf_a0, mvf_a1))
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_a0;
++ perm[nb_merge_cand++] = mvf_a0;
++ }
++
++ // above left spatial merge candidate
++ if (nb_merge_cand != 4 &&
++ (avail & AVAIL_UL) != 0 &&
++ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0))
++ {
++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL
++
++ if (mvf_b2->pred_flag != PF_INTRA &&
++ !mvf_eq(mvf_b2, mvf_a1) &&
++ !mvf_eq(mvf_b2, mvf_b1))
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_b2;
++ perm[nb_merge_cand++] = mvf_b2;
++ }
++ }
++
++ // temporal motion vector candidate
++ if (s->sh.slice_temporal_mvp_enabled_flag)
++ {
++ static const HEVCRpiMvField mvf_z = {{0}};
++
++ *mvf_t = mvf_z;
++
++ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
++ 0, mvf_t->xy + 0, 0))
++ mvf_t->pred_flag = PF_L0;
++
++ if (s->sh.slice_type == HEVC_SLICE_B &&
++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
++ 0, mvf_t->xy + 1, 1))
++ mvf_t->pred_flag |= PF_L1;
++
++ if (mvf_t->pred_flag != 0)
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_t;
++ perm[nb_merge_cand++] = mvf_t;
++ }
++ }
++
++ // combined bi-predictive merge candidates (applies for B slices)
++ if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
++ {
++ unsigned int comb_idx = 0;
++ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1);
++ const RefPicList * const refPicList = s->refPicList;
++
++ for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
++ {
++ static const uint8_t l0_l1_cand_idx[12][2] = {
++ { 0, 1, },
++ { 1, 0, },
++ { 0, 2, },
++ { 2, 0, },
++ { 1, 2, },
++ { 2, 1, },
++ { 0, 3, },
++ { 3, 0, },
++ { 1, 3, },
++ { 3, 1, },
++ { 2, 3, },
++ { 3, 2, },
++ };
++
++ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
++ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
++ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
++ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
++
++ if ((mvf_c0->pred_flag & PF_L0) != 0 &&
++ (mvf_c1->pred_flag & PF_L1) != 0 &&
++ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
++ mvf_c0->xy[0] != mvf_c1->xy[1]))
++ {
++ if (merge_idx == nb_merge_cand++)
++ {
++ // Need to be a bit careful as we will construct mvf_t and we
++ // may already be using that as one of our condidates
++ // so build & copy rather than build in place
++ const HEVCRpiMvField mvf_m = {
++ .xy = {
++ mvf_c0->xy[0],
++ mvf_c1->xy[1]},
++ .ref_idx = {
++ mvf_c0->ref_idx[0],
++ mvf_c1->ref_idx[1]},
++ .pred_flag = PF_BI
++ };
++ *mvf_t = mvf_m;
++ return mvf_t;
++ }
++ }
++ }
++ }
++
++ // "append" Zero motion vector candidates
++ {
++ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
++ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
++ const unsigned int zero_idx = merge_idx - nb_merge_cand;
++
++ const HEVCRpiMvField mvf_m = {
++ .xy = {0, 0},
++ .ref_idx = {
++ zero_idx < nb_refs ? zero_idx : 0,
++ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
++ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
++ };
++
++ *mvf_t = mvf_m;
++ return mvf_t;
++ }
++}
++
++
++// 8.5.3.1.1 Derivation process of luma Mvs for merge mode
++//
++// Selects the merge candidate numbered merge_idx for the given prediction
++// block and leaves it in *mv.
++//
++// When the PPS parallel merge level is above 2 and the coding block is 8x8
++// (log2_cb_size == 3) the candidate is derived once for the whole 8x8 CU at
++// (lc->cu.x, lc->cu.y) with part_idx 0; otherwise it is derived for the
++// prediction block itself.
++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
++                                    int nPbH, int log2_cb_size, int part_idx,
++                                    int merge_idx, HEVCRpiMvField * const mv)
++{
++    const HEVCRpiMvField * cand;
++
++    if (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) {
++        cand = derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8,
++                                               ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8),
++                                               0, merge_idx, mv);
++    } else {
++        cand = derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH,
++                                               ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
++                                               part_idx, merge_idx, mv);
++    }
++
++    // The helper may return a candidate it built directly in *mv; only copy
++    // when it handed back some other field.
++    if (cand != mv)
++        *mv = *cand;
++
++    // Blocks whose width + height is 12 are restricted to L0 prediction
++    // (presumably the 8x4 / 4x8 partitions - confirm against the spec).
++    if ((nPbW + nPbH) == 12 && mv->pred_flag == PF_BI)
++        mv->pred_flag = PF_L0;
++}
++
++
++// Look in neighbour field mvf for a motion vector whose reference-list entry
++// equals poc0. List pfi0 is examined before list pfi1.
++// Returns a pointer to the matching MV inside *mvf, or NULL if mvf is NULL
++// or neither list matches.
++static av_always_inline const MvXY *
++mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf)
++{
++    if (mvf == NULL)
++        return NULL;
++
++    if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0)
++        return &mvf->xy[pfi0];
++
++    if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0)
++        return &mvf->xy[pfi1];
++
++    return NULL;
++}
++
++// Look in neighbour field mvf for a motion vector whose reference has the
++// same long-term status (islt0) as the target. List pfi0 is examined before
++// list pfi1 and the first list that qualifies decides the result:
++//  - long-term target, or the list entry equals poc0: the stored MV is
++//    returned directly;
++//  - otherwise the MV is scaled with mv_scale_xy() into *mv_t and mv_t is
++//    returned.
++// Returns NULL if mvf is NULL or neither list qualifies.
++static av_always_inline const MvXY *
++mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1,
++              const int islt0, const int poc0, const int poc_cur,
++              MvXY * const mv_t, const HEVCRpiMvField * const mvf)
++{
++    const unsigned int search_order[2] = {pfi0, pfi1};
++    unsigned int n;
++
++    if (mvf == NULL)
++        return NULL;
++
++    for (n = 0; n != 2; n++)
++    {
++        const unsigned int pfi = search_order[n];
++
++        if (((mvf->pred_flag >> pfi) & 1) == 0 ||
++            rpl[pfi].isLongTerm[mvf->ref_idx[pfi]] != islt0)
++            continue;
++
++        {
++            const int poc1 = rpl[pfi].list[mvf->ref_idx[pfi]];
++
++            if (islt0 || poc1 == poc0)
++                return &mvf->xy[pfi];
++
++            *mv_t = mv_scale_xy(mvf->xy[pfi], poc_cur - poc1, poc_cur - poc0);
++            return mv_t;
++        }
++    }
++
++    return NULL;
++}
++
++/*
++ * Derive the luma motion vector predictor for reference list LX of the
++ * prediction block at (x0, y0), size nPbW x nPbH, and store it in mv->xy[LX].
++ *
++ * avail       - AVAIL_* bitmask of neighbour positions that may be read
++ * mv          - in:  ref_idx[LX] selects the target reference
++ *               out: xy[LX] receives the predictor; it is zeroed first and
++ *                    stays zero if no candidate ends up being selected
++ * mvp_lx_flag - 0 selects the first candidate in the list, non-zero the second
++ * LX          - list being predicted; when searching a neighbour this list is
++ *               tried first and the opposite list second
++ */
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int nPbW, const unsigned int nPbH,
++ const unsigned int avail,
++ HEVCRpiMvField * const mv,
++ const unsigned int mvp_lx_flag, const unsigned int LX)
++{
++ // pfi0 = requested list, pfi1 = the other one
++ const unsigned int pfi0 = LX;
++ const unsigned int pfi1 = LX == 0 ? 1 : 0;
++ const RefPicList * const rpl = s->refPicList;
++ // Target reference: its list entry (compared as a POC below) and long-term flag
++ const int poc0 = rpl[LX].list[mv->ref_idx[LX]];
++ const int poc_cur = s->poc;
++ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]];
++
++ // Neighbour fields: A1 is left of the bottom-left sample, A0 is one
++ // mvf_left_stride() step on from A1; B2 is up-left, B1 is above the
++ // top-right sample and B0 is the entry after B1.
++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL
++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
++ const MvXY * mva = NULL;
++ const MvXY * mvb;
++ MvXY * const mv_rv = mv->xy + LX;
++ // Scratch storage for scaled candidates returned by mvf_other_poc()
++ MvXY mvt_a, mvt_b;
++
++ // Default result if nothing below supplies a candidate
++ *mv_rv = 0;
++
++ // A0: discard if unavailable or intra, otherwise look for an MV that already
++ // references poc0. With mvp_lx_flag == 0 such a hit is the final answer.
++ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA)
++ mvf_a0 = NULL;
++ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0)
++ goto use_mva;
++
++ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA)
++ mvf_a1 = NULL;
++
++ // If A0 gave no same-reference MV: try A1 same-reference, then the
++ // mvf_other_poc() variants (possibly scaled into mvt_a) of A0, then A1.
++ if (mva == NULL &&
++ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL &&
++ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL)
++ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1);
++
++ if (mvp_lx_flag == 0 && mva != NULL)
++ goto use_mva;
++
++ // B neighbours: discard any that are unavailable or intra
++ if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA)
++ mvf_b0 = NULL;
++ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA)
++ mvf_b1 = NULL;
++ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA)
++ mvf_b2 = NULL;
++
++ // First same-reference MV among B0, B1, B2 (in that order)
++ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL &&
++ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL)
++ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2);
++
++ // Neither A neighbour is usable: the same-reference B result takes the A
++ // slot and B is re-derived with mvf_other_poc() (possibly scaled into mvt_b)
++ if (mvf_a0 == NULL && mvf_a1 == NULL) {
++ mva = mvb;
++ if (mvp_lx_flag == 0 && mva != NULL)
++ goto use_mva;
++
++ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL &&
++ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL)
++ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2);
++ }
++
++ // No A candidate: B moves up into the first slot
++ if (mva == NULL) {
++ mva = mvb;
++ mvb = NULL;
++ }
++
++ // mva is non-NULL whenever mvb is non-NULL here (see the promotion above)
++ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B
++ mvb = NULL;
++
++ if (mvp_lx_flag == 0 && mva != NULL) {
++ goto use_mva;
++ }
++ else if (mvp_lx_flag != 0 && mvb != NULL) {
++ *mv_rv = *mvb;
++ }
++ // The temporal candidate fills the selected slot only when that slot is the
++ // first empty one: slot 0 with no A, or slot 1 with A but no distinct B.
++ // The return value is not checked here; *mv_rv was zeroed above.
++ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) {
++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW,
++ nPbH, mv->ref_idx[LX],
++ mv_rv, LX);
++ }
++ return;
++
++use_mva:
++ *mv_rv = *mva;
++ return;
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_parse.c
+@@ -0,0 +1,143 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "bytestream.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_parse.h"
++
++/*
++ * Split buf into NAL units and pass every VPS, SPS, PPS and SEI NAL to the
++ * matching parameter-set / SEI decoder. Any other NAL type is logged at
++ * verbose level and skipped. Processing stops at the first decoder error.
++ *
++ * Returns the last status (negative on error) only when err_recognition has
++ * AV_EF_EXPLODE set; otherwise errors are swallowed and 0 is returned.
++ */
++static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps,
++                                 HEVCSEIContext *sei, int is_nalff, int nal_length_size,
++                                 int err_recognition, int apply_defdispwin, void *logctx)
++{
++    H2645Packet pkt = { 0 };
++    int status;
++    int n;
++
++    status = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff,
++                                   nal_length_size, AV_CODEC_ID_HEVC, 1, 0);
++
++    /* The loop is not entered if the split failed, and is left as soon as
++     * one of the decoders reports an error */
++    for (n = 0; status >= 0 && n < pkt.nb_nals; n++) {
++        H2645NAL *nal = &pkt.nals[n];
++
++        /* ignore everything except parameter sets and VCL NALUs */
++        switch (nal->type) {
++        case HEVC_NAL_VPS:
++            status = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps);
++            break;
++        case HEVC_NAL_SPS:
++            status = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin);
++            break;
++        case HEVC_NAL_PPS:
++            status = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps);
++            break;
++        case HEVC_NAL_SEI_PREFIX:
++        case HEVC_NAL_SEI_SUFFIX:
++            status = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type);
++            break;
++        default:
++            av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type);
++            break;
++        }
++    }
++
++    ff_h2645_packet_uninit(&pkt);
++
++    return (err_recognition & AV_EF_EXPLODE) ? status : 0;
++}
++
++/*
++ * Parse codec extradata and load the parameter sets / SEI it contains.
++ *
++ * If the first bytes do not look like a start code the data is treated as
++ * hvcC: *is_nalff is set to 1, every NAL of every array is decoded using a
++ * temporary 2-byte length prefix, and *nal_length_size finally receives the
++ * length size announced in the hvcC header. Otherwise *is_nalff is set to 0
++ * and the whole buffer is handed to the NAL decoder with the caller's
++ * *nal_length_size.
++ *
++ * Returns a negative error code on failure, otherwise the status of the
++ * last NAL decode call (0 if none was made).
++ */
++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
++                                 HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
++                                 int err_recognition, int apply_defdispwin, void *logctx)
++{
++    int ret = 0;
++    int array_idx, nal_idx, array_count, final_len_size;
++    GetByteContext gb;
++
++    bytestream2_init(&gb, data, size);
++
++    if (!(size > 3 && (data[0] || data[1] || data[2] > 1))) {
++        /* Not hvcC: decode the buffer directly */
++        *is_nalff = 0;
++        ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
++                                    err_recognition, apply_defdispwin, logctx);
++        return ret;
++    }
++
++    /* It seems the extradata is encoded as hvcC format.
++     * Temporarily, we support configurationVersion==0 until 14496-15 3rd
++     * is finalized. When finalized, configurationVersion will be 1 and we
++     * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */
++    *is_nalff = 1;
++
++    bytestream2_skip(&gb, 21);
++    final_len_size = (bytestream2_get_byte(&gb) & 3) + 1;
++    array_count    = bytestream2_get_byte(&gb);
++
++    /* nal units in the hvcC always have length coded with 2 bytes,
++     * so put a fake nal_length_size = 2 while parsing them */
++    *nal_length_size = 2;
++
++    /* Decode nal units from hvcC. */
++    for (array_idx = 0; array_idx < array_count; array_idx++) {
++        const int type      = bytestream2_get_byte(&gb) & 0x3f;
++        const int nal_count = bytestream2_get_be16(&gb);
++
++        for (nal_idx = 0; nal_idx < nal_count; nal_idx++) {
++            // +2 for the nal size field
++            const int nalsize = bytestream2_peek_be16(&gb) + 2;
++
++            if (bytestream2_get_bytes_left(&gb) < nalsize) {
++                av_log(logctx, AV_LOG_ERROR,
++                       "Invalid NAL unit size in extradata.\n");
++                return AVERROR_INVALIDDATA;
++            }
++
++            ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
++                                        *nal_length_size, err_recognition, apply_defdispwin,
++                                        logctx);
++            if (ret < 0) {
++                av_log(logctx, AV_LOG_ERROR,
++                       "Decoding nal unit %d %d from hvcC failed\n",
++                       type, array_idx);
++                return ret;
++            }
++            bytestream2_skip(&gb, nalsize);
++        }
++    }
++
++    /* Now store right nal length size, that will be used to parse
++     * all other nals */
++    *nal_length_size = final_len_size;
++
++    return ret;
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevc_parse.h
+@@ -0,0 +1,36 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * H.265 parser code
++ */
++
++#ifndef AVCODEC_RPI_HEVC_PARSE_H
++#define AVCODEC_RPI_HEVC_PARSE_H
++
++#include <stdint.h>
++
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++
++/**
++ * Parse HEVC extradata and load the parameter sets and SEI it contains
++ * into ps and sei.
++ *
++ * @param data             extradata buffer
++ * @param size             size of data in bytes
++ * @param is_nalff         out: set to 1 if the data was parsed as hvcC,
++ *                         0 otherwise
++ * @param nal_length_size  in/out: used as given when the data is not hvcC;
++ *                         for hvcC it is overwritten with the length size
++ *                         read from the hvcC header
++ * @param err_recognition  AV_EF_* flags; NAL decode errors are only
++ *                         reported when AV_EF_EXPLODE is set
++ * @param apply_defdispwin passed through to the SPS decoder
++ * @param logctx           context for av_log()
++ * @return negative error code on failure, non-negative otherwise
++ */
++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
++ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
++ int err_recognition, int apply_defdispwin, void *logctx);
++
++#endif /* AVCODEC_RPI_HEVC_PARSE_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_ps.c
+@@ -0,0 +1,1938 @@
++/*
++ * HEVC Parameter Set decoding
++ *
++ * Copyright (C) 2012 - 2103 Guillaume Martres
++ * Copyright (C) 2012 - 2103 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2013 Vittorio Giovara
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/imgutils.h"
++#include "golomb.h"
++#include "rpi_hevc_data.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevcdec.h"
++
++// 64-entry table, laid out here as 8 rows of 8.
++// NOTE(review): presumably the default 8x8 intra scaling list; its use is
++// outside this part of the file - confirm before relying on the layout.
++static const uint8_t default_scaling_list_intra[] = {
++ 16, 16, 16, 16, 17, 18, 21, 24,
++ 16, 16, 16, 16, 17, 19, 22, 25,
++ 16, 16, 17, 18, 20, 22, 25, 29,
++ 16, 16, 18, 21, 24, 27, 31, 36,
++ 17, 17, 20, 24, 30, 35, 41, 47,
++ 18, 19, 22, 27, 35, 44, 54, 65,
++ 21, 22, 25, 31, 41, 54, 70, 88,
++ 24, 25, 29, 36, 47, 65, 88, 115
++};
++
++// 64-entry table, laid out here as 8 rows of 8.
++// NOTE(review): presumably the default 8x8 inter scaling list; its use is
++// outside this part of the file - confirm before relying on the layout.
++static const uint8_t default_scaling_list_inter[] = {
++ 16, 16, 16, 16, 17, 18, 20, 24,
++ 16, 16, 16, 17, 18, 20, 24, 25,
++ 16, 16, 17, 18, 20, 24, 25, 28,
++ 16, 17, 18, 20, 24, 25, 28, 33,
++ 17, 18, 20, 24, 25, 28, 33, 41,
++ 18, 20, 24, 25, 28, 33, 41, 54,
++ 20, 24, 25, 28, 33, 41, 54, 71,
++ 24, 25, 28, 33, 41, 54, 71, 91
++};
++
++// Sample aspect ratios as { num, den }, indexed by the 8-bit SAR index read
++// in decode_vui(). Indices 0..16 are covered here; index 255 is handled
++// separately by decode_vui(), which reads an explicit num/den pair instead.
++static const AVRational vui_sar[] = {
++ { 0, 1 },
++ { 1, 1 },
++ { 12, 11 },
++ { 10, 11 },
++ { 16, 11 },
++ { 40, 33 },
++ { 24, 11 },
++ { 20, 11 },
++ { 32, 11 },
++ { 80, 33 },
++ { 18, 11 },
++ { 15, 11 },
++ { 64, 33 },
++ { 160, 99 },
++ { 4, 3 },
++ { 3, 2 },
++ { 2, 1 },
++};
++
++
++// pps_cb_qp_offset: -12,+12
++// slice_cb_qp_offset: -12,+12 also
++// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive."
++// cr_qp_offset_list[n]: -12,+12
++// So worst case total offset: -24,+24
++
++// Table-building macros (first pass: packed entries for qp_c_bd_*)
++//  T(n)   biases n by 48, then packs ((48+n)/6 - 10) into bits 3 and up and
++//         (48+n)%6 into the low 3 bits
++//  C(B,n) clamps n to [-B*6, 51] and encodes the result with T()
++//  M(B,n) is C() applied to -n; used for the entries below index 0
++// NOTE(review): B looks like (bit depth - 8) given the 8-row tables below -
++// confirm against the users of qp_c_bd_0 / qp_c_bd_1.
++#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6)
++#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n))
++#define M(B,n) C(B,(-n))
++
++// Number of entries emitted by QP_START: 24 copies of M(B,48) followed by
++// M(B,48) down to M(B,1) (48 entries), i.e. 72 in total
++#define QP_OFFSET_0 (8*6 + 12*2)
++#define QP_START(B) \
++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++\
++ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\
++ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\
++ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\
++ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\
++ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\
++ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\
++ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\
++ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1)
++// Trailing padding: 18 entries saturated at 51
++#define QP_END(B) \
++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51)
++
++// T1: QP_START, then 52 entries for indices 0..51 using a non-linear mapping
++// (identity up to 29, then compressed so that index 51 maps to 45), then six
++// entries 46..51, then QP_END
++#define T1(B)\
++{\
++ QP_START(B),\
++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
++ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\
++ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\
++ C(B,44), C(B,45),\
++ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\
++ QP_END(B)\
++}
++// T0: QP_START, then the identity mapping for indices 0..51, then six entries
++// saturated at 51, then QP_END
++#define T0(B)\
++{\
++ QP_START(B),\
++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
++ C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\
++ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\
++ C(B,50), C(B,51),\
++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
++ QP_END(B)\
++}
++
++// 72 (QP_START) + 52 + 6 + 18 (QP_END) entries per row
++#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2)
++
++// One row per B = 0..7; entries are the packed values produced by T()
++static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)};
++static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)};
++
++#undef T
++#undef C
++#undef QP_END
++
++// Second pass: C() now only clamps to 0..51 (no packing, B ignored) and
++// QP_END shrinks to 6 entries. QP_START / M are reused unchanged, so every
++// leading padding entry evaluates to 0.
++#define C(B,n) ((n)<0?0:(n)>51?51:(n))
++// We do need a lot of -ve padding to cope with high bit depths that give -ve qps
++#define QP_DBLK_OFFSET_0 QP_OFFSET_0
++#define QP_END(B)\
++ 51, 51, 51, 51, 51, 51
++
++// These don't need all the padding we have here (12 top/bottom would be enough)
++static const uint8_t qp_c_dblk_0[] = T0(0);
++static const uint8_t qp_c_dblk_1[] = T1(0);
++
++// T was already undefined above; this second #undef is harmless
++#undef T
++#undef M
++#undef C
++#undef QP_END
++#undef QP_START
++
++
++/* Release PPS slot id; if that PPS is the active one (s->pps) clear the
++ * active pointer first so it does not dangle. */
++static void remove_pps(HEVCRpiParamSets * const s, const int id)
++{
++    const int is_active = s->pps_list[id] &&
++                          s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data;
++
++    if (is_active)
++        s->pps = NULL;
++
++    av_buffer_unref(&s->pps_list[id]);
++}
++
++/* Release SPS slot id together with every PPS whose sps_id refers to it.
++ * The active SPS pointer (s->sps) is cleared if it pointed at this SPS. */
++static void remove_sps(HEVCRpiParamSets * const s, const int id)
++{
++    if (s->sps_list[id] != NULL) {
++        int n;
++
++        if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data)
++            s->sps = NULL;
++
++        /* drop all PPS that depend on this SPS */
++        for (n = 0; n < FF_ARRAY_ELEMS(s->pps_list); n++) {
++            if (s->pps_list[n] == NULL)
++                continue;
++            if (((HEVCRpiPPS*)s->pps_list[n]->data)->sps_id == id)
++                remove_pps(s, n);
++        }
++
++        av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data));
++    }
++
++    av_buffer_unref(&s->sps_list[id]);
++}
++
++/* Release VPS slot id together with every SPS whose vps_id refers to it
++ * (remove_sps() in turn drops the dependent PPS). The active VPS pointer
++ * (s->vps) is cleared if it pointed at this VPS. */
++static void remove_vps(HEVCRpiParamSets * const s, const int id)
++{
++    if (s->vps_list[id] != NULL) {
++        int n;
++
++        if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data)
++            s->vps = NULL;
++
++        for (n = 0; n < FF_ARRAY_ELEMS(s->sps_list); n++) {
++            if (s->sps_list[n] == NULL)
++                continue;
++            if (((HEVCRpiSPS*)s->sps_list[n]->data)->vps_id == id)
++                remove_sps(s, n);
++        }
++    }
++
++    av_buffer_unref(&s->vps_list[id]);
++}
++
++/*
++ * Parse one short-term reference picture set from gb into *rps.
++ *
++ * The set is either coded explicitly (counts of negative / positive pictures
++ * followed by their POC deltas) or predicted from an earlier set in
++ * sps->st_rps. The prediction flag is only read when rps is not the first
++ * entry of sps->st_rps and the SPS holds at least one set.
++ *
++ * avctx            - used for logging only
++ * rps              - destination
++ * sps              - supplies st_rps[] / nb_st_rps for prediction
++ * is_slice_header  - non-zero: the reference set is chosen by a coded
++ *                    delta_idx; zero: the set immediately before rps in
++ *                    sps->st_rps is used
++ *
++ * Returns 0 on success or AVERROR_INVALIDDATA if a coded value is out of range.
++ */
++int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx,
++ ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header)
++{
++ uint8_t rps_predict = 0;
++ int delta_poc;
++ // k0 / k1 count the negative / non-negative deltas kept, k the total
++ int k0 = 0;
++ int k1 = 0;
++ int k = 0;
++ int i;
++
++ if (rps != sps->st_rps && sps->nb_st_rps)
++ rps_predict = get_bits1(gb);
++
++ if (rps_predict) {
++ const ShortTermRPS *rps_ridx;
++ int delta_rps;
++ unsigned abs_delta_rps;
++ uint8_t use_delta_flag = 0;
++ uint8_t delta_rps_sign;
++
++ if (is_slice_header) {
++ // delta_idx counts back from the end of the SPS list
++ unsigned int delta_idx = get_ue_golomb_long(gb) + 1;
++ if (delta_idx > sps->nb_st_rps) {
++ av_log(avctx, AV_LOG_ERROR,
++ "Invalid value of delta_idx in slice header RPS: %d > %d.\n",
++ delta_idx, sps->nb_st_rps);
++ return AVERROR_INVALIDDATA;
++ }
++ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx];
++ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs;
++ } else
++ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1];
++
++ delta_rps_sign = get_bits1(gb);
++ abs_delta_rps = get_ue_golomb_long(gb) + 1;
++ // "< 1" catches the unsigned wrap to 0 when the coded value is UINT_MAX
++ if (abs_delta_rps < 1 || abs_delta_rps > 32768) {
++ av_log(avctx, AV_LOG_ERROR,
++ "Invalid value of abs_delta_rps: %d\n",
++ abs_delta_rps);
++ return AVERROR_INVALIDDATA;
++ }
++ delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
++ // One iteration per delta of the reference set plus one extra
++ // (i == num_delta_pocs) that contributes delta_rps on its own.
++ // NOTE: use_delta_flag is not reset per iteration; it keeps its last
++ // read value whenever used == 1.
++ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
++ int used = rps->used[k] = get_bits1(gb);
++
++ if (!used)
++ use_delta_flag = get_bits1(gb);
++
++ if (used || use_delta_flag) {
++ if (i < rps_ridx->num_delta_pocs)
++ delta_poc = delta_rps + rps_ridx->delta_poc[i];
++ else
++ delta_poc = delta_rps;
++ rps->delta_poc[k] = delta_poc;
++ if (delta_poc < 0)
++ k0++;
++ else
++ k1++;
++ k++;
++ }
++ }
++
++ // The size check happens only after the loop has written its entries
++ if (k >= FF_ARRAY_ELEMS(rps->used)) {
++ av_log(avctx, AV_LOG_ERROR,
++ "Invalid num_delta_pocs: %d\n", k);
++ return AVERROR_INVALIDDATA;
++ }
++
++ rps->num_delta_pocs = k;
++ rps->num_negative_pics = k0;
++ // sort in increasing order (smallest first)
++ if (rps->num_delta_pocs != 0) {
++ int used, tmp;
++ // Insertion sort; delta_poc[] and used[] are moved together
++ for (i = 1; i < rps->num_delta_pocs; i++) {
++ delta_poc = rps->delta_poc[i];
++ used = rps->used[i];
++ for (k = i - 1; k >= 0; k--) {
++ tmp = rps->delta_poc[k];
++ if (delta_poc < tmp) {
++ rps->delta_poc[k + 1] = tmp;
++ rps->used[k + 1] = rps->used[k];
++ rps->delta_poc[k] = delta_poc;
++ rps->used[k] = used;
++ }
++ }
++ }
++ }
++ // Only needed when there are at least two negative entries
++ if ((rps->num_negative_pics >> 1) != 0) {
++ int used;
++ k = rps->num_negative_pics - 1;
++ // flip the negative values to largest first
++ for (i = 0; i < rps->num_negative_pics >> 1; i++) {
++ delta_poc = rps->delta_poc[i];
++ used = rps->used[i];
++ rps->delta_poc[i] = rps->delta_poc[k];
++ rps->used[i] = rps->used[k];
++ rps->delta_poc[k] = delta_poc;
++ rps->used[k] = used;
++ k--;
++ }
++ }
++ } else {
++ // Explicitly coded set
++ unsigned int prev, nb_positive_pics;
++ rps->num_negative_pics = get_ue_golomb_long(gb);
++ nb_positive_pics = get_ue_golomb_long(gb);
++
++ if (rps->num_negative_pics >= HEVC_MAX_REFS ||
++ nb_positive_pics >= HEVC_MAX_REFS) {
++ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics;
++ if (rps->num_delta_pocs) {
++ // Negative pictures: each coded delta is subtracted from the
++ // running value, so stored deltas decrease from entry to entry
++ prev = 0;
++ for (i = 0; i < rps->num_negative_pics; i++) {
++ delta_poc = get_ue_golomb_long(gb) + 1;
++ if (delta_poc < 1 || delta_poc > 32768) {
++ av_log(avctx, AV_LOG_ERROR,
++ "Invalid value of delta_poc: %d\n",
++ delta_poc);
++ return AVERROR_INVALIDDATA;
++ }
++ prev -= delta_poc;
++ rps->delta_poc[i] = prev;
++ rps->used[i] = get_bits1(gb);
++ }
++ // Positive pictures: accumulated upwards and stored after the
++ // negative entries
++ prev = 0;
++ for (i = 0; i < nb_positive_pics; i++) {
++ delta_poc = get_ue_golomb_long(gb) + 1;
++ if (delta_poc < 1 || delta_poc > 32768) {
++ av_log(avctx, AV_LOG_ERROR,
++ "Invalid value of delta_poc: %d\n",
++ delta_poc);
++ return AVERROR_INVALIDDATA;
++ }
++ prev += delta_poc;
++ rps->delta_poc[rps->num_negative_pics + i] = prev;
++ rps->used[rps->num_negative_pics + i] = get_bits1(gb);
++ }
++ }
++ }
++ return 0;
++}
++
++
++/*
++ * Read one profile/tier/level block (everything except level_idc) into *ptl.
++ *
++ * Logs the detected profile. If profile_idc was coded as 0 it is replaced by
++ * the index of a set compatibility flag (index > 0), when there is one.
++ *
++ * Returns 0 on success, -1 if fewer than 88 bits remain in gb.
++ */
++static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx,
++                                     PTLCommon * const ptl)
++{
++    int flag_idx;
++
++    if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12)
++        return -1;
++
++    ptl->profile_space = get_bits(gb, 2);
++    ptl->tier_flag     = get_bits1(gb);
++    ptl->profile_idc   = get_bits(gb, 5);
++
++    switch (ptl->profile_idc) {
++    case FF_PROFILE_HEVC_MAIN:
++        av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n");
++        break;
++    case FF_PROFILE_HEVC_MAIN_10:
++        av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
++        break;
++    case FF_PROFILE_HEVC_MAIN_STILL_PICTURE:
++        av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
++        break;
++    case FF_PROFILE_HEVC_REXT:
++        av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
++        break;
++    default:
++        av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
++        break;
++    }
++
++    for (flag_idx = 0; flag_idx < 32; flag_idx++) {
++        ptl->profile_compatibility_flag[flag_idx] = get_bits1(gb);
++
++        // Fall back to a compatibility flag when no profile was coded
++        if (ptl->profile_idc == 0 && flag_idx > 0 && ptl->profile_compatibility_flag[flag_idx])
++            ptl->profile_idc = flag_idx;
++    }
++
++    ptl->progressive_source_flag    = get_bits1(gb);
++    ptl->interlaced_source_flag     = get_bits1(gb);
++    ptl->non_packed_constraint_flag = get_bits1(gb);
++    ptl->frame_only_constraint_flag = get_bits1(gb);
++
++    skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15]
++    skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31]
++    skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43]
++
++    return 0;
++}
++
++/*
++ * Parse the complete profile_tier_level structure: the general PTL and its
++ * level_idc, then the per-sub-layer presence flags, and finally the PTL
++ * and/or level_idc of each sub-layer that is flagged as present.
++ *
++ * Returns 0 on success, -1 if the bitstream is too short.
++ */
++static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx,
++                     PTL * const ptl, const int max_num_sub_layers)
++{
++    const int nb_sub_layers = max_num_sub_layers - 1;
++    int n;
++
++    if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
++        get_bits_left(gb) < 8 + (8*2 * (nb_sub_layers > 0))) {
++        av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
++        return -1;
++    }
++
++    ptl->general_ptl.level_idc = get_bits(gb, 8);
++
++    for (n = 0; n < nb_sub_layers; n++) {
++        ptl->sub_layer_profile_present_flag[n] = get_bits1(gb);
++        ptl->sub_layer_level_present_flag[n]   = get_bits1(gb);
++    }
++
++    // With at least one sub-layer the flag pairs are padded out to 8 entries
++    if (nb_sub_layers > 0) {
++        for (n = nb_sub_layers; n < 8; n++)
++            skip_bits(gb, 2); // reserved_zero_2bits[i]
++    }
++
++    for (n = 0; n < nb_sub_layers; n++) {
++        if (ptl->sub_layer_profile_present_flag[n] &&
++            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[n]) < 0) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "PTL information for sublayer %i too short\n", n);
++            return -1;
++        }
++
++        if (!ptl->sub_layer_level_present_flag[n])
++            continue;
++
++        if (get_bits_left(gb) < 8) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "Not enough data for sublayer %i level_idc\n", n);
++            return -1;
++        }
++        ptl->sub_layer_ptl[n].level_idc = get_bits(gb, 8);
++    }
++
++    return 0;
++}
++
++/*
++ * Read past the HRD parameters of one sub-layer: nb_cpb entries, each made
++ * of two ue(v) values, two more when subpic_params_present is set, and a
++ * one-bit flag. All values are discarded.
++ */
++static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb,
++                                const int subpic_params_present)
++{
++    unsigned int cpb;
++
++    for (cpb = 0; cpb < nb_cpb; cpb++) {
++        get_ue_golomb_long(gb); // bit_rate_value_minus1
++        get_ue_golomb_long(gb); // cpb_size_value_minus1
++
++        if (subpic_params_present) {
++            get_ue_golomb_long(gb); // cpb_size_du_value_minus1
++            get_ue_golomb_long(gb); // bit_rate_du_value_minus1
++        }
++
++        skip_bits1(gb); // cbr_flag
++    }
++}
++
++/*
++ * Read past one hrd_parameters() structure. Nothing is stored; the syntax is
++ * only walked so that the bit reader ends up behind it.
++ *
++ * common_inf_present - non-zero if the common (non per-sub-layer) part is
++ *                      coded
++ * max_sublayers      - number of sub-layer entries to walk
++ *
++ * Returns 0 on success, AVERROR_INVALIDDATA if a CPB count is outside 1..32.
++ */
++static int decode_hrd(GetBitContext * const gb, const int common_inf_present,
++                      const int max_sublayers)
++{
++    int nal_params_present = 0, vcl_params_present = 0;
++    int subpic_params_present = 0;
++    int i;
++
++    if (common_inf_present) {
++        nal_params_present = get_bits1(gb);
++        vcl_params_present = get_bits1(gb);
++
++        if (nal_params_present || vcl_params_present) {
++            subpic_params_present = get_bits1(gb);
++
++            if (subpic_params_present) {
++                skip_bits(gb, 8); // tick_divisor_minus2
++                skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1
++                skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag
++                skip_bits(gb, 5); // dpb_output_delay_du_length_minus1
++            }
++
++            skip_bits(gb, 4); // bit_rate_scale
++            skip_bits(gb, 4); // cpb_size_scale
++
++            if (subpic_params_present)
++                skip_bits(gb, 4); // cpb_size_du_scale
++
++            skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1
++            skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1
++            skip_bits(gb, 5); // dpb_output_delay_length_minus1
++        }
++    }
++
++    for (i = 0; i < max_sublayers; i++) {
++        int low_delay = 0;
++        unsigned int nb_cpb = 1;
++        int fixed_rate = get_bits1(gb);
++
++        // A second fixed-rate flag is coded only when the first one is 0
++        if (!fixed_rate)
++            fixed_rate = get_bits1(gb);
++
++        if (fixed_rate)
++            get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1
++        else
++            low_delay = get_bits1(gb);
++
++        if (!low_delay) {
++            nb_cpb = get_ue_golomb_long(gb) + 1;
++            // "< 1" catches the unsigned wrap to 0
++            if (nb_cpb < 1 || nb_cpb > 32) {
++                // nb_cpb is unsigned, so print it with %u
++                av_log(NULL, AV_LOG_ERROR, "nb_cpb %u invalid\n", nb_cpb);
++                return AVERROR_INVALIDDATA;
++            }
++        }
++
++        if (nal_params_present)
++            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
++        if (vcl_params_present)
++            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
++    }
++    return 0;
++}
++
++/*
++ * Parse a VPS NAL unit from gb and store it in ps->vps_list[vps_id].
++ *
++ * A new HEVCRpiVPS is built in a freshly allocated, zeroed buffer. If the
++ * slot already holds a byte-identical VPS the new buffer is dropped and the
++ * old one kept; otherwise the old VPS (and, via remove_vps(), everything that
++ * depends on it) is released and replaced.
++ *
++ * Returns 0 on success, AVERROR(ENOMEM) if allocation fails and
++ * AVERROR_INVALIDDATA on a malformed VPS.
++ */
++int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx,
++ HEVCRpiParamSets * const ps)
++{
++ int i,j;
++ int vps_id = 0;
++ ptrdiff_t nal_size;
++ HEVCRpiVPS *vps;
++ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps));
++
++ if (!vps_buf)
++ return AVERROR(ENOMEM);
++ vps = (HEVCRpiVPS*)vps_buf->data;
++
++ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n");
++
++ // Keep a copy of the raw NAL payload, truncated to fit vps->data
++ nal_size = gb->buffer_end - gb->buffer;
++ if (nal_size > sizeof(vps->data)) {
++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS "
++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
++ nal_size, sizeof(vps->data));
++ vps->data_size = sizeof(vps->data);
++ } else {
++ vps->data_size = nal_size;
++ }
++ memcpy(vps->data, gb->buffer, vps->data_size);
++
++ vps_id = get_bits(gb, 4);
++ if (vps_id >= HEVC_MAX_VPS_COUNT) {
++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id);
++ goto err;
++ }
++
++ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits
++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n");
++ goto err;
++ }
++
++ // Both counts are coded as "minus1"
++ vps->vps_max_layers = get_bits(gb, 6) + 1;
++ vps->vps_max_sub_layers = get_bits(gb, 3) + 1;
++ vps->vps_temporal_id_nesting_flag = get_bits1(gb);
++
++ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits
++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n");
++ goto err;
++ }
++
++ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) {
++ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n",
++ vps->vps_max_sub_layers);
++ goto err;
++ }
++
++ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
++ goto err;
++
++ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
++
++ // Without per-sub-layer ordering info only the last sub-layer entry is coded
++ i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_sub_layers - 1;
++ for (; i < vps->vps_max_sub_layers; i++) {
++ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1;
++ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb);
++ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1;
++
++ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
++ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
++ vps->vps_max_dec_pic_buffering[i] - 1);
++ goto err;
++ }
++ // Only fatal when the caller asked for strict error handling
++ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) {
++ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n",
++ vps->vps_num_reorder_pics[i]);
++ if (avctx->err_recognition & AV_EF_EXPLODE)
++ goto err;
++ }
++ }
++
++ vps->vps_max_layer_id = get_bits(gb, 6);
++ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
++ // Also reject a flag matrix that would not fit in the remaining bits;
++ // the product is computed in long long
++ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
++ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) {
++ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
++ goto err;
++ }
++
++ for (i = 1; i < vps->vps_num_layer_sets; i++)
++ for (j = 0; j <= vps->vps_max_layer_id; j++)
++ skip_bits(gb, 1); // layer_id_included_flag[i][j]
++
++ vps->vps_timing_info_present_flag = get_bits1(gb);
++ if (vps->vps_timing_info_present_flag) {
++ vps->vps_num_units_in_tick = get_bits_long(gb, 32);
++ vps->vps_time_scale = get_bits_long(gb, 32);
++ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb);
++ if (vps->vps_poc_proportional_to_timing_flag)
++ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
++ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
++ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
++ av_log(avctx, AV_LOG_ERROR,
++ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
++ goto err;
++ }
++ for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
++ // The first HRD always carries the common info; later ones flag it
++ int common_inf_present = 1;
++
++ get_ue_golomb_long(gb); // hrd_layer_set_idx
++ if (i)
++ common_inf_present = get_bits1(gb);
++ // NOTE(review): the return value of decode_hrd() is ignored, so an
++ // invalid HRD does not reject the VPS - confirm this is intended
++ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers);
++ }
++ }
++ get_bits1(gb); /* vps_extension_flag */
++
++ // An overread is only fatal if a VPS with this id is already stored
++ if (get_bits_left(gb) < 0) {
++ av_log(avctx, AV_LOG_ERROR,
++ "Overread VPS by %d bits\n", -get_bits_left(gb));
++ if (ps->vps_list[vps_id])
++ goto err;
++ }
++
++ // Identical to the stored VPS: keep the old buffer and drop the new one
++ if (ps->vps_list[vps_id] &&
++ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
++ av_buffer_unref(&vps_buf);
++ } else {
++ remove_vps(ps, vps_id);
++ ps->vps_list[vps_id] = vps_buf;
++ }
++
++ return 0;
++
++err:
++ av_buffer_unref(&vps_buf);
++ return AVERROR_INVALIDDATA;
++}
++
++static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx, /* Parse the VUI syntax from gb into sps->vui; may also adjust sps->pix_fmt */
++ const int apply_defdispwin, HEVCRpiSPS * const sps) /* apply_defdispwin is only consulted together with AV_CODEC_FLAG2_IGNORE_CROP (see below) */
++{
++ VUI backup_vui, * const vui = &sps->vui; /* backup_vui: snapshot used by the retry path */
++ GetBitContext backup; /* snapshot of the bit reader used by the retry path */
++ int sar_present, alt = 0; /* alt != 0 once the alternate-syntax retry has been taken */
++
++ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
++
++ sar_present = get_bits1(gb);
++ if (sar_present) {
++ uint8_t sar_idx = get_bits(gb, 8);
++ if (sar_idx < FF_ARRAY_ELEMS(vui_sar)) /* index into the predefined vui_sar table */
++ vui->sar = vui_sar[sar_idx];
++ else if (sar_idx == 255) { /* 255: explicit 16-bit numerator/denominator follow */
++ vui->sar.num = get_bits(gb, 16);
++ vui->sar.den = get_bits(gb, 16);
++ } else /* any other index: only warn, vui->sar is left untouched */
++ av_log(avctx, AV_LOG_WARNING,
++ "Unknown SAR index: %u.\n", sar_idx);
++ }
++
++ vui->overscan_info_present_flag = get_bits1(gb);
++ if (vui->overscan_info_present_flag)
++ vui->overscan_appropriate_flag = get_bits1(gb);
++
++ vui->video_signal_type_present_flag = get_bits1(gb);
++ if (vui->video_signal_type_present_flag) {
++ vui->video_format = get_bits(gb, 3);
++ vui->video_full_range_flag = get_bits1(gb);
++ vui->colour_description_present_flag = get_bits1(gb);
++ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P) /* full range 4:2:0 is reported as the J variant */
++ sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
++ if (vui->colour_description_present_flag) {
++ vui->colour_primaries = get_bits(gb, 8);
++ vui->transfer_characteristic = get_bits(gb, 8);
++ vui->matrix_coeffs = get_bits(gb, 8);
++
++ // Set invalid values to "unspecified"
++ if (!av_color_primaries_name(vui->colour_primaries))
++ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED;
++ if (!av_color_transfer_name(vui->transfer_characteristic))
++ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED;
++ if (!av_color_space_name(vui->matrix_coeffs))
++ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED;
++ if (vui->matrix_coeffs == AVCOL_SPC_RGB) { /* RGB matrix: swap the 4:4:4 YUV formats for their GBR equivalents */
++ switch (sps->pix_fmt) {
++ case AV_PIX_FMT_YUV444P:
++ sps->pix_fmt = AV_PIX_FMT_GBRP;
++ break;
++ case AV_PIX_FMT_YUV444P10:
++ sps->pix_fmt = AV_PIX_FMT_GBRP10;
++ break;
++ case AV_PIX_FMT_YUV444P12:
++ sps->pix_fmt = AV_PIX_FMT_GBRP12;
++ break;
++ }
++ }
++ }
++ }
++
++ vui->chroma_loc_info_present_flag = get_bits1(gb);
++ if (vui->chroma_loc_info_present_flag) {
++ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb);
++ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb);
++ }
++
++ vui->neutra_chroma_indication_flag = get_bits1(gb);
++ vui->field_seq_flag = get_bits1(gb);
++ vui->frame_field_info_present_flag = get_bits1(gb);
++
++ // Backup context in case an alternate header is detected
++ memcpy(&backup, gb, sizeof(backup)); /* reader position just before default_display_window_flag */
++ memcpy(&backup_vui, vui, sizeof(backup_vui)); /* VUI fields parsed so far */
++ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { /* peek only: this bit pattern is treated as "no display window" and no bit is consumed */
++ vui->default_display_window_flag = 0;
++ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
++ } else
++ vui->default_display_window_flag = get_bits1(gb);
++
++ if (vui->default_display_window_flag) {
++ int vert_mult = 1 + (sps->chroma_format_idc < 2); /* 2 when chroma_format_idc is 0 or 1, else 1 */
++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); /* 2 unless chroma_format_idc is 3 */
++ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult;
++ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult;
++ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult;
++ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult;
++
++ if (apply_defdispwin &&
++ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { /* user asked to ignore cropping: log the parsed window, then zero it */
++ av_log(avctx, AV_LOG_DEBUG,
++ "discarding vui default display window, "
++ "original values are l:%u r:%u t:%u b:%u\n",
++ vui->def_disp_win.left_offset,
++ vui->def_disp_win.right_offset,
++ vui->def_disp_win.top_offset,
++ vui->def_disp_win.bottom_offset);
++
++ vui->def_disp_win.left_offset =
++ vui->def_disp_win.right_offset =
++ vui->def_disp_win.top_offset =
++ vui->def_disp_win.bottom_offset = 0;
++ }
++ }
++
++timing_info: /* retry entry point: after a rewind, parsing resumes here and skips the display-window syntax */
++ vui->vui_timing_info_present_flag = get_bits1(gb);
++
++ if (vui->vui_timing_info_present_flag) {
++ if( get_bits_left(gb) < 66 && !alt) { /* too few bits for the timing fields; retry once from the backup */
++ // The alternate syntax seem to have timing info located
++ // at where def_disp_win is normally located
++ av_log(avctx, AV_LOG_WARNING,
++ "Strange VUI timing information, retrying...\n");
++ memcpy(vui, &backup_vui, sizeof(backup_vui));
++ memcpy(gb, &backup, sizeof(backup));
++ alt = 1; /* ensures the retry happens at most once */
++ goto timing_info;
++ }
++ vui->vui_num_units_in_tick = get_bits_long(gb, 32);
++ vui->vui_time_scale = get_bits_long(gb, 32);
++ if (alt) {
++ av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n",
++ vui->vui_time_scale, vui->vui_num_units_in_tick);
++ }
++ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
++ if (vui->vui_poc_proportional_to_timing_flag)
++ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
++ vui->vui_hrd_parameters_present_flag = get_bits1(gb);
++ if (vui->vui_hrd_parameters_present_flag)
++ decode_hrd(gb, 1, sps->max_sub_layers); /* HRD syntax is parsed to advance the reader; nothing is stored here */
++ }
++
++ vui->bitstream_restriction_flag = get_bits1(gb);
++ if (vui->bitstream_restriction_flag) {
++ if (get_bits_left(gb) < 8 && !alt) { /* same single-shot rewind as above */
++ av_log(avctx, AV_LOG_WARNING,
++ "Strange VUI bitstream restriction information, retrying"
++ " from timing information...\n");
++ memcpy(vui, &backup_vui, sizeof(backup_vui));
++ memcpy(gb, &backup, sizeof(backup));
++ alt = 1;
++ goto timing_info;
++ }
++ vui->tiles_fixed_structure_flag = get_bits1(gb);
++ vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb);
++ vui->restricted_ref_pic_lists_flag = get_bits1(gb);
++ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb);
++ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb);
++ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb);
++ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb);
++ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb);
++ }
++
++ if (get_bits_left(gb) < 1 && !alt) { /* data exhausted at the end of the VUI: last chance to retry with the alternate layout */
++ // XXX: Alternate syntax when sps_range_extension_flag != 0?
++ av_log(avctx, AV_LOG_WARNING,
++ "Overread in VUI, retrying from timing information...\n");
++ memcpy(vui, &backup_vui, sizeof(backup_vui));
++ memcpy(gb, &backup, sizeof(backup));
++ alt = 1;
++ goto timing_info;
++ }
++}
++
++/*
++ * Fill sl with the default scaling lists: every 4x4 entry and every DC
++ * term is 16, and each of the three larger size classes gets the intra
++ * default for matrices 0..2 and the inter default for matrices 3..5.
++ */
++static void set_default_scaling_list_data(ScalingList * const sl)
++{
++    int size_id;
++    int m;
++
++    for (m = 0; m < 6; m++) {
++        // 4x4 default is 16
++        memset(sl->sl[0][m], 16, 16);
++        sl->sl_dc[0][m] = 16; // default for 16x16
++        sl->sl_dc[1][m] = 16; // default for 32x32
++    }
++
++    // size classes 1..3 all share the same 64-entry default tables
++    for (size_id = 1; size_id < 4; size_id++) {
++        for (m = 0; m < 6; m++) {
++            if (m < 3)
++                memcpy(sl->sl[size_id][m], default_scaling_list_intra, 64);
++            else
++                memcpy(sl->sl[size_id][m], default_scaling_list_inter, 64);
++        }
++    }
++}
++
++/*
++ * Parse the scaling_list_data() syntax from gb into sl.
++ *
++ * For every (size_id, matrix_id) pair the stream either refers back to an
++ * earlier matrix of the same size (pred_mode_flag == 0, non-zero delta),
++ * keeps whatever is already in sl (delta == 0; the caller is expected to
++ * have loaded the defaults), or carries explicitly coded coefficients.
++ * size_id 3 only codes matrices 0 and 3; when chroma_format_idc is 3 the
++ * remaining size-3 matrices are copied from size 2 at the end.
++ *
++ * Returns 0 on success, AVERROR_INVALIDDATA on an invalid reference delta
++ * or an out-of-range DC coefficient.
++ */
++static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl,
++                             const HEVCRpiSPS * const sps)
++{
++    uint8_t scaling_list_pred_mode_flag;
++    int size_id, matrix_id, pos;
++    int i;
++
++    for (size_id = 0; size_id < 4; size_id++)
++        for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {
++            scaling_list_pred_mode_flag = get_bits1(gb);
++            if (!scaling_list_pred_mode_flag) {
++                unsigned int delta = get_ue_golomb_long(gb);
++                /* Only need to handle non-zero delta. Zero means default,
++                 * which should already be in the arrays. */
++                if (delta) {
++                    // Copy from previous array.
++                    delta *= (size_id == 3) ? 3 : 1;
++                    if (matrix_id < delta) {
++                        // delta is unsigned, so print it with %u
++                        av_log(avctx, AV_LOG_ERROR,
++                               "Invalid delta in scaling list data: %u.\n", delta);
++                        return AVERROR_INVALIDDATA;
++                    }
++
++                    memcpy(sl->sl[size_id][matrix_id],
++                           sl->sl[size_id][matrix_id - delta],
++                           size_id > 0 ? 64 : 16);
++                    if (size_id > 1)
++                        sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta];
++                }
++            } else {
++                int next_coef, coef_num;
++                int32_t scaling_list_delta_coef;
++
++                next_coef = 8;
++                coef_num = FFMIN(64, 1 << (4 + (size_id << 1)));
++                if (size_id > 1) {
++                    const int dc_coef_minus8 = get_se_golomb(gb);
++
++                    /* H.265 restricts scaling_list_dc_coef_minus8 to
++                     * [-7, 247]; reject anything else rather than letting
++                     * it be stored out of range (sl_dc is presumably
++                     * 8 bits wide like the list entries - confirm). */
++                    if (dc_coef_minus8 < -7 || dc_coef_minus8 > 247) {
++                        av_log(avctx, AV_LOG_ERROR,
++                               "Invalid scaling_list_dc_coef_minus8: %d.\n", dc_coef_minus8);
++                        return AVERROR_INVALIDDATA;
++                    }
++                    next_coef = dc_coef_minus8 + 8;
++                    sl->sl_dc[size_id - 2][matrix_id] = next_coef;
++                }
++                for (i = 0; i < coef_num; i++) {
++                    // Coefficients arrive in diagonal scan order
++                    if (size_id == 0)
++                        pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] +
++                                  ff_hevc_rpi_diag_scan4x4_x[i];
++                    else
++                        pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] +
++                                  ff_hevc_rpi_diag_scan8x8_x[i];
++
++                    scaling_list_delta_coef = get_se_golomb(gb);
++                    // Coefficients are coded as deltas, wrapped modulo 256
++                    next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256;
++                    sl->sl[size_id][matrix_id][pos] = next_coef;
++                }
++            }
++        }
++
++    if (sps->chroma_format_idc == 3) {
++        // Size-3 matrices 1, 2, 4, 5 are not coded: take them from size 2
++        for (i = 0; i < 64; i++) {
++            sl->sl[3][1][i] = sl->sl[2][1][i];
++            sl->sl[3][2][i] = sl->sl[2][2][i];
++            sl->sl[3][4][i] = sl->sl[2][4][i];
++            sl->sl[3][5][i] = sl->sl[2][5][i];
++        }
++        sl->sl_dc[1][1] = sl->sl_dc[0][1];
++        sl->sl_dc[1][2] = sl->sl_dc[0][2];
++        sl->sl_dc[1][4] = sl->sl_dc[0][4];
++        sl->sl_dc[1][5] = sl->sl_dc[0][5];
++    }
++
++
++    return 0;
++}
++
++/*
++ * Derive the output pixel format, per-plane chroma shifts and pixel_shift
++ * from sps->bit_depth and sps->chroma_format_idc.
++ *
++ * Only chroma_format_idc 1 at 8 or 10 bits maps to a format; every other
++ * combination leaves pix_fmt as AV_PIX_FMT_NONE. Always returns 0.
++ */
++static int map_pixel_format(HEVCRpiSPS * const sps)
++{
++    const int cfmt = sps->chroma_format_idc;
++    const int chroma_h_subsampled = (cfmt <= 2); // everything but 4:4:4
++    const int chroma_v_subsampled = (cfmt <= 1); // neither 4:4:4 nor 4:2:2
++
++    if (cfmt == 1 && sps->bit_depth == 8)
++        sps->pix_fmt = AV_PIX_FMT_SAND128;
++    else if (cfmt == 1 && sps->bit_depth == 10)
++        sps->pix_fmt = AV_PIX_FMT_SAND64_10;
++    else
++        sps->pix_fmt = AV_PIX_FMT_NONE;
++
++    // Plane 0 is never subsampled; planes 1 and 2 share the same shifts
++    sps->hshift[0] = 0;
++    sps->vshift[0] = 0;
++    sps->hshift[1] = chroma_h_subsampled ? 1 : 0;
++    sps->hshift[2] = sps->hshift[1];
++    sps->vshift[1] = chroma_v_subsampled ? 1 : 0;
++    sps->vshift[2] = sps->vshift[1];
++
++    if (sps->bit_depth > 8)
++        sps->pixel_shift = 1;
++    else
++        sps->pixel_shift = 0;
++
++    return 0;
++}
++
++/*
++ * Parse a sequence parameter set from gb into sps and derive the
++ * dependent geometry fields.
++ *
++ * sps              - destination structure. Fields that the stream does not
++ *                    carry are not written here (e.g. separate_colour_plane_flag
++ *                    is read before being conditionally set), so the caller
++ *                    must supply a zero-initialised structure.
++ * gb               - bit reader positioned at the start of the SPS payload
++ * sps_id           - receives the SPS id read from the stream
++ * apply_defdispwin - when non-zero the VUI default display window is added
++ *                    to the output window
++ * vps_list         - if non-NULL, the VPS referenced by the SPS must be
++ *                    present in this list
++ * avctx            - used for logging; flags2 (IGNORE_CROP) and
++ *                    err_recognition (EXPLODE) are honoured
++ *
++ * Returns 0 on success or a negative AVERROR code on invalid data.
++ */
++static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id,
++                                 const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx)
++{
++    HEVCRpiWindow *ow;
++    int ret = 0;
++    int log2_diff_max_min_transform_block_size;
++    int bit_depth_chroma, start, vui_present, sublayer_ordering_info;
++    int i;
++
++    // Coded parameters
++
++    sps->vps_id = get_bits(gb, 4);
++    if (sps->vps_id >= HEVC_MAX_VPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (vps_list && !vps_list[sps->vps_id]) {
++        av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
++               sps->vps_id);
++        return AVERROR_INVALIDDATA;
++    }
++
++    sps->max_sub_layers = get_bits(gb, 3) + 1;
++    if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) {
++        av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
++               sps->max_sub_layers);
++        return AVERROR_INVALIDDATA;
++    }
++
++    sps->temporal_id_nesting_flag = get_bits(gb, 1);
++
++    if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
++        return ret;
++
++    *sps_id = get_ue_golomb_long(gb);
++    if (*sps_id >= HEVC_MAX_SPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id);
++        return AVERROR_INVALIDDATA;
++    }
++
++    sps->chroma_format_idc = get_ue_golomb_long(gb);
++    if (sps->chroma_format_idc > 3U) {
++        av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->chroma_format_idc == 3)
++        sps->separate_colour_plane_flag = get_bits1(gb);
++
++    // Separate colour planes are handled as if the stream were monochrome
++    if (sps->separate_colour_plane_flag)
++        sps->chroma_format_idc = 0;
++
++    sps->width  = get_ue_golomb_long(gb);
++    sps->height = get_ue_golomb_long(gb);
++    if ((ret = av_image_check_size(sps->width,
++                                   sps->height, 0, avctx)) < 0)
++        return ret;
++
++    if (get_bits1(gb)) { // pic_conformance_flag
++        // Offsets are coded in chroma units: scale to luma samples
++        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
++        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
++        sps->pic_conf_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
++        sps->pic_conf_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
++        sps->pic_conf_win.top_offset    = get_ue_golomb_long(gb) * vert_mult;
++        sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult;
++
++        if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
++            av_log(avctx, AV_LOG_DEBUG,
++                   "discarding sps conformance window, "
++                   "original values are l:%u r:%u t:%u b:%u\n",
++                   sps->pic_conf_win.left_offset,
++                   sps->pic_conf_win.right_offset,
++                   sps->pic_conf_win.top_offset,
++                   sps->pic_conf_win.bottom_offset);
++
++            sps->pic_conf_win.left_offset   =
++            sps->pic_conf_win.right_offset  =
++            sps->pic_conf_win.top_offset    =
++            sps->pic_conf_win.bottom_offset = 0;
++        }
++        sps->output_window = sps->pic_conf_win;
++    }
++
++    sps->bit_depth   = get_ue_golomb_long(gb) + 8;
++    bit_depth_chroma = get_ue_golomb_long(gb) + 8;
++    if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Luma bit depth (%d) is different from chroma bit depth (%d), "
++               "this is unsupported.\n",
++               sps->bit_depth, bit_depth_chroma);
++        return AVERROR_INVALIDDATA;
++    }
++
++    ret = map_pixel_format(sps);
++    if (ret < 0)
++        return ret;
++
++    sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
++    if (sps->log2_max_poc_lsb > 16) {
++        av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
++               sps->log2_max_poc_lsb - 4);
++        return AVERROR_INVALIDDATA;
++    }
++
++    // Without per-sublayer info only the highest sub-layer is coded
++    sublayer_ordering_info = get_bits1(gb);
++    start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1;
++    for (i = start; i < sps->max_sub_layers; i++) {
++        sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1;
++        sps->temporal_layer[i].num_reorder_pics      = get_ue_golomb_long(gb);
++        sps->temporal_layer[i].max_latency_increase  = get_ue_golomb_long(gb) - 1;
++        if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) {
++            av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
++                   sps->temporal_layer[i].max_dec_pic_buffering - 1U);
++            return AVERROR_INVALIDDATA;
++        }
++        if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
++            av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
++                   sps->temporal_layer[i].num_reorder_pics);
++            if (avctx->err_recognition & AV_EF_EXPLODE ||
++                sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) {
++                return AVERROR_INVALIDDATA;
++            }
++            // Tolerated: grow the DPB size to fit the reorder depth
++            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
++        }
++    }
++
++    if (!sublayer_ordering_info) {
++        // Replicate the single coded entry to all lower sub-layers
++        for (i = 0; i < start; i++) {
++            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering;
++            sps->temporal_layer[i].num_reorder_pics      = sps->temporal_layer[start].num_reorder_pics;
++            sps->temporal_layer[i].max_latency_increase  = sps->temporal_layer[start].max_latency_increase;
++        }
++    }
++
++    sps->log2_min_cb_size                    = get_ue_golomb_long(gb) + 3;
++    sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb);
++    sps->log2_min_tb_size                    = get_ue_golomb_long(gb) + 2;
++    log2_diff_max_min_transform_block_size   = get_ue_golomb_long(gb);
++    sps->log2_max_trafo_size                 = log2_diff_max_min_transform_block_size +
++                                               sps->log2_min_tb_size;
++
++    // The error messages below are newline-terminated so that they do not
++    // run into the next log line
++    if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size\n", sps->log2_min_cb_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->log2_diff_max_min_coding_block_size > 30) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size\n", sps->log2_diff_max_min_coding_block_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size\n", log2_diff_max_min_transform_block_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    {
++        const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size;
++        // Not a bitstream limitation, but all profiles
++        if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) {
++            av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY\n", CtbLog2SizeY);
++            return AVERROR_INVALIDDATA;
++        }
++
++        if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) {
++            av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY\n", sps->log2_max_trafo_size);
++            return AVERROR_INVALIDDATA;
++        }
++
++        // Inferred parameters
++        sps->log2_ctb_size = CtbLog2SizeY;
++//        sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
++    }
++
++    sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
++    sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
++
++    sps->scaling_list_enable_flag = get_bits1(gb);
++    if (sps->scaling_list_enable_flag) {
++        // Load defaults first; coded data (if any) then overrides them
++        set_default_scaling_list_data(&sps->scaling_list);
++
++        if (get_bits1(gb)) {
++            ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
++            if (ret < 0)
++                return ret;
++        }
++    }
++
++    sps->amp_enabled_flag = get_bits1(gb);
++    sps->sao_enabled      = get_bits1(gb);
++
++    // Set pcm defaults (0) so we don't have to test _enabled when we
++    // want to use them
++    memset(&sps->pcm, 0, sizeof(sps->pcm));
++
++    if (get_bits1(gb))  // pcm_enabled_flag
++    {
++        const unsigned int limit_max_pcm = FFMIN(5,
++            sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size);
++        sps->pcm.bit_depth            = get_bits(gb, 4) + 1;
++        sps->pcm.bit_depth_chroma     = get_bits(gb, 4) + 1;
++        sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3;
++        sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size +
++                                        get_ue_golomb_long(gb);
++        if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n",
++                   sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth);
++            return AVERROR_INVALIDDATA;
++        }
++        if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size ||
++            sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) {
++            av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)\n",
++                   sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size);
++            return AVERROR_INVALIDDATA;
++        }
++
++        sps->pcm.loop_filter_disable_flag = get_bits1(gb);
++    }
++
++    // Could be based on min_pcm_cb_size but much easier logic if we just stick
++    // with 8 (and costs us little)
++    sps->pcm_width = (sps->width + 63) >> 6;  // 8 for min size, 8 bits per byte - round up
++    sps->pcm_height = (sps->height + 7) >> 3;
++
++    sps->nb_st_rps = get_ue_golomb_long(gb);
++    if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) {
++        av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
++               sps->nb_st_rps);
++        return AVERROR_INVALIDDATA;
++    }
++    for (i = 0; i < sps->nb_st_rps; i++) {
++        if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
++                                                     sps, 0)) < 0)
++            return ret;
++    }
++
++    sps->long_term_ref_pics_present_flag = get_bits1(gb);
++    if (sps->long_term_ref_pics_present_flag) {
++        sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
++        if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
++            av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
++                   sps->num_long_term_ref_pics_sps);
++            return AVERROR_INVALIDDATA;
++        }
++        for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
++            sps->lt_ref_pic_poc_lsb_sps[i]       = get_bits(gb, sps->log2_max_poc_lsb);
++            sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
++        }
++    }
++
++    sps->sps_temporal_mvp_enabled_flag          = get_bits1(gb);
++    sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG;  // sps->sps_strong_intra_smoothing_enable_flag
++    sps->vui.sar = (AVRational){0, 1};
++    vui_present = get_bits1(gb);
++    if (vui_present)
++        decode_vui(gb, avctx, apply_defdispwin, sps);
++
++    if (get_bits1(gb)) { // sps_extension_flag
++        int sps_extension_flag[1];
++        for (i = 0; i < 1; i++)
++            sps_extension_flag[i] = get_bits1(gb);
++        skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
++        if (sps_extension_flag[0]) {
++            int extended_precision_processing_flag;
++            int cabac_bypass_alignment_enabled_flag;
++
++            sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
++            sps->transform_skip_context_enabled_flag  = get_bits1(gb);
++            sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
++
++            sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
++
++            extended_precision_processing_flag = get_bits1(gb);
++            if (extended_precision_processing_flag)
++                av_log(avctx, AV_LOG_WARNING,
++                   "extended_precision_processing_flag not yet implemented\n");
++
++            if (get_bits1(gb))          // sps->intra_smoothing_disabled_flag
++                sps->intra_filters_disable |= FILTER_EITHER;
++            sps->high_precision_offsets_enabled_flag = get_bits1(gb);
++            sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
++
++            cabac_bypass_alignment_enabled_flag  = get_bits1(gb);
++            if (cabac_bypass_alignment_enabled_flag)
++                av_log(avctx, AV_LOG_WARNING,
++                   "cabac_bypass_alignment_enabled_flag not yet implemented\n");
++        }
++    }
++    if (apply_defdispwin) {
++        sps->output_window.left_offset   += sps->vui.def_disp_win.left_offset;
++        sps->output_window.right_offset  += sps->vui.def_disp_win.right_offset;
++        sps->output_window.top_offset    += sps->vui.def_disp_win.top_offset;
++        sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset;
++    }
++
++    // Reject (or, unless EXPLODE is set, discard) a crop window that
++    // would overflow or leave no visible picture
++    ow = &sps->output_window;
++    if (ow->left_offset >= INT_MAX - ow->right_offset     ||
++        ow->top_offset  >= INT_MAX - ow->bottom_offset    ||
++        ow->left_offset + ow->right_offset  >= sps->width ||
++        ow->top_offset  + ow->bottom_offset >= sps->height) {
++        av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n",
++               ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset);
++        if (avctx->err_recognition & AV_EF_EXPLODE) {
++            return AVERROR_INVALIDDATA;
++        }
++        av_log(avctx, AV_LOG_WARNING,
++               "Displaying the whole video surface.\n");
++        memset(ow, 0, sizeof(*ow));
++        memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
++    }
++
++    // Inferred parameters
++
++    sps->ctb_width  = (sps->width  + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
++    sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
++    sps->ctb_size   = sps->ctb_width * sps->ctb_height;
++
++    sps->min_cb_width  = sps->width  >> sps->log2_min_cb_size;
++    sps->min_cb_height = sps->height >> sps->log2_min_cb_size;
++    sps->min_tb_width  = sps->width  >> sps->log2_min_tb_size;
++    sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
++    sps->min_pu_width  = sps->width  >> LOG2_MIN_PU_SIZE;
++    sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE;
++    sps->tb_mask       = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
++
++    sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
++    sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? sps->bit_depth - 1 : 7));
++
++    // Coded dimensions must be a whole number of minimum coding blocks
++    if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
++        av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
++        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
++               sps->max_transform_hierarchy_depth_inter);
++        return AVERROR_INVALIDDATA;
++    }
++    if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
++        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
++               sps->max_transform_hierarchy_depth_intra);
++        return AVERROR_INVALIDDATA;
++    }
++    if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
++        av_log(avctx, AV_LOG_ERROR,
++               "max transform block size out of range: %d\n",
++               sps->log2_max_trafo_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (get_bits_left(gb) < 0) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Overread SPS by %d bits\n", -get_bits_left(gb));
++        return AVERROR_INVALIDDATA;
++    }
++
++    return 0;
++}
++
++/*
++ * Decode one SPS NAL unit and install it in ps->sps_list.
++ *
++ * A zero-filled buffer is allocated for the new SPS, the raw NAL bytes
++ * are copied into sps->data (truncated with a warning if they do not
++ * fit) and the payload is parsed. If the slot for this id already holds
++ * a byte-identical SPS the existing entry is kept; otherwise the old
++ * entry is removed via remove_sps() and replaced.
++ *
++ * Returns 0 on success, AVERROR(ENOMEM) if allocation fails, or the
++ * error returned by the parser.
++ */
++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
++                               HEVCRpiParamSets *ps, int apply_defdispwin)
++{
++    HEVCRpiSPS *sps;
++    AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps));
++    unsigned int sps_id;
++    ptrdiff_t nal_size;
++    int is_repeat;
++    int ret;
++
++    if (sps_buf == NULL)
++        return AVERROR(ENOMEM);
++    sps = (HEVCRpiSPS *)sps_buf->data;
++
++    av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n");
++
++    // Keep a copy of the raw NAL, clamped to the space available
++    nal_size = gb->buffer_end - gb->buffer;
++    if (nal_size <= sizeof(sps->data)) {
++        sps->data_size = nal_size;
++    } else {
++        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS "
++               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
++               nal_size, sizeof(sps->data));
++        sps->data_size = sizeof(sps->data);
++    }
++    memcpy(sps->data, gb->buffer, sps->data_size);
++
++    ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id, apply_defdispwin,
++                                ps->vps_list, avctx);
++    if (ret < 0) {
++        av_buffer_unref(&sps_buf);
++        return ret;
++    }
++
++    if (avctx->debug & FF_DEBUG_BITSTREAM) {
++        av_log(avctx, AV_LOG_DEBUG,
++               "Parsed SPS: id %d; coded wxh: %dx%d; "
++               "cropped wxh: %dx%d; pix_fmt: %s.\n",
++               sps_id, sps->width, sps->height,
++               sps->width - (sps->output_window.left_offset + sps->output_window.right_offset),
++               sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset),
++               av_get_pix_fmt_name(sps->pix_fmt));
++    }
++
++    /* A repeat of an SPS we already hold keeps the original; anything
++     * else replaces the slot (remove_sps presumably also drops the PPSes
++     * that depend on it) */
++    is_repeat = ps->sps_list[sps_id] &&
++                !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size);
++    if (!is_repeat) {
++        remove_sps(ps, sps_id);
++        ps->sps_list[sps_id] = sps_buf;
++    } else {
++        av_buffer_unref(&sps_buf);
++    }
++
++    return 0;
++}
++
++/*
++ * Buffer free callback for a PPS: releases every array owned by the
++ * structure and then the structure itself. opaque is unused.
++ */
++static void hevc_pps_free(void *opaque, uint8_t *data)
++{
++    HEVCRpiPPS *p = (HEVCRpiPPS *)data;
++
++    // tile geometry
++    av_freep(&p->column_width);
++    av_freep(&p->row_height);
++    av_freep(&p->col_bd);
++    av_freep(&p->row_bd);
++    av_freep(&p->col_idxX);
++
++    // CTB address maps and per-tile tables
++    av_freep(&p->ctb_addr_rs_to_ts);
++    av_freep(&p->ctb_addr_ts_to_rs);
++    av_freep(&p->tile_pos_ts);
++    av_freep(&p->tile_size);
++    av_freep(&p->tile_id);
++    av_freep(&p->ctb_ts_flags);
++
++    // finally the PPS itself
++    av_freep(&p);
++}
++
++/*
++ * Read n_minus_1 + 1 signed Exp-Golomb QP offsets from gb into offsets[].
++ * Each value must lie in [-12, 12].
++ *
++ * Returns 0 on success, AVERROR_INVALIDDATA on the first out-of-range
++ * value (entries before it have already been stored).
++ */
++static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets)
++{
++    unsigned int remaining = n_minus_1;
++    int8_t *dst = offsets;
++
++    // At least one entry is always read
++    for (;;) {
++        const int qp_offset = get_se_golomb_long(gb);
++
++        if (qp_offset < -12 || qp_offset > 12) {
++            av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", qp_offset);
++            return AVERROR_INVALIDDATA;
++        }
++        *dst++ = qp_offset;
++
++        if (remaining-- == 0)
++            break;
++    }
++    return 0;
++}
++
++/*
++ * Parse the PPS range extension syntax from gb into pps.
++ *
++ * sps supplies chroma_format_idc / separate_colour_plane_flag (needed to
++ * validate cross-component prediction) and bit_depth (which bounds the
++ * SAO offset scales).
++ *
++ * Returns 0 on success or AVERROR_INVALIDDATA if a value is out of range.
++ */
++static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx,
++                                HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
++{
++    if (pps->transform_skip_enabled_flag) {
++        pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
++    }
++    pps->cross_component_prediction_enabled_flag = get_bits1(gb);
++    if (pps->cross_component_prediction_enabled_flag &&
++        (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag))
++    {
++        av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n");
++        return AVERROR_INVALIDDATA;
++    }
++    pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
++    if (pps->chroma_qp_offset_list_enabled_flag) {
++        int err;
++
++        pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
++        pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
++        if (pps->chroma_qp_offset_list_len_minus1 > 5) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
++            return AVERROR_INVALIDDATA;
++        }
++        av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n");
++
++        // The Cb list is read in full, then the Cr list
++        if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 ||
++            (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0)
++            return err;
++    }
++
++    {
++        // SAO offset scaling is only permitted above 10 bits
++        const unsigned int max_offset = sps->bit_depth > 10 ? sps->bit_depth - 10 : 0;
++
++        pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
++        if (pps->log2_sao_offset_scale_luma > max_offset) {
++            // newline-terminated so the message does not run into the next log line
++            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid\n");
++            return AVERROR_INVALIDDATA;
++        }
++        pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
++        if (pps->log2_sao_offset_scale_chroma > max_offset) {
++            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid\n");
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    return(0);
++}
++
++static inline int setup_pps(AVCodecContext * const avctx,
++ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
++{
++ int pic_area_in_ctbs;
++ int i, j, x, y, ctb_addr_rs, tile_id;
++
++ // Inferred parameters
++
++ // qp_y -> qp_u/qp_v tables
++ // The tables have at least -24,+24 overrun after adding offset here
++ // which should allow for clipless offseting
++
++ pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code
++ pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0;
++
++ if (sps->chroma_format_idc == 1) {
++ pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
++ pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
++ pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
++ pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
++ }
++ else
++ {
++ pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
++ pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
++ pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
++ pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
++ }
++
++ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd));
++ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd));
++ pps->col_idxX = av_malloc_array(sps->ctb_width, sizeof(*pps->col_idxX));
++ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX)
++ return AVERROR(ENOMEM);
++
++ if (pps->uniform_spacing_flag) {
++ if (!pps->column_width) {
++ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
++ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height));
++ }
++ if (!pps->column_width || !pps->row_height)
++ return AVERROR(ENOMEM);
++
++ for (i = 0; i < pps->num_tile_columns; i++) {
++ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns -
++ (i * sps->ctb_width) / pps->num_tile_columns;
++ }
++
++ for (i = 0; i < pps->num_tile_rows; i++) {
++ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows -
++ (i * sps->ctb_height) / pps->num_tile_rows;
++ }
++ }
++
++ {
++ const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift);
++ pps->col_bd[0] = 0;
++ pps->tile_wpp_inter_disable = 0;
++ for (i = 0; i < pps->num_tile_columns; i++)
++ {
++ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i];
++
++ // Avoid trying tile parallel if the columns don't fall on cache boundries
++ // (this causes too much pain syncing flushes with the QPU)
++ // Ignore the final (RHS of pic) tile boundry
++ if ((pps->col_bd[i] & td_mask) != 0) {
++ pps->tile_wpp_inter_disable = 1;
++ }
++ }
++
++ // If we can start the next row before finishing the first line of
++ // this one then we must wait at the end of the tile
++ // * if this happens a lot then there are better but more complicated
++ // conditions that we could apply
++ if (pps->tile_wpp_inter_disable) {
++ for (i = 0; i < pps->num_tile_rows; i++)
++ {
++ if (pps->row_height[i] <= RPI_MAX_JOBS) {
++ pps->tile_wpp_inter_disable = 2;
++ break;
++ }
++ }
++ }
++ }
++
++ pps->row_bd[0] = 0;
++ for (i = 0; i < pps->num_tile_rows; i++)
++ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i];
++
++ for (i = 0, j = 0; i < sps->ctb_width; i++) {
++ if (i >= pps->col_bd[j + 1])
++ j++;
++ pps->col_idxX[i] = j;
++ }
++
++ /**
++ * 6.5
++ */
++ pic_area_in_ctbs = sps->ctb_size;
++
++ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts));
++ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs));
++ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id));
++ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size));
++ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts));
++ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags));
++ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
++ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) {
++ return AVERROR(ENOMEM);
++ }
++
++ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags));
++
++ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) {
++ int tb_x = ctb_addr_rs % sps->ctb_width;
++ int tb_y = ctb_addr_rs / sps->ctb_width;
++ int tile_x = 0;
++ int tile_y = 0;
++ int val = 0;
++
++ for (i = 0; i < pps->num_tile_columns; i++) {
++ if (tb_x < pps->col_bd[i + 1]) {
++ tile_x = i;
++ break;
++ }
++ }
++
++ for (i = 0; i < pps->num_tile_rows; i++) {
++ if (tb_y < pps->row_bd[i + 1]) {
++ tile_y = i;
++ break;
++ }
++ }
++
++ for (i = 0; i < tile_x; i++)
++ val += pps->row_height[tile_y] * pps->column_width[i];
++ for (i = 0; i < tile_y; i++)
++ val += sps->ctb_width * pps->row_height[i];
++
++ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] +
++ tb_x - pps->col_bd[tile_x];
++
++ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val;
++ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs;
++ }
++
++ {
++ uint8_t * pflags = pps->ctb_ts_flags;
++ uint16_t * ptid = pps->tile_id;
++
++ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
++ {
++ for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
++ {
++ const unsigned int tile_w = pps->column_width[i];
++
++ pflags[0] |= CTB_TS_FLAGS_CIREQ;
++
++ for (x = 0; x != tile_w; ++x) {
++ pflags[x] |= CTB_TS_FLAGS_TOT;
++ }
++
++ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
++ {
++ pflags[0] |= CTB_TS_FLAGS_SOTL;
++
++ if (pps->entropy_coding_sync_enabled_flag)
++ {
++ if (pps->column_width[i] != 1)
++ pflags[1] |= CTB_TS_FLAGS_CSAVE;
++ else
++ pflags[0] |= CTB_TS_FLAGS_CIREQ;
++
++ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0)
++ pflags[0] |= CTB_TS_FLAGS_CLOAD;
++ }
++
++ for (x = 0; x != tile_w; ++x)
++ *ptid++ = tile_id;
++
++ pflags += tile_w;
++ pflags[-1] |= CTB_TS_FLAGS_EOTL;
++ if (i + 1 == pps->num_tile_columns)
++ pflags[-1] |= CTB_TS_FLAGS_EOL;
++ }
++
++ pflags[-1] |= CTB_TS_FLAGS_EOT;
++ }
++ }
++ }
++
++ {
++ unsigned int ts = 0;
++ for (j = 0; j < pps->num_tile_rows; j++)
++ for (i = 0; i < pps->num_tile_columns; i++)
++ {
++ const unsigned int size = pps->column_width[i] * pps->row_height[j];
++ pps->tile_size[j * pps->num_tile_columns + i] = size;
++ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts;
++ ts += size;
++ }
++ }
++
++ return 0;
++}
++
++/**
++ * Parse a PPS NAL unit and, on success, store it in ps->pps_list[].
++ *
++ * The raw payload is first copied into pps->data (truncated with a warning if
++ * it is larger than that buffer). The coded fields are then read and range
++ * checked against the SPS they reference, setup_pps() builds the derived
++ * tables, and finally remove_pps() is called for the id before the new buffer
++ * is installed.
++ *
++ * @param gb    bit reader positioned at the start of the PPS payload
++ * @param avctx codec context, used for logging only in this function
++ * @param ps    parameter set store; the SPS named by the PPS must already be
++ *              present in ps->sps_list[]
++ * @return 0 on success, AVERROR(ENOMEM) on allocation failure,
++ *         AVERROR_INVALIDDATA (or the error returned by a sub-parser) if the
++ *         PPS is malformed. On failure ps is left untouched.
++ */
++int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx,
++                           HEVCRpiParamSets * const ps)
++{
++    const HEVCRpiSPS *sps = NULL;
++    int i, ret = 0;
++    unsigned int pps_id = 0;
++    ptrdiff_t nal_size;
++    unsigned log2_parallel_merge_level_minus2;
++
++    AVBufferRef *pps_buf;
++    HEVCRpiPPS *pps = av_mallocz(sizeof(*pps));
++
++    if (!pps)
++        return AVERROR(ENOMEM);
++
++    // From here on the PPS is owned by pps_buf; hevc_pps_free releases it
++    pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps),
++                               hevc_pps_free, NULL, 0);
++    if (!pps_buf) {
++        av_freep(&pps);
++        return AVERROR(ENOMEM);
++    }
++
++    av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n");
++
++    // Keep a copy of the raw NAL payload alongside the parsed fields
++    nal_size = gb->buffer_end - gb->buffer;
++    if (nal_size > sizeof(pps->data)) {
++        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS "
++               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
++               nal_size, sizeof(pps->data));
++        pps->data_size = sizeof(pps->data);
++    } else {
++        pps->data_size = nal_size;
++    }
++    memcpy(pps->data, gb->buffer, pps->data_size);
++
++    // Default values
++    pps->loop_filter_across_tiles_enabled_flag = 1;
++    pps->num_tile_columns                      = 1;
++    pps->num_tile_rows                         = 1;
++    pps->uniform_spacing_flag                  = 1;
++    pps->disable_dbf                           = 0;
++    pps->beta_offset                           = 0;
++    pps->tc_offset                             = 0;
++    pps->log2_max_transform_skip_block_size    = 2;
++
++    // Coded parameters
++    pps_id = get_ue_golomb_long(gb);
++    if (pps_id >= HEVC_MAX_PPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %u\n", pps_id);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->sps_id = get_ue_golomb_long(gb);
++    if (pps->sps_id >= HEVC_MAX_SPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %u\n", pps->sps_id);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    if (!ps->sps_list[pps->sps_id]) {
++        av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data;
++
++    pps->dependent_slice_segments_enabled_flag = get_bits1(gb);
++    pps->output_flag_present_flag              = get_bits1(gb);
++    pps->num_extra_slice_header_bits           = get_bits(gb, 3);
++
++    pps->sign_data_hiding_flag = get_bits1(gb);
++
++    pps->cabac_init_present_flag = get_bits1(gb);
++
++    pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1;
++    if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) {
++        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n");
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1;
++    if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) {
++        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n");
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++
++    pps->pic_init_qp_minus26 = get_se_golomb(gb);
++    if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) {
++        av_log(avctx, AV_LOG_ERROR,
++               "init_qp_minus26 %d is outside the valid range "
++               "[%d, %d].\n",
++               pps->pic_init_qp_minus26,
++               -(26 + sps->qp_bd_offset), 25);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++
++    pps->constrained_intra_pred_flag = get_bits1(gb);
++    pps->transform_skip_enabled_flag = get_bits1(gb);
++
++    pps->cu_qp_delta_enabled_flag = get_bits1(gb);
++    // Without cu_qp_delta the delta size defaults to the full CTB
++    pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size;
++    if (pps->cu_qp_delta_enabled_flag)
++    {
++        const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
++
++        if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
++            av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %u is invalid\n",
++                   diff_cu_qp_delta_depth);
++            ret = AVERROR_INVALIDDATA;
++            goto err;
++        }
++
++        pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth;
++    }
++
++    pps->cb_qp_offset = get_se_golomb(gb);
++    if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
++        av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
++               pps->cb_qp_offset);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->cr_qp_offset = get_se_golomb(gb);
++    if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) {
++        av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n",
++               pps->cr_qp_offset);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb);
++
++    pps->weighted_pred_flag   = get_bits1(gb);
++    pps->weighted_bipred_flag = get_bits1(gb);
++
++    pps->transquant_bypass_enable_flag    = get_bits1(gb);
++    pps->tiles_enabled_flag               = get_bits1(gb);
++    pps->entropy_coding_sync_enabled_flag = get_bits1(gb);
++
++    if (pps->tiles_enabled_flag) {
++        pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
++        pps->num_tile_rows    = get_ue_golomb_long(gb) + 1;
++        // There can be at most one tile column/row per CTB column/row.
++        // Allowing more would give zero-sized tiles with uniform spacing,
++        // which setup_pps() cannot handle (it indexes the last CTB of each tile).
++        if (pps->num_tile_columns <= 0 ||
++            pps->num_tile_columns > sps->ctb_width) {
++            av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
++                   pps->num_tile_columns - 1);
++            ret = AVERROR_INVALIDDATA;
++            goto err;
++        }
++        if (pps->num_tile_rows <= 0 ||
++            pps->num_tile_rows > sps->ctb_height) {
++            av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
++                   pps->num_tile_rows - 1);
++            ret = AVERROR_INVALIDDATA;
++            goto err;
++        }
++
++        pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
++        pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
++        if (!pps->column_width || !pps->row_height) {
++            ret = AVERROR(ENOMEM);
++            goto err;
++        }
++
++        pps->uniform_spacing_flag = get_bits1(gb);
++        if (!pps->uniform_spacing_flag) {
++            uint64_t sum = 0;
++            for (i = 0; i < pps->num_tile_columns - 1; i++) {
++                // Validate before narrowing: column_width[] is uint16_t, so an
++                // oversized value (or one that wrapped to 0) would otherwise be
++                // truncated and slip past the sum check below.
++                const unsigned int w = get_ue_golomb_long(gb) + 1;
++                if (w == 0 || w > UINT16_MAX || w >= (unsigned int)sps->ctb_width) {
++                    av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
++                    ret = AVERROR_INVALIDDATA;
++                    goto err;
++                }
++                pps->column_width[i] = w;
++                sum                 += w;
++            }
++            if (sum >= sps->ctb_width) {
++                av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            // The last column takes whatever is left of the picture width
++            pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum;
++
++            sum = 0;
++            for (i = 0; i < pps->num_tile_rows - 1; i++) {
++                // Same narrowing hazard as for the column widths above
++                const unsigned int h = get_ue_golomb_long(gb) + 1;
++                if (h == 0 || h > UINT16_MAX || h >= (unsigned int)sps->ctb_height) {
++                    av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
++                    ret = AVERROR_INVALIDDATA;
++                    goto err;
++                }
++                pps->row_height[i] = h;
++                sum               += h;
++            }
++            if (sum >= sps->ctb_height) {
++                av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            // The last row takes whatever is left of the picture height
++            pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum;
++        }
++        pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb);
++    }
++
++    pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb);
++
++    pps->deblocking_filter_control_present_flag = get_bits1(gb);
++    if (pps->deblocking_filter_control_present_flag) {
++        pps->deblocking_filter_override_enabled_flag = get_bits1(gb);
++        pps->disable_dbf                             = get_bits1(gb);
++        if (!pps->disable_dbf) {
++            int beta_offset_div2 = get_se_golomb(gb);
++            int tc_offset_div2   = get_se_golomb(gb);
++            if (beta_offset_div2 < -6 || beta_offset_div2 > 6) {
++                av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n",
++                       beta_offset_div2);
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            if (tc_offset_div2 < -6 || tc_offset_div2 > 6) {
++                av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n",
++                       tc_offset_div2);
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            // Stored doubled, i.e. as the real offsets
++            pps->beta_offset = 2 * beta_offset_div2;
++            pps->tc_offset   = 2 * tc_offset_div2;
++        }
++    }
++
++    pps->scaling_list_data_present_flag = get_bits1(gb);
++    if (pps->scaling_list_data_present_flag) {
++        set_default_scaling_list_data(&pps->scaling_list);
++        ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
++        if (ret < 0)
++            goto err;
++    }
++    pps->lists_modification_present_flag = get_bits1(gb);
++    log2_parallel_merge_level_minus2     = get_ue_golomb_long(gb);
++    if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) {
++        av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %u\n",
++               log2_parallel_merge_level_minus2);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2;
++
++    pps->slice_header_extension_present_flag = get_bits1(gb);
++
++    if (get_bits1(gb)) { // pps_extension_present_flag
++        int pps_range_extensions_flag = get_bits1(gb);
++        skip_bits(gb, 7); // pps_extension_7bits
++        // Range extensions are only parsed for streams signalling the RExt profile
++        if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) {
++            if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
++                goto err;
++        }
++    }
++
++    // Build the derived tile / CTB address tables
++    ret = setup_pps(avctx, pps, sps);
++    if (ret < 0)
++        goto err;
++
++    if (get_bits_left(gb) < 0) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Overread PPS by %d bits\n", -get_bits_left(gb));
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++
++    // Only touch the parameter set store once the new PPS is known to be good
++    remove_pps(ps, pps_id);
++    ps->pps_list[pps_id] = pps_buf;
++
++    return 0;
++
++err:
++    // Dropping the only reference frees the PPS through hevc_pps_free
++    av_buffer_unref(&pps_buf);
++    return ret;
++}
++
++/*
++ * Derive the full picture order count from its coded LSBs.
++ *
++ * pocTid0 supplies the previous LSB/MSB split; the MSB part is moved up or
++ * down by one LSB period when the new LSB value is at least (respectively
++ * more than) half a period away from the previous one. BLA pictures always
++ * use an MSB part of 0.
++ */
++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type)
++{
++    const int lsb_period = 1 << sps->log2_max_poc_lsb;
++    const int half       = lsb_period / 2;
++    const int ref_lsb    = pocTid0 % lsb_period;
++    int msb              = pocTid0 - ref_lsb;
++
++    // For BLA picture types, POCmsb is set to 0.
++    switch (nal_unit_type) {
++    case HEVC_NAL_BLA_W_LP:
++    case HEVC_NAL_BLA_W_RADL:
++    case HEVC_NAL_BLA_N_LP:
++        return poc_lsb;
++    default:
++        break;
++    }
++
++    if (poc_lsb < ref_lsb && ref_lsb - poc_lsb >= half)
++        msb += lsb_period;      // LSBs wrapped forwards
++    else if (poc_lsb > ref_lsb && poc_lsb - ref_lsb > half)
++        msb -= lsb_period;      // LSBs wrapped backwards
++
++    return msb + poc_lsb;
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevc_ps.h
+@@ -0,0 +1,449 @@
++/*
++ * HEVC parameter set parsing
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_PS_H
++#define AVCODEC_RPI_HEVC_PS_H
++
++#include <stdint.h>
++
++#include "libavutil/buffer.h"
++#include "libavutil/pixfmt.h"
++#include "libavutil/rational.h"
++
++#include "avcodec.h"
++#include "get_bits.h"
++#include "hevc.h"
++
++/* Short-term reference picture set; this is the type written through the
++ * rps argument of ff_hevc_rpi_decode_short_term_rps(). */
++typedef struct ShortTermRPS {
++ unsigned int num_negative_pics;
++ int num_delta_pocs;
++ int rps_idx_num_delta_pocs;
++ int32_t delta_poc[32]; // presumably only the first num_delta_pocs entries are valid - TODO confirm in the parser
++ uint8_t used[32]; // same length as delta_poc[]; presumably one flag per entry - TODO confirm
++} ShortTermRPS;
++
++/* Long-term reference picture set (embedded in RpiSliceHeader as long_term_rps). */
++typedef struct LongTermRPS {
++ int poc[32];
++ uint8_t used[32];
++ uint8_t nb_refs; // presumably the number of valid poc[]/used[] entries - TODO confirm
++} LongTermRPS;
++
++/* Parsed slice (segment) header fields plus a few derived values. */
++typedef struct RpiSliceHeader {
++ unsigned int pps_id;
++
++ /// address (in raster order) of the first block in the current slice segment
++ unsigned int slice_segment_addr;
++ /// address (in raster order) of the first block in the current slice
++ unsigned int slice_addr;
++
++ enum HEVCSliceType slice_type;
++
++ int pic_order_cnt_lsb;
++
++ uint8_t first_slice_in_pic_flag;
++ uint8_t dependent_slice_segment_flag;
++ uint8_t pic_output_flag; // when set, a new reference frame is also flagged for output
++ uint8_t colour_plane_id;
++
++ /// RPS coded in the slice header itself is stored here (presumably refers to slice_rps below)
++ int short_term_ref_pic_set_sps_flag;
++ int short_term_ref_pic_set_size;
++ ShortTermRPS slice_rps;
++ const ShortTermRPS *short_term_rps;
++ int long_term_ref_pic_set_size;
++ LongTermRPS long_term_rps;
++ unsigned int list_entry_lx[2][32];
++
++ uint8_t rpl_modification_flag[2];
++ uint8_t no_output_of_prior_pics_flag; // checked together with no_rasl_output_flag when outputting frames
++ uint8_t slice_temporal_mvp_enabled_flag;
++
++ unsigned int nb_refs[2];
++
++ uint8_t slice_sample_adaptive_offset_flag[3];
++ uint8_t mvd_l1_zero_flag;
++
++ uint8_t cabac_init_flag;
++ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
++ uint8_t slice_loop_filter_across_slices_enabled_flag;
++ uint8_t collocated_list;
++
++ uint8_t no_dblk_boundary_flags;
++
++ unsigned int collocated_ref_idx;
++
++ int slice_qp_delta;
++ int slice_cb_qp_offset; // -12, +12
++ int slice_cr_qp_offset; // -12, +12
++
++ uint8_t cu_chroma_qp_offset_enabled_flag;
++
++ int beta_offset; ///< beta_offset_div2 * 2
++ int tc_offset; ///< tc_offset_div2 * 2
++
++ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
++
++ unsigned *entry_point_offset;
++ int * offset;
++ int * size;
++ int num_entry_point_offsets;
++ int offsets_allocated; // presumably the allocated capacity of the three arrays above - TODO confirm
++
++ uint8_t offload_wpp;
++ uint8_t offload_tiles;
++
++ int8_t slice_qp;
++
++ uint8_t luma_log2_weight_denom;
++ uint8_t chroma_log2_weight_denom;
++
++ int16_t luma_weight_l0[16]; // -128, +255
++ int16_t luma_offset_l0[16];
++ int16_t chroma_weight_l0[16][2];
++ int16_t chroma_offset_l0[16][2];
++
++ int16_t luma_weight_l1[16];
++ int16_t luma_offset_l1[16];
++ int16_t chroma_weight_l1[16][2];
++ int16_t chroma_offset_l1[16][2];
++
++} RpiSliceHeader;
++
++/* Four edge offsets describing a window inside a picture. The SPS
++ * output_window values are copied to the AVFrame crop_* fields by
++ * ff_hevc_rpi_set_new_ref(). */
++typedef struct HEVCRpiWindow {
++ uint16_t left_offset;
++ uint16_t right_offset;
++ uint16_t top_offset;
++ uint16_t bottom_offset;
++} HEVCRpiWindow;
++
++/* VUI parameters; stored inside HEVCRpiSPS as the vui member. */
++typedef struct VUI {
++ AVRational sar;
++
++ int overscan_info_present_flag;
++ int overscan_appropriate_flag;
++
++ int video_signal_type_present_flag;
++ int video_format;
++ int video_full_range_flag;
++ int colour_description_present_flag;
++ uint8_t colour_primaries;
++ uint8_t transfer_characteristic;
++ uint8_t matrix_coeffs;
++
++ int chroma_loc_info_present_flag;
++ int chroma_sample_loc_type_top_field;
++ int chroma_sample_loc_type_bottom_field;
++ int neutra_chroma_indication_flag; // NOTE(review): name looks like a misspelling of "neutral"; left as is because it is an identifier
++
++ int field_seq_flag;
++ int frame_field_info_present_flag;
++
++ int default_display_window_flag;
++ HEVCRpiWindow def_disp_win;
++
++ int vui_timing_info_present_flag;
++ uint32_t vui_num_units_in_tick;
++ uint32_t vui_time_scale;
++ int vui_poc_proportional_to_timing_flag;
++ int vui_num_ticks_poc_diff_one_minus1;
++ int vui_hrd_parameters_present_flag;
++
++ int bitstream_restriction_flag;
++ int tiles_fixed_structure_flag;
++ int motion_vectors_over_pic_boundaries_flag;
++ int restricted_ref_pic_lists_flag;
++ int min_spatial_segmentation_idc;
++ int max_bytes_per_pic_denom;
++ int max_bits_per_min_cu_denom;
++ int log2_max_mv_length_horizontal;
++ int log2_max_mv_length_vertical;
++} VUI;
++
++/* One set of profile / tier / level fields; used by PTL both for the general
++ * entry and for each sub-layer entry. */
++typedef struct PTLCommon {
++ uint8_t profile_space;
++ uint8_t tier_flag;
++ uint8_t profile_idc; // compared against FF_PROFILE_HEVC_REXT when parsing PPS range extensions
++ uint8_t profile_compatibility_flag[32];
++ uint8_t level_idc;
++ uint8_t progressive_source_flag;
++ uint8_t interlaced_source_flag;
++ uint8_t non_packed_constraint_flag;
++ uint8_t frame_only_constraint_flag;
++} PTLCommon;
++
++/* Profile / tier / level information: one general entry plus one entry and
++ * two presence flags per possible sub-layer (HEVC_MAX_SUB_LAYERS). */
++typedef struct PTL {
++ PTLCommon general_ptl;
++ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS];
++
++ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS];
++ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS];
++} PTL;
++
++/* Parsed VPS fields. */
++typedef struct HEVCRpiVPS {
++ uint8_t vps_temporal_id_nesting_flag;
++ int vps_max_layers;
++ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1
++
++ PTL ptl;
++ int vps_sub_layer_ordering_info_present_flag;
++ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS];
++ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS];
++ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS];
++ int vps_max_layer_id;
++ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1
++ uint8_t vps_timing_info_present_flag;
++ uint32_t vps_num_units_in_tick;
++ uint32_t vps_time_scale;
++ uint8_t vps_poc_proportional_to_timing_flag;
++ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1
++ int vps_num_hrd_parameters;
++
++ uint8_t data[4096]; // presumably a copy of the raw NAL payload, as is done for the PPS - TODO confirm in the VPS parser
++ int data_size; // presumably the number of valid bytes in data[] - TODO confirm
++} HEVCRpiVPS;
++
++/* Scaling list coefficients; one instance lives in the SPS and one in the PPS. */
++typedef struct ScalingList {
++ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs,
++ * and size ID 3 only has 2 arrays, not 6. */
++ uint8_t sl[4][6][64]; // first index is the sizeID referred to in the comment above
++ uint8_t sl_dc[2][6];
++} ScalingList;
++
++/* Parsed SPS fields plus values derived from them. */
++typedef struct HEVCRpiSPS {
++ unsigned vps_id;
++ uint8_t chroma_format_idc;
++ uint8_t separate_colour_plane_flag;
++
++ HEVCRpiWindow output_window; // copied to the AVFrame crop_* fields by ff_hevc_rpi_set_new_ref()
++
++ HEVCRpiWindow pic_conf_win;
++
++ uint16_t wp_offset_half_range; // WpOffsetHalfRange
++
++ uint8_t bit_depth;
++
++// int bit_depth_chroma; // We only support luma_bit_depth = chroma_bit_depth
++ uint8_t pixel_shift;
++ enum AVPixelFormat pix_fmt;
++
++ unsigned int log2_max_poc_lsb; // POC LSB period is 1 << log2_max_poc_lsb (see ff_hevc_rpi_compute_poc())
++
++ int max_sub_layers;
++ struct {
++ int max_dec_pic_buffering;
++ int num_reorder_pics;
++ int max_latency_increase;
++ } temporal_layer[HEVC_MAX_SUB_LAYERS];
++ uint8_t temporal_id_nesting_flag;
++
++ uint8_t scaling_list_enable_flag;
++ ScalingList scaling_list;
++
++ unsigned int nb_st_rps;
++ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS];
++
++ uint8_t amp_enabled_flag;
++ uint8_t sao_enabled;
++
++ uint8_t long_term_ref_pics_present_flag;
++ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS];
++ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS];
++ uint8_t num_long_term_ref_pics_sps;
++
++ struct {
++ uint8_t bit_depth;
++ uint8_t bit_depth_chroma;
++ uint8_t log2_min_pcm_cb_size;
++ uint8_t log2_max_pcm_cb_size;
++ uint8_t loop_filter_disable_flag;
++ } pcm;
++ char sps_temporal_mvp_enabled_flag;
++// char sps_strong_intra_smoothing_enable_flag; -> intra_filters_disable
++
++ uint8_t log2_min_cb_size; // 3..6
++ uint8_t log2_diff_max_min_coding_block_size;
++ uint8_t log2_min_tb_size; // 2..5
++ uint8_t log2_max_trafo_size;
++ uint8_t log2_ctb_size; // 4..6
++// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1)
++#define LOG2_MIN_PU_SIZE 2
++#define LOG2_MIN_CU_SIZE 3
++
++ uint8_t max_transform_hierarchy_depth_inter;
++ uint8_t max_transform_hierarchy_depth_intra;
++
++ char transform_skip_rotation_enabled_flag;
++ char transform_skip_context_enabled_flag;
++ char implicit_rdpcm_enabled_flag;
++ char explicit_rdpcm_enabled_flag;
++// char intra_smoothing_disabled_flag; -> intra_filters_disable
++ char high_precision_offsets_enabled_flag;
++ char persistent_rice_adaptation_enabled_flag;
++
++ uint8_t intra_filters_disable;
++
++ /// coded frame dimension in various units
++ int width;
++ int height;
++ int ctb_width;
++ int ctb_height;
++ int ctb_size; // Pic size in CTBs not size of a CTB
++ int min_cb_width;
++ int min_cb_height;
++ int min_tb_width;
++ int min_tb_height;
++ int min_pu_width;
++ int min_pu_height;
++ int pcm_width;
++ int pcm_height;
++ int tb_mask;
++
++ int hshift[3];
++ int vshift[3];
++
++ int qp_bd_offset; // used for the lower bound of pic_init_qp_minus26: -(26 + qp_bd_offset)
++
++ uint8_t data[4096]; // presumably a copy of the raw NAL payload, as is done for the PPS - TODO confirm in the SPS parser
++ int data_size; // presumably the number of valid bytes in data[] - TODO confirm
++
++ VUI vui;
++ PTL ptl;
++} HEVCRpiSPS;
++
++/* Per-CTB flag bits stored in HEVCRpiPPS.ctb_ts_flags[] (filled in by setup_pps()). */
++#define CTB_TS_FLAGS_SOTL (1U << 0) // X start of tile line
++#define CTB_TS_FLAGS_EOTL (1U << 1) // Last CTB of a tile line
++#define CTB_TS_FLAGS_EOL (1U << 2) // Last CTB of a complete line
++#define CTB_TS_FLAGS_EOT (1U << 3) // Last CTB of a tile
++#define CTB_TS_FLAGS_CSAVE (1U << 4) // With entropy coding sync: set on the 2nd CTB of a tile line (presumably "save CABAC state here" - TODO confirm)
++#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request
++#define CTB_TS_FLAGS_TOT (1U << 6) // CTB on top row of a tile
++#define CTB_TS_FLAGS_CLOAD (1U << 7) // With entropy coding sync: set at the start of a tile line that has no CIREQ (presumably "load saved CABAC state" - TODO confirm)
++
++/* Parsed PPS fields plus the tables that setup_pps() derives from them.
++ * Instances are created and filled in by ff_hevc_rpi_decode_nal_pps(). */
++typedef struct HEVCRpiPPS {
++ unsigned int sps_id; ///< seq_parameter_set_id
++
++ uint8_t sign_data_hiding_flag;
++
++ uint8_t cabac_init_present_flag;
++
++ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1
++ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1
++ int pic_init_qp_minus26;
++
++ uint8_t constrained_intra_pred_flag;
++ uint8_t transform_skip_enabled_flag;
++
++ uint8_t cu_qp_delta_enabled_flag;
++ uint8_t log2_min_cu_qp_delta_size; // log2_ctb_size - diff_cu_qp_delta_depth; equals log2_ctb_size when cu_qp_delta is disabled
++ int cb_qp_offset; // -12..12
++ int cr_qp_offset; // -12..12
++ const uint8_t * qp_dblk_x[3]; // lookup table pointers chosen in setup_pps(); [1]/[2] are pre-offset by cb_qp_offset/cr_qp_offset
++ const int8_t * qp_bd_x[3]; // as qp_dblk_x, but selected per bit depth
++
++ uint8_t pic_slice_level_chroma_qp_offsets_present_flag;
++ uint8_t weighted_pred_flag;
++ uint8_t weighted_bipred_flag;
++ uint8_t output_flag_present_flag;
++ uint8_t transquant_bypass_enable_flag;
++
++ uint8_t dependent_slice_segments_enabled_flag;
++ uint8_t tiles_enabled_flag;
++ uint8_t entropy_coding_sync_enabled_flag;
++
++ uint8_t tile_wpp_inter_disable; // 0: tile column starts are all aligned; 1: some are not; 2: as 1 and some tile row is <= RPI_MAX_JOBS CTBs high
++ int num_tile_columns; ///< num_tile_columns_minus1 + 1
++ int num_tile_rows; ///< num_tile_rows_minus1 + 1
++ uint8_t uniform_spacing_flag;
++ uint8_t loop_filter_across_tiles_enabled_flag;
++
++ uint8_t seq_loop_filter_across_slices_enabled_flag;
++
++ uint8_t deblocking_filter_control_present_flag;
++ uint8_t deblocking_filter_override_enabled_flag;
++ uint8_t disable_dbf;
++ int beta_offset; ///< beta_offset_div2 * 2
++ int tc_offset; ///< tc_offset_div2 * 2
++
++ uint8_t scaling_list_data_present_flag;
++ ScalingList scaling_list;
++
++ uint8_t lists_modification_present_flag;
++ int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
++ int num_extra_slice_header_bits;
++ uint8_t slice_header_extension_present_flag;
++ uint8_t log2_max_transform_skip_block_size;
++ uint8_t cross_component_prediction_enabled_flag;
++ uint8_t chroma_qp_offset_list_enabled_flag;
++ uint8_t diff_cu_chroma_qp_offset_depth;
++ uint8_t chroma_qp_offset_list_len_minus1;
++ int8_t cb_qp_offset_list[6];
++ int8_t cr_qp_offset_list[6];
++ uint8_t log2_sao_offset_scale_luma;
++ uint8_t log2_sao_offset_scale_chroma;
++
++ // Inferred parameters
++ uint16_t *column_width; ///< ColumnWidth
++ uint16_t *row_height; ///< RowHeight
++ uint16_t *col_bd; ///< ColBd
++ uint16_t *row_bd; ///< RowBd
++ uint16_t *col_idxX; ///< tile column index of each CTB column (ctb_width entries)
++
++ // We can limit these to uint16_t given our other size limits
++ uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
++ uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
++ uint16_t *tile_id; ///< TileId
++ uint16_t *tile_pos_ts; ///< tile-scan address of the first CTB of each tile
++ uint16_t *tile_size; ///< TileSize
++ uint8_t * ctb_ts_flags; ///< CTB_TS_FLAGS_* bits for each CTB, in tile-scan order
++
++ uint8_t data[4096]; // copy of the raw NAL payload (truncated if it does not fit)
++ int data_size; // number of valid bytes in data[]
++} HEVCRpiPPS;
++
++/* Store of all received parameter sets, indexed by id, plus pointers to the
++ * ones currently in use. */
++typedef struct HEVCRpiParamSets {
++ /* currently active parameter sets */
++ const HEVCRpiVPS *vps;
++ const HEVCRpiSPS *sps;
++ const HEVCRpiPPS *pps;
++
++ /* one buffer reference per id; NULL when no set with that id is stored */
++ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
++ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT];
++ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT];
++} HEVCRpiParamSets;
++
++/* Parameter set NAL parsers. The PPS parser returns 0 on success and a
++ * negative AVERROR code on failure; the VPS and SPS parsers presumably follow
++ * the same convention - TODO confirm against their definitions. */
++int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
++ HEVCRpiParamSets *ps);
++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
++ HEVCRpiParamSets *ps, int apply_defdispwin);
++int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
++ HEVCRpiParamSets *ps);
++
++/* Parses a short-term RPS into *rps; is_slice_header presumably selects
++ * between the slice header and SPS syntax variants - TODO confirm. */
++int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
++ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header);
++
++int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id,
++ uint8_t *buf, int buf_size);
++
++/**
++ * Compute POC of the current frame and return it.
++ *
++ * pocTid0 is the reference POC that supplies the previous LSB/MSB split,
++ * poc_lsb the coded LSBs of the current picture. For the BLA NAL unit types
++ * the MSB part is forced to 0, so poc_lsb is returned unchanged.
++ */
++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);
++
++#endif /* AVCODEC_RPI_HEVC_PS_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_refs.c
+@@ -0,0 +1,485 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "internal.h"
++#include "thread.h"
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++
++/*
++ * Clear the given flag bits on a DPB entry; once no flag is left the picture
++ * buffer and the collocated MV buffer are released.
++ * Entries without an attached picture buffer are left untouched.
++ */
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags)
++{
++    /* frame->frame can be NULL if context init failed */
++    if (frame->frame == NULL || frame->frame->buf[0] == NULL)
++        return;
++
++    frame->flags &= ~flags;
++    if (frame->flags != 0)
++        return;     /* still wanted for something else */
++
++    ff_thread_release_buffer(s->avctx, &frame->tf);
++
++    av_buffer_unref(&frame->col_mvf_buf);  // OK if already NULL
++    frame->col_mvf        = NULL;
++    frame->collocated_ref = NULL;
++}
++
++/* Drop the short-term and long-term reference flags from every DPB entry. */
++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s)
++{
++    HEVCRpiFrame *f = s->DPB;
++    HEVCRpiFrame *const end = s->DPB + FF_ARRAY_ELEMS(s->DPB);
++
++    for (; f != end; f++)
++        ff_hevc_rpi_unref_frame(s, f,
++                                HEVC_FRAME_FLAG_SHORT_REF |
++                                HEVC_FRAME_FLAG_LONG_REF);
++}
++
++/* Clear every flag on every DPB entry, which releases all held frames. */
++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s)
++{
++    HEVCRpiFrame *f = s->DPB;
++    HEVCRpiFrame *const end = s->DPB + FF_ARRAY_ELEMS(s->DPB);
++
++    while (f != end) {
++        ff_hevc_rpi_unref_frame(s, f, ~0);
++        f++;
++    }
++}
++
++/*
++ * Take the first DPB slot without a picture buffer and attach a new buffer to
++ * it. A collocated MV buffer is added as well when the picture is used for
++ * reference and is not an IRAP.
++ * Returns NULL if the DPB is full or any allocation fails.
++ */
++static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s)
++{
++    HEVCRpiFrame *slot = NULL;
++    unsigned int n;
++
++    /* Only the first free slot is ever tried */
++    for (n = 0; n != FF_ARRAY_ELEMS(s->DPB); n++) {
++        if (!s->DPB[n].frame->buf[0]) {
++            slot = &s->DPB[n];
++            break;
++        }
++    }
++
++    if (slot == NULL) {
++        av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n");
++        return NULL;
++    }
++
++    if (ff_thread_get_buffer(s->avctx, &slot->tf, AV_GET_BUFFER_FLAG_REF) < 0)
++        return NULL;
++
++    slot->col_mvf     = NULL;
++    slot->col_mvf_buf = NULL;
++    if (s->used_for_ref && !s->is_irap) {
++        slot->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool);
++        if (slot->col_mvf_buf == NULL) {
++            /* Give the picture buffer back again */
++            ff_hevc_rpi_unref_frame(s, slot, ~0);
++            return NULL;
++        }
++        slot->col_mvf = (ColMvField *)slot->col_mvf_buf->data;
++    }
++
++    /* Field information comes from the picture timing SEI */
++    slot->frame->top_field_first =
++        s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
++    slot->frame->interlaced_frame =
++        (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) ||
++        (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
++
++    return slot;
++}
++
++/*
++ * Allocate a DPB entry for the picture with the given POC and make it the
++ * current reference (s->ref). *frame receives the entry's AVFrame.
++ * Returns 0 on success, AVERROR_INVALIDDATA if the POC already exists in the
++ * sequence being decoded, AVERROR(ENOMEM) if no entry could be allocated.
++ */
++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc)
++{
++    HEVCRpiFrame *new_ref;
++    unsigned int n;
++
++    /* check that this POC doesn't already exist */
++    for (n = 0; n != FF_ARRAY_ELEMS(s->DPB); n++) {
++        const HEVCRpiFrame *const cur = s->DPB + n;
++
++        if (!cur->frame->buf[0])
++            continue;
++        if (cur->sequence != s->seq_decode || cur->poc != poc)
++            continue;
++
++        av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n",
++               poc);
++        return AVERROR_INVALIDDATA;
++    }
++
++    new_ref = alloc_frame(s);
++    if (new_ref == NULL)
++        return AVERROR(ENOMEM);
++
++    *frame = new_ref->frame;
++    s->ref = new_ref;
++
++    /* Always a short-term reference; for output only if the slice asks for it */
++    new_ref->flags = s->sh.pic_output_flag ?
++        (HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF) :
++        HEVC_FRAME_FLAG_SHORT_REF;
++
++    new_ref->poc      = poc;
++    new_ref->sequence = s->seq_decode;
++
++    /* Cropping is taken from the SPS output window */
++    new_ref->frame->crop_left   = s->ps.sps->output_window.left_offset;
++    new_ref->frame->crop_right  = s->ps.sps->output_window.right_offset;
++    new_ref->frame->crop_top    = s->ps.sps->output_window.top_offset;
++    new_ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset;
++
++    return 0;
++}
++
++/*
++ * Try to output one frame from the DPB into out.
++ *
++ * Picks the frame with the lowest POC among those flagged for output in the
++ * current output sequence. Unless flush is set, nothing is output while the
++ * number of candidates does not exceed num_reorder_pics of the highest
++ * temporal layer.
++ *
++ * Returns 1 if a frame was referenced into out, 0 if there is nothing to
++ * output (yet), or a negative error code from av_frame_ref().
++ */
++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush)
++{
++ do {
++ int nb_output = 0;
++ int min_poc = INT_MAX;
++ int i, min_idx, ret;
++
++ // Discard pending output of earlier pictures when the slice header asks for it
++ if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ HEVCRpiFrame *frame = &s->DPB[i];
++ if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
++ frame->sequence == s->seq_output) {
++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
++ }
++ }
++ }
++
++ // Count the output candidates and remember the one with the lowest POC.
++ // min_idx is only valid when nb_output != 0.
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ HEVCRpiFrame *frame = &s->DPB[i];
++ if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
++ frame->sequence == s->seq_output) {
++ nb_output++;
++ if (frame->poc < min_poc || nb_output == 1) {
++ min_poc = frame->poc;
++ min_idx = i;
++ }
++ }
++ }
++
++ /* wait for more frames before output */
++ if (!flush && s->seq_output == s->seq_decode && s->ps.sps &&
++ nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics)
++ return 0;
++
++ if (nb_output) {
++ HEVCRpiFrame *frame = &s->DPB[min_idx];
++ if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1)
++ return 0;
++
++ ret = av_frame_ref(out, frame->frame);
++ // The output (and bumping) flags are dropped even if av_frame_ref() failed
++ if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
++ else
++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
++ if (ret < 0)
++ return ret;
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "Output frame with POC %d.\n", frame->poc);
++ return 1;
++ }
++
++ // Nothing left in this output sequence: move on to the next one
++ // (the counter wraps at 256) until it catches up with the decode sequence
++ if (s->seq_output != s->seq_decode)
++ s->seq_output = (s->seq_output + 1) & 0xff;
++ else
++ break;
++ } while (1);
++
++ return 0;
++}
++
++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s) /* Once the DPB occupancy reaches max_dec_pic_buffering, flag the lowest-POC output-pending frame(s) with HEVC_FRAME_FLAG_BUMPING */
++{
++ int dpb = 0;
++ int min_poc = INT_MAX;
++ int i;
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { /* count slots with any flag set in the output sequence, excluding the current picture */
++ HEVCRpiFrame *frame = &s->DPB[i];
++ if ((frame->flags) &&
++ frame->sequence == s->seq_output &&
++ frame->poc != s->poc) {
++ dpb++;
++ }
++ }
++
++ if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) { /* limit is taken from the highest sub-layer */
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ HEVCRpiFrame *frame = &s->DPB[i];
++ if ((frame->flags) &&
++ frame->sequence == s->seq_output &&
++ frame->poc != s->poc) {
++ if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) { /* '==': only frames whose sole flag is OUTPUT compete for min_poc */
++ min_poc = frame->poc;
++ }
++ }
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ HEVCRpiFrame *frame = &s->DPB[i];
++ if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
++ frame->sequence == s->seq_output &&
++ frame->poc <= min_poc) { /* '&' here: every OUTPUT frame at or below min_poc is bumped; if min_poc stayed INT_MAX that is all of them */
++ frame->flags |= HEVC_FRAME_FLAG_BUMPING;
++ }
++ }
++
++ dpb--; /* no effect: dpb is not read after this point */
++ }
++}
++
++/*
++ * Point s->refPicList at the reference picture list storage reserved for the
++ * current slice in s->rpl_tab.
++ *
++ * Returns 0 on success, AVERROR_INVALIDDATA when s->slice_idx lies outside
++ * the table.
++ */
++static int init_slice_rpl(HEVCRpiContext *s)
++{
++    if (s->slice_idx < s->rpl_tab_size) {
++        s->refPicList = &s->rpl_tab[s->slice_idx].refPicList[0];
++        return 0;
++    }
++
++    return AVERROR_INVALIDDATA;
++}
++
++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s) /* Build the slice's reference picture list(s) in s->refPicList from the frame RPS; returns 0 or a negative AVERROR */
++{
++ RpiSliceHeader *sh = &s->sh;
++
++ uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1; /* B slices build two lists, every other slice type one */
++ uint8_t list_idx;
++ int i, j, ret;
++
++ ret = init_slice_rpl(s);
++ if (ret < 0)
++ return ret;
++
++ if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
++ s->rps[LT_CURR].nb_refs)) { /* with no candidate at all the concatenation loop below could never fill a list */
++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ for (list_idx = 0; list_idx < nb_list; list_idx++) {
++ RefPicList rpl_tmp = { { 0 } };
++ RefPicList *rpl = &s->refPicList[list_idx];
++
++ /* The order of the elements is
++ * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
++ * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
++ int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF,
++ list_idx ? ST_CURR_BEF : ST_CURR_AFT,
++ LT_CURR };
++
++ /* concatenate the candidate lists for the current frame */
++ while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) { /* the candidate sequence is appended again until enough entries exist */
++ for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
++ RefPicList *rps = &s->rps[cand_lists[i]];
++ for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) { /* never writes past HEVC_MAX_REFS entries */
++ rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j];
++ rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j];
++ rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2; /* index 2 of cand_lists is LT_CURR */
++ rpl_tmp.nb_refs++;
++ }
++ }
++ }
++
++ /* reorder the references if necessary */
++ if (sh->rpl_modification_flag[list_idx]) {
++ for (i = 0; i < sh->nb_refs[list_idx]; i++) {
++ int idx = sh->list_entry_lx[list_idx][i];
++
++ if (idx >= rpl_tmp.nb_refs) {
++ av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ rpl->list[i] = rpl_tmp.list[idx];
++ rpl->ref[i] = rpl_tmp.ref[idx];
++ rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
++ rpl->nb_refs++; /* increments the stored count; assumes the rpl_tab entry starts with nb_refs == 0 - TODO confirm */
++ }
++ } else {
++ memcpy(rpl, &rpl_tmp, sizeof(*rpl));
++ rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]); /* the concatenation may overshoot; trim to the slice's count */
++ }
++
++ if (sh->collocated_list == list_idx &&
++ sh->collocated_ref_idx < rpl->nb_refs) /* record the collocated reference on the current frame when its index is valid */
++ s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
++ }
++
++ return 0;
++}
++
++static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc) /* Look up a DPB frame of the current decode sequence by POC; returns NULL when none matches */
++{
++ int i;
++ int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1; /* keeps only the POC LSB bits */
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { /* first pass: match on the POC LSBs only */
++ HEVCRpiFrame *ref = &s->DPB[i];
++ if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) { /* only slots that hold a buffer */
++ if ((ref->poc & LtMask) == poc)
++ return ref;
++ }
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { /* second pass: additionally accepts an exact full-POC match */
++ HEVCRpiFrame *ref = &s->DPB[i];
++ if (ref->frame->buf[0] && ref->sequence == s->seq_decode) {
++ if (ref->poc == poc || (ref->poc & LtMask) == poc)
++ return ref;
++ }
++ }
++
++ if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s)) /* the miss is not logged for CRA and BLA pictures */
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Could not find ref with POC %d\n", poc);
++ return NULL;
++}
++
++/*
++ * Replace the reference marking of a frame: both the long-term and short-term
++ * reference bits are cleared, then the bits given in flag are set. All other
++ * flag bits of the frame are left as they were.
++ */
++static void mark_ref(HEVCRpiFrame *frame, int flag)
++{
++    const int ref_bits = HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF;
++
++    frame->flags = (frame->flags & ~ref_bits) | flag;
++}
++
++static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc) /* Allocate a stand-in frame for a missing reference, filled with the mid-range sample value; returns NULL if allocation fails */
++{
++ HEVCRpiFrame *frame;
++ int i, x, y;
++
++ frame = alloc_frame(s);
++ if (!frame)
++ return NULL;
++
++ if (!s->ps.sps->pixel_shift) { /* pixel_shift == 0: fill every buffer byte-wise */
++ for (i = 0; frame->frame->buf[i]; i++)
++ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1),
++ frame->frame->buf[i]->size);
++ } else { /* otherwise write 16-bit values sample by sample, plane by plane */
++ for (i = 0; frame->frame->data[i]; i++)
++ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++)
++ for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) {
++ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x, /* NOTE(review): stride argument is the constant 1 for every plane i - confirm against frame_stride1() */
++ 1 << (s->ps.sps->bit_depth - 1));
++ }
++ }
++
++ frame->poc = poc;
++ frame->sequence = s->seq_decode;
++ frame->flags = 0; /* no flags yet; add_candidate_ref() marks the frame after this returns */
++
++ ff_hevc_rpi_progress_set_all_done(frame); /* presumably lets waiters treat the synthetic frame as fully decoded - confirm */
++
++ return frame;
++}
++
++/*
++ * Append the picture with the given POC to a candidate list and tag it in the
++ * DPB with ref_flag. A picture that cannot be found is replaced by a
++ * generated stand-in.
++ *
++ * Returns 0 on success, AVERROR_INVALIDDATA if the match is the current frame
++ * or the list is already full, AVERROR(ENOMEM) if no stand-in could be
++ * allocated.
++ */
++static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list,
++                             int poc, int ref_flag)
++{
++    HEVCRpiFrame *pic = find_ref_idx(s, poc);
++
++    if (pic == s->ref || list->nb_refs >= HEVC_MAX_REFS)
++        return AVERROR_INVALIDDATA;
++
++    if (!pic && !(pic = generate_missing_ref(s, poc)))
++        return AVERROR(ENOMEM);
++
++    list->list[list->nb_refs] = pic->poc;
++    list->ref[list->nb_refs]  = pic;
++    list->nb_refs            += 1;
++
++    mark_ref(pic, ref_flag);
++    return 0;
++}
++
++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s) /* Rebuild the per-frame reference picture sets in s->rps and re-mark DPB frames; returns 0 or the first error from add_candidate_ref() */
++{
++ const ShortTermRPS *short_rps = s->sh.short_term_rps;
++ const LongTermRPS *long_rps = &s->sh.long_term_rps;
++ RefPicList *rps = s->rps;
++ int i, ret = 0;
++
++ if (!short_rps) { /* no short-term RPS: only the first two set counts are reset, and no frame is re-marked or released */
++ rps[0].nb_refs = rps[1].nb_refs = 0;
++ return 0;
++ }
++
++ /* clear the reference flags on all frames except the current one */
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ HEVCRpiFrame *frame = &s->DPB[i];
++
++ if (frame == s->ref)
++ continue;
++
++ mark_ref(frame, 0);
++ }
++
++ for (i = 0; i < NB_RPS_TYPE; i++)
++ rps[i].nb_refs = 0;
++
++ /* add the short refs */
++ for (i = 0; i < short_rps->num_delta_pocs; i++) {
++ int poc = s->poc + short_rps->delta_poc[i]; /* short-term POCs are stored relative to the current picture */
++ int list;
++
++ if (!short_rps->used[i])
++ list = ST_FOLL;
++ else if (i < short_rps->num_negative_pics) /* the first num_negative_pics entries go to the "before" set */
++ list = ST_CURR_BEF;
++ else
++ list = ST_CURR_AFT;
++
++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF);
++ if (ret < 0)
++ goto fail;
++ }
++
++ /* add the long refs */
++ for (i = 0; i < long_rps->nb_refs; i++) {
++ int poc = long_rps->poc[i]; /* long-term POCs are used as stored, with no offset from s->poc */
++ int list = long_rps->used[i] ? LT_CURR : LT_FOLL;
++
++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF);
++ if (ret < 0)
++ goto fail;
++ }
++
++fail: /* also reached by falling through on success, with ret == 0 */
++ /* release any frames that are now unused */
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0);
++
++ return ret;
++}
++
++/*
++ * Count the reference pictures the current frame actually uses: every
++ * short-term RPS entry and every long-term RPS entry whose "used" marker is
++ * non-zero contributes one.
++ */
++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s)
++{
++    const ShortTermRPS *rps = s->sh.short_term_rps;
++    LongTermRPS *long_rps = &s->sh.long_term_rps;
++    int count = 0;
++    int i = 0;
++
++    if (rps) {
++        /* negative-delta entries first, then the rest up to num_delta_pocs */
++        while (i < rps->num_negative_pics) {
++            if (rps->used[i])
++                count++;
++            i++;
++        }
++        while (i < rps->num_delta_pocs) {
++            if (rps->used[i])
++                count++;
++            i++;
++        }
++    }
++
++    if (long_rps) {
++        for (i = 0; i < long_rps->nb_refs; i++) {
++            if (long_rps->used[i])
++                count++;
++        }
++    }
++
++    return count;
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevc_sei.c
+@@ -0,0 +1,368 @@
++/*
++ * HEVC Supplementary Enhancement Information messages
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2013 Vittorio Giovara
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "golomb.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++
++/*
++ * Parse a decoded picture hash payload.
++ *
++ * An 8-bit hash type is read, followed by one hash per colour component;
++ * three components are always assumed, whatever the chroma format. Only
++ * type 0 (MD5) is stored, in s->md5 with s->is_md5 set. Type 1 (16 bits per
++ * component) and type 2 (32 bits per component) are skipped, and any other
++ * type consumes nothing further. Always returns 0.
++ */
++static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb)
++{
++    const uint8_t hash_type = get_bits(gb, 8);
++    int plane;
++
++    for (plane = 0; plane < 3; plane++) {
++        switch (hash_type) {
++        case 0: { /* MD5: 16 bytes */
++            int n;
++
++            s->is_md5 = 1;
++            for (n = 0; n < 16; n++)
++                s->md5[plane][n] = get_bits(gb, 8);
++            break;
++        }
++        case 1: /* picture_crc */
++            skip_bits(gb, 16);
++            break;
++        case 2: /* picture_checksum */
++            skip_bits(gb, 32);
++            break;
++        default:
++            break;
++        }
++    }
++
++    return 0;
++}
++
++/*
++ * Parse a mastering display colour volume payload into *s: three display
++ * primaries and the white point as 16-bit (x, y) pairs, then the maximum and
++ * minimum luminance as 32-bit values. Always returns 0.
++ */
++static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb)
++{
++    int c;
++
++    /* Mastering primaries, one (x, y) pair each */
++    for (c = 0; c < 3; c++) {
++        uint16_t *xy = s->display_primaries[c];
++
++        xy[0] = get_bits(gb, 16);
++        xy[1] = get_bits(gb, 16);
++    }
++
++    /* White point (x, y) */
++    s->white_point[0] = get_bits(gb, 16);
++    s->white_point[1] = get_bits(gb, 16);
++
++    /* Luminance range of the mastering display: maximum first, then minimum */
++    s->max_luminance = get_bits_long(gb, 32);
++    s->min_luminance = get_bits_long(gb, 32);
++
++    /* This message arrives ahead of the first frame that refers to it, so
++     * the flag starts at 2 and is decremented on each IRAP access unit; that
++     * keeps it alive for the whole coded video sequence (between two IRAPs). */
++    s->present = 2;
++
++    return 0;
++}
++
++/*
++ * Parse a content light level payload into *s: two 16-bit fields, the maximum
++ * content light level followed by the maximum picture-average light level.
++ * Always returns 0.
++ */
++static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb)
++{
++    const unsigned int max_level = get_bits_long(gb, 16);
++    const unsigned int max_average = get_bits_long(gb, 16);
++
++    s->max_content_light_level     = max_level;
++    s->max_pic_average_light_level = max_average;
++
++    /* This message arrives ahead of the first frame that refers to it, so
++     * the flag starts at 2 and is decremented on each IRAP access unit; that
++     * keeps it alive for the whole coded video sequence (between two IRAPs). */
++    s->present = 2;
++
++    return 0;
++}
++
++static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb) // Parse a frame packing arrangement payload into *s; always returns 0
++{
++ get_ue_golomb_long(gb); // frame_packing_arrangement_id
++ s->present = !get_bits1(gb); // present is the inverse of the flag bit read here; the fields below exist only when it is 0
++
++ if (s->present) {
++ s->arrangement_type = get_bits(gb, 7);
++ s->quincunx_subsampling = get_bits1(gb);
++ s->content_interpretation_type = get_bits(gb, 6);
++
++ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
++ skip_bits(gb, 3);
++ s->current_frame_is_frame0_flag = get_bits1(gb);
++ // frame0_self_contained_flag, frame1_self_contained_flag
++ skip_bits(gb, 2);
++
++ if (!s->quincunx_subsampling && s->arrangement_type != 5) // grid positions are absent for quincunx sampling and for arrangement type 5
++ skip_bits(gb, 16); // frame[01]_grid_position_[xy]
++ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte
++ skip_bits1(gb); // frame_packing_arrangement_persistence_flag
++ }
++ skip_bits1(gb); // upsampled_aspect_ratio_flag
++ return 0;
++}
++
++/*
++ * Parse a display orientation payload into *s.
++ *
++ * The first bit decides whether any orientation data follows: when it is set
++ * s->present becomes 0 and nothing else is read; otherwise s->present becomes
++ * 1 and the flip flags and the 16-bit anticlockwise rotation are stored.
++ * Always returns 0.
++ */
++static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb)
++{
++    if (get_bits1(gb)) {
++        s->present = 0;
++        return 0;
++    }
++
++    s->present = 1;
++    s->hflip   = get_bits1(gb); /* hor_flip */
++    s->vflip   = get_bits1(gb); /* ver_flip */
++
++    s->anticlockwise_rotation = get_bits(gb, 16);
++    skip_bits1(gb); /* display_orientation_persistence_flag */
++
++    return 0;
++}
++
++static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps, // Parse a picture timing payload: only the field/frame structure is kept, the rest is skipped
++ void *logctx, int size)
++{
++ HEVCSEIPictureTiming *h = &s->picture_timing;
++ HEVCRpiSPS *sps;
++
++ if (!ps->sps_list[s->active_seq_parameter_set_id]) // no SPS stored under the active id; note the error code used is ENOMEM
++ return(AVERROR(ENOMEM));
++ sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data;
++
++ if (sps->vui.frame_field_info_present_flag) {
++ int pic_struct = get_bits(gb, 4);
++ h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN; // kept for every pic_struct value not listed below
++ if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) {
++ av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n");
++ h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
++ } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) {
++ av_log(logctx, AV_LOG_DEBUG, "TOP Field\n");
++ h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
++ }
++ get_bits(gb, 2); // source_scan_type
++ get_bits(gb, 1); // duplicate_flag
++ skip_bits1(gb); // eighth bit: the reads in this branch total 4 + 2 + 1 + 1 = 8 bits
++ size--; // that one byte is now consumed
++ }
++ skip_bits_long(gb, 8 * size); // the remainder of the payload is not interpreted
++
++ return 0;
++}
++
++static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb, // Append closed-caption triplets from a registered user data payload to s->a53_caption
++ int size)
++{
++ int flag;
++ int user_data_type_code;
++ int cc_count;
++
++ if (size < 3) // the fixed part read below is three bytes
++ return AVERROR(EINVAL);
++
++ user_data_type_code = get_bits(gb, 8);
++ if (user_data_type_code == 0x3) { // only type code 3 is interpreted as caption data
++ skip_bits(gb, 1); // reserved
++
++ flag = get_bits(gb, 1); // process_cc_data_flag
++ if (flag) { // when the flag is clear nothing further is read from the payload
++ skip_bits(gb, 1);
++ cc_count = get_bits(gb, 5);
++ skip_bits(gb, 8); // reserved
++ size -= 2;
++
++ if (cc_count && size >= cc_count * 3) { // store only when the payload has room for all triplets
++ const uint64_t new_size = (s->a53_caption_size + cc_count
++ * UINT64_C(3)); // computed in 64 bits so the INT_MAX check below cannot itself overflow
++ int i, ret;
++
++ if (new_size > INT_MAX) // a53_caption_size is an int
++ return AVERROR(EINVAL);
++
++ /* Allow merging of the cc data from two fields. */
++ ret = av_reallocp(&s->a53_caption, new_size);
++ if (ret < 0)
++ return ret;
++
++ for (i = 0; i < cc_count; i++) { // three bytes per caption entry, appended after any existing data
++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
++ }
++ skip_bits(gb, 8); // marker_bits
++ }
++ }
++ } else {
++ int i;
++ for (i = 0; i < size - 1; i++) // the type code byte is already consumed, hence size - 1
++ skip_bits(gb, 8);
++ }
++
++ return 0;
++}
++
++/*
++ * Parse a "user data registered by ITU-T T.35" payload.
++ *
++ * The header is a country code byte (followed by one extension byte when the
++ * code is 0xFF), two bytes that are skipped, and a 32-bit user identifier.
++ * Identifier 'GA94' is handed to the closed-caption parser; any other
++ * identifier has the rest of its payload skipped.
++ *
++ * size is the payload length in bytes.
++ *
++ * Returns 0 on success, AVERROR(EINVAL) when the payload is too short for
++ * the header it announces, or the closed-caption parser's result.
++ */
++static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb,
++                                                         int size)
++{
++    uint32_t country_code;
++    uint32_t user_identifier;
++
++    /* country code (1) + two skipped bytes (2) + user identifier (4) */
++    if (size < 7)
++        return AVERROR(EINVAL);
++    size -= 7;
++
++    country_code = get_bits(gb, 8);
++    if (country_code == 0xFF) {
++        /* The extension byte is not covered by the 7-byte minimum above.
++         * Without this check size could reach -1, and the default branch
++         * below would pass a negative count to skip_bits_long(). */
++        if (size < 1)
++            return AVERROR(EINVAL);
++
++        skip_bits(gb, 8);
++        size--;
++    }
++
++    skip_bits(gb, 8);
++    skip_bits(gb, 8);
++
++    user_identifier = get_bits_long(gb, 32);
++
++    switch (user_identifier) {
++    case MKBETAG('G', 'A', '9', '4'):
++        return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size);
++    default:
++        skip_bits_long(gb, size * 8);
++        break;
++    }
++    return 0;
++}
++
++/*
++ * Parse an active parameter sets payload.
++ *
++ * Only the first active SPS id is kept, in s->active_seq_parameter_set_id.
++ * The VPS id, the two flags and any further SPS ids are read and discarded.
++ *
++ * Returns 0 on success, AVERROR_INVALIDDATA when num_sps_ids_minus1 is
++ * outside 0..15 or the SPS id is not below HEVC_MAX_SPS_COUNT.
++ */
++static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx)
++{
++    int num_sps_ids_minus1;
++    int i;
++    unsigned active_seq_parameter_set_id;
++
++    get_bits(gb, 4); // active_video_parameter_set_id
++    get_bits(gb, 1); // self_contained_cvs_flag
++    get_bits(gb, 1); // no_parameter_set_update_flag
++    num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
++
++    if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
++        av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
++        return AVERROR_INVALIDDATA;
++    }
++
++    active_seq_parameter_set_id = get_ue_golomb_long(gb);
++    if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) {
++        /* the id is unsigned, so it is printed with %u: large out-of-range
++         * values would otherwise show up as negative numbers */
++        av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %u invalid\n", active_seq_parameter_set_id);
++        return AVERROR_INVALIDDATA;
++    }
++    s->active_seq_parameter_set_id = active_seq_parameter_set_id;
++
++    for (i = 1; i <= num_sps_ids_minus1; i++)
++        get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
++
++    return 0;
++}
++
++/*
++ * Parse an alternative transfer characteristics payload: a single 8-bit
++ * preferred transfer characteristics value. Marks the message as present and
++ * always returns 0.
++ */
++static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb)
++{
++    const unsigned int preferred = get_bits(gb, 8);
++
++    s->preferred_transfer_characteristics = preferred;
++    s->present = 1;
++
++    return 0;
++}
++
++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps, // Dispatch one prefix SEI payload by type; returns the chosen parser's result
++ int type, int size)
++{
++ switch (type) {
++ case 256: // Mismatched value from HM 8.1
++ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
++ case HEVC_SEI_TYPE_FRAME_PACKING:
++ return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb);
++ case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
++ return decode_nal_sei_display_orientation(&s->display_orientation, gb);
++ case HEVC_SEI_TYPE_PICTURE_TIMING:
++ return decode_nal_sei_pic_timing(s, gb, ps, logctx, size);
++ case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
++ return decode_nal_sei_mastering_display_info(&s->mastering_display, gb);
++ case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
++ return decode_nal_sei_content_light_info(&s->content_light, gb);
++ case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
++ return decode_nal_sei_active_parameter_sets(s, gb, logctx);
++ case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
++ return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size);
++ case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
++ return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb);
++ default: // unhandled types are logged at debug level and skipped whole
++ av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
++ skip_bits_long(gb, 8 * size); // size is in bytes
++ return 0;
++ }
++}
++
++/*
++ * Handle one suffix SEI payload. Only the decoded picture hash is parsed;
++ * every other type is logged at debug level and its size bytes are skipped.
++ *
++ * Returns the hash parser's result, or 0 for a skipped payload.
++ */
++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++                                 int type, int size)
++{
++    if (type == HEVC_SEI_TYPE_DECODED_PICTURE_HASH)
++        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
++
++    av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type);
++    skip_bits_long(gb, 8 * size);
++
++    return 0;
++}
++
++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s, // Read one SEI message header (type and size) and dispatch its payload
++ const HEVCRpiParamSets * const ps, const int nal_unit_type)
++{
++ int payload_type = 0;
++ int payload_size = 0;
++ int byte = 0xFF; // primed so the first loop is entered
++ av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
++
++ while (byte == 0xFF) { // the type is a sum of bytes; each 0xFF means another byte follows
++ if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255) // needs this byte plus at least one more; also keeps the sum from overflowing
++ return AVERROR_INVALIDDATA;
++ byte = get_bits(gb, 8);
++ payload_type += byte;
++ }
++ byte = 0xFF;
++ while (byte == 0xFF) { // the size is accumulated the same way
++ if (get_bits_left(gb) < 8 + 8LL*payload_size) // the size byte and the payload announced so far must still fit; 8LL keeps the product in 64 bits
++ return AVERROR_INVALIDDATA;
++ byte = get_bits(gb, 8);
++ payload_size += byte;
++ }
++ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
++ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
++ } else { /* nal_unit_type == NAL_SEI_SUFFIX */
++ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
++ }
++}
++
++/*
++ * Tell whether another SEI message follows in the NAL unit: there must be
++ * bits left, and the next byte must not be 0x80. Returns 1 or 0.
++ */
++static int more_rbsp_data(GetBitContext *gb)
++{
++    if (get_bits_left(gb) <= 0)
++        return 0;
++
++    return show_bits(gb, 8) != 0x80;
++}
++
++/*
++ * Decode every SEI message contained in one SEI NAL unit.
++ *
++ * type is the NAL unit type, which selects prefix or suffix handling. At
++ * least one message is always decoded; decoding continues while
++ * more_rbsp_data() reports further data.
++ *
++ * Returns 1 once all messages are parsed, or the first negative error code.
++ */
++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++                               const HEVCRpiParamSets *ps, int type)
++{
++    for (;;) {
++        const int err = decode_nal_sei_message(gb, logctx, s, ps, type);
++
++        if (err < 0)
++            return err;
++        if (!more_rbsp_data(gb))
++            return 1;
++    }
++}
++
++/*
++ * Drop the closed-caption data accumulated in the SEI context: the buffer is
++ * freed (and its pointer cleared by av_freep) and the size is reset to zero.
++ * No other SEI state is touched.
++ */
++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s)
++{
++    HEVCSEIA53Caption *captions = &s->a53_caption;
++
++    av_freep(&captions->a53_caption);
++    captions->a53_caption_size = 0;
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevc_sei.h
+@@ -0,0 +1,135 @@
++/*
++ * HEVC Supplementary Enhancement Information messages
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_SEI_H
++#define AVCODEC_RPI_HEVC_SEI_H
++
++#include <stdint.h>
++
++#include "libavutil/md5.h"
++
++#include "get_bits.h"
++
++/**
++ * SEI message types
++ *
++ * The values are the payload type numbers read from the SEI message header
++ * and matched by the prefix and suffix payload dispatchers.
++ */
++typedef enum {
++ HEVC_SEI_TYPE_BUFFERING_PERIOD = 0,
++ HEVC_SEI_TYPE_PICTURE_TIMING = 1,
++ HEVC_SEI_TYPE_PAN_SCAN_RECT = 2,
++ HEVC_SEI_TYPE_FILLER_PAYLOAD = 3,
++ HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4,
++ HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5,
++ HEVC_SEI_TYPE_RECOVERY_POINT = 6,
++ HEVC_SEI_TYPE_SCENE_INFO = 9,
++ HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15,
++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17,
++ HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19,
++ HEVC_SEI_TYPE_POST_FILTER_HINT = 22,
++ HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23,
++ HEVC_SEI_TYPE_FRAME_PACKING = 45,
++ HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47,
++ HEVC_SEI_TYPE_SOP_DESCRIPTION = 128,
++ HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129,
++ HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130,
++ HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131,
++ HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132, /* the prefix dispatcher additionally accepts the value 256 for this payload */
++ HEVC_SEI_TYPE_SCALABLE_NESTING = 133,
++ HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134,
++ HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137,
++ HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144,
++ HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
++} HEVC_SEI_Type;
++
++typedef struct HEVCSEIPictureHash { /* result of the decoded picture hash SEI */
++ uint8_t md5[3][16]; /* one 16-byte MD5 per colour component index 0..2 */
++ uint8_t is_md5; /* set to 1 when the parsed hash type was 0 (MD5) */
++} HEVCSEIPictureHash;
++
++typedef struct HEVCSEIFramePacking { /* fields kept from the frame packing arrangement SEI */
++ int present; /* inverse of the flag bit that follows the arrangement id; the fields below are parsed only when this is 1 */
++ int arrangement_type; /* 7-bit value */
++ int content_interpretation_type; /* 6-bit value */
++ int quincunx_subsampling; /* 1-bit flag */
++ int current_frame_is_frame0_flag; /* 1-bit flag */
++} HEVCSEIFramePacking;
++
++typedef struct HEVCSEIDisplayOrientation { /* fields kept from the display orientation SEI */
++ int present; /* inverse of the first payload bit; the fields below are parsed only when this is 1 */
++ int anticlockwise_rotation; /* raw 16-bit value from the bitstream */
++ int hflip, vflip; /* hor_flip and ver_flip bits */
++} HEVCSEIDisplayOrientation;
++
++typedef struct HEVCSEIPictureTiming { /* field kept from the picture timing SEI */
++ int picture_struct; /* AV_PICTURE_STRUCTURE_UNKNOWN, _TOP_FIELD or _BOTTOM_FIELD, derived from pic_struct */
++} HEVCSEIPictureTiming;
++
++typedef struct HEVCSEIA53Caption { /* closed-caption bytes collected from registered user data SEI */
++ int a53_caption_size; /* number of valid bytes in a53_caption; grows by three per caption entry */
++ uint8_t *a53_caption; /* buffer grown with av_reallocp() and released by ff_hevc_rpi_reset_sei() */
++} HEVCSEIA53Caption;
++
++typedef struct HEVCSEIMasteringDisplay { /* fields kept from the mastering display info SEI */
++ int present; /* set to 2 when the message is parsed; per the parser's comment it is decremented on IRAP access units */
++ uint16_t display_primaries[3][2]; /* three primaries, each as two 16-bit values read in bitstream order */
++ uint16_t white_point[2]; /* white point (x, y) */
++ uint32_t max_luminance; /* raw 32-bit value from the bitstream */
++ uint32_t min_luminance; /* raw 32-bit value from the bitstream */
++} HEVCSEIMasteringDisplay;
++
++typedef struct HEVCSEIContentLight { /* fields kept from the content light level SEI */
++ int present; /* set to 2 when the message is parsed; per the parser's comment it is decremented on IRAP access units */
++ uint16_t max_content_light_level; /* first 16-bit field of the payload */
++ uint16_t max_pic_average_light_level; /* second 16-bit field of the payload */
++} HEVCSEIContentLight;
++
++typedef struct HEVCSEIAlternativeTransfer { /* fields kept from the alternative transfer characteristics SEI */
++ int present; /* set to 1 when the message is parsed */
++ int preferred_transfer_characteristics; /* 8-bit value from the payload */
++} HEVCSEIAlternativeTransfer;
++
++typedef struct HEVCSEIContext { /* aggregate of all SEI state filled in by ff_hevc_rpi_decode_nal_sei() */
++ HEVCSEIPictureHash picture_hash;
++ HEVCSEIFramePacking frame_packing;
++ HEVCSEIDisplayOrientation display_orientation;
++ HEVCSEIPictureTiming picture_timing;
++ HEVCSEIA53Caption a53_caption; /* the only member cleared by ff_hevc_rpi_reset_sei() */
++ HEVCSEIMasteringDisplay mastering_display;
++ HEVCSEIContentLight content_light;
++ int active_seq_parameter_set_id; /* written by the active parameter sets SEI; used by picture timing parsing to index the SPS list */
++ HEVCSEIAlternativeTransfer alternative_transfer;
++} HEVCSEIContext;
++
++struct HEVCRpiParamSets;
++
++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, /* decodes every SEI message of one NAL unit into *s */
++ const struct HEVCRpiParamSets *ps, int type); /* type is the NAL unit type (prefix or suffix SEI); returns 1 on success or a negative error code */
++
++/**
++ * Reset SEI values that are stored on the Context.
++ * e.g. Caption data that was extracted during NAL
++ * parsing.
++ *
++ * @param s HEVCSEIContext.
++ */
++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s);
++
++#endif /* AVCODEC_RPI_HEVC_SEI_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.c
+@@ -0,0 +1,1537 @@
++#include "rpi_hevc_shader.h"
++
++#ifdef _MSC_VER
++ #include <stdint.h>
++ /* cast through uintptr_t to avoid warnings */
++ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
++#else
++ #define POINTER_TO_UINT(X) ((unsigned int)(X))
++#endif
++
++#ifdef __cplusplus
++extern "C" { /* the types are probably wrong... */
++#endif
++#ifdef __cplusplus
++}
++#endif
++
++#ifdef _MSC_VER
++__declspec(align(8))
++#elif defined(__GNUC__)
++__attribute__((aligned(8)))
++#endif
++unsigned int ff_hevc_rpi_shader[] = {
++// ::mc_setup_c_q0
++// ::mc_start
++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_c_qn
++/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
++/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
++/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
++/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
++/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
++// :1
++/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c_p
++/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
++/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
++/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c_p_l1
++/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
++/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
++/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
++/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c_b
++/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
++/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
++/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
++/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
++/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
++/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
++/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
++/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
++/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
++/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
++/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
++/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
++/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
++/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
++/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
++/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
++/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
++/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
++/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
++/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
++/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
++// :1
++/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
++/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
++/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
++/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
++/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
++/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
++/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
++/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
++/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
++/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
++/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
++/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
++/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
++/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
++/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
++/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
++/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
++/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
++/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
++/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
++/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
++/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
++/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
++/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
++/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_sync_q0
++/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q1
++/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q2
++/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q3
++/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync_q4
++/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q5
++/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q6
++/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q7
++/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync_q8
++/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q9
++/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q10
++/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q11
++/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_qn
++// ::mc_exit_y_qn
++/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_q0
++// ::mc_exit_y_q0
++/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_setup_y_q0
++/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_y_qn
++/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
++/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
++/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
++/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
++/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1
++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
++/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
++/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
++/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
++/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
++// :1
++/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++// :per_block_setup_8
++/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
++/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
++/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
++/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
++/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
++/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
++/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
++/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
++/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
++/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
++/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
++/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
++/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
++/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
++/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
++/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
++/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
++/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
++/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
++/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
++/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
++/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
++/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
++/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
++/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
++/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
++/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
++// ::mc_filter_y_pxx
++/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
++/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
++/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
++/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
++/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
++/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_bxx
++/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
++/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
++/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
++/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
++/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
++/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
++/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
++/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
++/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
++/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
++/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
++/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_p00
++/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
++/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
++/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
++/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
++/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
++/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
++/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
++/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
++/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
++/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
++// :1
++/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
++/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
++/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_b00
++/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1
++/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
++/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
++/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// :1
++/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
++/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
++/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
++/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
++/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
++/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
++/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_setup_c10_q0
++/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_c10_qn
++/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
++/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
++/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
++/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
++/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
++/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
++/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
++/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
++/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
++/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
++// :1
++/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c10_p
++/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
++/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_p_l1
++/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
++/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
++/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_b
++/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
++/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
++/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
++/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
++/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
++/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
++/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
++/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
++/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
++/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
++/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
++/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
++/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
++/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
++/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
++/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
++/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
++/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
++/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
++/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
++/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
++// :1
++/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
++/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
++/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
++/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
++/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
++/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
++/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
++/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
++/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
++/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
++/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
++/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
++/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
++/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
++/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
++/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
++/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
++/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
++/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
++/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
++/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
++/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
++/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
++/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
++/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_sync10_q0
++/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
++/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q1
++/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q2
++/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q3
++/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q4
++/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q5
++/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q6
++/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q7
++/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q8
++/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q9
++/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q10
++/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q11
++/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_q0
++// ::mc_exit_y10_q0
++/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_qn
++// ::mc_exit_y10_qn
++/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_setup_y10_q0
++/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_y10_qn
++/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
++/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
++/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
++/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
++/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
++/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
++/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
++/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
++/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
++/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
++// :1
++/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++// :per_block_setup_10
++/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
++/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
++/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
++/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
++/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
++/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
++/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
++/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
++/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
++/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
++/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
++/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
++/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
++/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
++/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
++/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
++/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
++/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
++/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
++/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
++/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
++/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
++/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
++/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
++/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
++/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
++/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
++/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
++// ::mc_filter_y10_pxx
++/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
++/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
++/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
++/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
++/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
++/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
++/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y10_p00
++/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
++/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
++/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
++/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
++/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
++/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
++/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
++/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
++/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
++/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
++// :1
++/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
++/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
++/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y10_bxx
++/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
++/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
++/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
++/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
++/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
++/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
++/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
++/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
++/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
++/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
++/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
++/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
++/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y10_b00
++/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
++/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1
++/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
++/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
++/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// :1
++/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
++/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
++/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
++/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
++/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
++/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
++/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_end
++};
++#ifdef __HIGHC__
++#pragma Align_to(8, ff_hevc_rpi_shader)
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.h
+@@ -0,0 +1,63 @@
++#ifndef rpi_hevc_shader_H /* NOTE(review): looks generated from the ::labels in rpi_hevc_shader.qasm - regenerate rather than hand-edit; confirm */
++#define rpi_hevc_shader_H
++
++extern unsigned int ff_hevc_rpi_shader[]; /* QPU code image: two array elements per 64-bit instruction */
++
++#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) /* all offsets are in array elements, i.e. listing byte address / 4 */
++#define mc_start (ff_hevc_rpi_shader + 0)
++#define mc_setup_c_qn (ff_hevc_rpi_shader + 2)
++#define mc_filter_c_p (ff_hevc_rpi_shader + 134)
++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260)
++#define mc_filter_c_b (ff_hevc_rpi_shader + 386)
++#define mc_sync_q0 (ff_hevc_rpi_shader + 580)
++#define mc_sync_q1 (ff_hevc_rpi_shader + 598)
++#define mc_sync_q2 (ff_hevc_rpi_shader + 610)
++#define mc_sync_q3 (ff_hevc_rpi_shader + 622)
++#define mc_sync_q4 (ff_hevc_rpi_shader + 634)
++#define mc_sync_q5 (ff_hevc_rpi_shader + 652)
++#define mc_sync_q6 (ff_hevc_rpi_shader + 664)
++#define mc_sync_q7 (ff_hevc_rpi_shader + 676)
++#define mc_sync_q8 (ff_hevc_rpi_shader + 688)
++#define mc_sync_q9 (ff_hevc_rpi_shader + 706)
++#define mc_sync_q10 (ff_hevc_rpi_shader + 718)
++#define mc_sync_q11 (ff_hevc_rpi_shader + 730)
++#define mc_exit_c_qn (ff_hevc_rpi_shader + 742)
++#define mc_exit_y_qn (ff_hevc_rpi_shader + 742)
++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760)
++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760)
++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780)
++#define mc_setup_y_qn (ff_hevc_rpi_shader + 782)
++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014)
++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140)
++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272)
++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358)
++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432)
++#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434)
++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562)
++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684)
++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806)
++#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996)
++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014)
++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026)
++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038)
++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050)
++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068)
++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080)
++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092)
++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104)
++#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122)
++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134)
++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146)
++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158)
++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158)
++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178)
++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178)
++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196)
++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198)
++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440)
++#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566)
++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654)
++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786)
++#define mc_end (ff_hevc_rpi_shader + 2860) /* one past the last instruction word: (mc_end - mc_start) is the image length in elements */
++
++#endif /* rpi_hevc_shader_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.qasm
+@@ -0,0 +1,1850 @@
++# Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++# All rights reserved.
++#
++# Redistribution and use in source and binary forms, with or without
++# modification, are permitted provided that the following conditions are met:
++# * Redistributions of source code must retain the above copyright
++# notice, this list of conditions and the following disclaimer.
++# * Redistributions in binary form must reproduce the above copyright
++# notice, this list of conditions and the following disclaimer in the
++# documentation and/or other materials provided with the distribution.
++# * Neither the name of the copyright holder nor the
++# names of its contributors may be used to endorse or promote products
++# derived from this software without specific prior written permission.
++#
++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++#
++# Written by Peter de Rivaz, John Cox
++
++
++
++# Inter pred asm
++#
++# Logic here should be good to 14 bits without modification
++# but only 8 & 10 are currently instantiated & tested
++# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow
++# in _p00 & _b00
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems ra regs can only be rotated through their
++# local 4. As it happens this is what is wanted here as we do not want the
++# constants from the other half of the calc.
++
++# Number limits in P/B calculation
++#
++# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier
++# we offset our intermediates s.t. they always end up +ve before the next
++# multiply (may be -ve whilst summing but that doesn't matter).
++#
++# Range calc for up to 14 bits (Y-B pred):
++#
++# denom: [0, 7]
++# bmax = (1 << bits) - 1
++# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1]
++#
++# wt_mul: [-128, 255]
++# wt_off = off * 2 + 1: [-bmax, bmax]
++#
++# pel: [0, bmax]
++# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff]
++# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e]
++# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6]
++# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4]
++# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2):
++# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000]
++#
++# This all looks good and is mostly bit depth independent - and as we manage
++# to do unsigned multiplies everywhere (now) this should be good for any bit
++# depth up to 14 (we could probably do 16 - but that requires a few tweaks
++# to the shifts we don't currently have logic for)
++
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# As the test for read-next is in the main part of the Luma loop (rather than
++# the preload FIFO part) we are limited to min_luma_height - 1
++# Min_luma_height is 4 so we can only have a preload of 3
++# Beware that min_chroma_height (and width) is 2 so we can't do the same trick
++# in chroma without abandoning preload pretty much entirely (which would be bad)
++#
++# Timing tests vs preload of 4 suggests this doesn't hurt us much
++# Could have preread 4 for Chroma but when tested it didn't help
++
++.set PREREAD, 3
++
++# Offset added (effectively) at the exit of the H FIR filter
++# This is enough to force the result +ve
++# Is good if it is a power of 2 as that allows for >> without loss
++#
++# Worst case for a single Y FIR is *-22 so we need an offset of 256*22
++# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00
++# Round up to next power of 2
++
++.set FIR_OFFSET, 0x4000
++
++# Block heights - 8 & 16 are the only heights we currently support (the _8/_16 name suffix is the pel storage size in bits, not the height)
++
++.set C_BLK_HEIGHT_8, 16
++.set C_BLK_HEIGHT_16, 8
++.set Y_BLK_HEIGHT_8, 16
++.set Y_BLK_HEIGHT_16, 8
++
++# QPU counts - depend on block size
++# If we have a 2-byte format & block_size > 8 then can only afford
++# 8 QPUs
++# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
++
++.set N_QPU_8, 12
++.set N_QPU_16, 12
++
++# Value to add to the weight multiplier to convert it into an unsigned value
++# Should be power of two for convenience
++
++.set LOG2_MUL_ADD, 14
++.set MUL_ADD, (1 << LOG2_MUL_ADD)
++
++# Fixed denom (max that it can be set to)
++.set DENOM, 7
++
++# register allocation
++#
++
++# ra0-3
++# Used as temp and may be loop filter coeffs (split into .8s)
++# or temp in loop. Check usage on an individual basis.
++
++# ra4-11
++# V FIFO / temp / free
++
++# -- free -- ra12
++
++# -- free -- ra13
++
++# -- free -- ra14
++
++# -- free -- ra15
++
++# uniform: width:height
++.set ra_width_height, ra16
++.set ra_width, ra16.16b
++.set ra_height, ra16.16a
++
++# y:y2 same layout as y_y2_next so we can update both together
++.set ra_y_y2, ra17
++.set ra_y2, ra17.16a
++.set ra_y, ra17.16b
++
++# uniform: L1 weight (U on left, V on right)
++# Only used in Y B
++.set ra_wt_off_mul_l1, ra18
++.set ra_wt_off_l1, ra18.16b
++.set ra_wt_mul_l1, ra18.16a
++
++# y_next:y2_next same layout as y_y2 so we can update both together
++.set ra_y_y2_next, ra19
++.set ra_y_next, ra19.16b
++.set ra_y2_next, ra19.16a
++
++# Setup: consts - subdivide a single register
++.set ra_kff800100, ra20
++.set ra_k256, ra20.16a
++.set ra_k0, ra20.8a
++.set ra_k1, ra20.8b
++.set ra_k128, ra20.8c
++.set ra_k255, ra20.8d
++
++# Loop: xshifts
++.set ra_xshift, ra21.16a
++.set ra_xshift_next, ra21.16b
++
++# Loop var: L0 weight (U on left, V on right)
++# _off_ is not used in loop as we want to modify it before use
++.set ra_wt_off_mul_l0, ra22
++.set ra_wt_mul_l0, ra22.16a
++.set ra_wt_off_l0, ra22.16b
++
++# Max pel value (for 8 bit we can get away with sat ops but not 9+)
++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
++# 2nd byte but as the source should never be > 3 there 0x3ff should do
++.set ra_blk_height_pmax, ra23
++.set ra_pmax, ra23.16a
++.set ra_blk_height, ra23.8c
++# --free -- ra23.8d
++
++# Loop: src frame base (L0)
++.set ra_base, ra24
++
++# Misc offsets
++.set ra_fir_off_val_wt_den_p7, ra25
++.set ra_wt_den_p7, ra25.8a
++# -- free -- ra25.8b
++.set ra_fir_off_val, ra25.16b
++
++# As it happens these constants are the same
++.if FIR_OFFSET == MUL_ADD
++# Weight multiplier unsigned add
++.set ra_kmul_add, ra_fir_off_val
++.else
++.error "FIR_OFFSET != MUL_ADD: Need new register & init"
++.endif
++
++# Loop: next src frame base (L0)
++.set ra_base_next, ra26
++
++# Loop: height<<23 + width<<16 + vdw_setup_0
++.set ra_dma0, ra27
++
++# Loop: destination address
++.set ra_dest, ra28
++
++# Setup: Dup of rb_ef
++# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul
++# (top bits are ignored by mul24)
++.set ra_ef, ra29
++
++# Use an even numbered register as a link register to avoid corrupting flags
++.set ra_link, ra30
++
++# -- free -- ra31
++
++.set rb_xshift2, rb0
++.set rb_xshift2_next, rb1
++
++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
++.set rb_elem_x, rb2
++
++# El Flags
++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
++# Duped into ra_ef as sometimes that is easier to use
++.set rb_ef, rb3
++
++# rb4-11
++# Loop: V filter FIFO or V filter coeff
++
++# Loop var: offset to add before shift (round + weighting offsets)
++# Exact value varies by loop
++.set rb_wt_off, rb12
++
++# -- free -- rb13
++
++# -- free -- rb14
++
++# Loop: src frame base (L1)
++.set rb_base2, rb15
++
++# Line pitch (128 for sand128)
++.set rb_pitch, rb16
++
++# Loop count - 2 (set up TMU for next xfer)
++.set rb_i_tmu, rb17
++
++# Loop count for min(height, 16)
++# Y will reset & loop again if height > 16
++.set rb_lcount, rb18
++
++# frame_base2_next
++.set rb_base2_next, rb19
++
++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
++# offset to the slice
++.set rb_xpitch, rb20
++
++# These 3 consts each save 1 instruction in Y loop setup
++# so whilst they are worthwhile they should be the 1st to die if we need
++# another b reg
++.set rb_y_coeffs_2, rb21 # 0x050b0a00
++.set rb_y_coeffs_3, rb22 # 0x11283a40
++.set rb_y_coeffs_5, rb23 # 0x0a0b0500
++
++# Setup: 0xff (8-bit) / 0xffff (9+ bit)
++.set rb_pmask, rb24
++
++# vdw_setup_1(dst_pitch)
++.set rb_dma1_base, rb25
++
++# Setup: pic width - 1
++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
++.set rb_max_x, rb26
++
++# vdw_setup_0 (depends on QPU number)
++.set rb_dma0_base, rb27
++
++# Setup: vw_setup value to reset VPM write pointer
++.set rb_vpm_init, rb28
++
++# Loop: vdw_setup_1(dst_pitch-width) = stride
++.set rb_dma1, rb29
++
++# Setup: pic_height - 1
++.set rb_max_y, rb30
++
++# Setup: FIR H offset
++.set rb_fir_off_h, rb31
++
++
++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
++.set i_shift16, -16 # -16 & 31 = 16
++.set i_shift21, -11 # -11 & 31 = 21
++.set i_shift23, -9 # -9 & 31 = 23
++.set i_shift30, -2 # -2 & 31 = 30
++
++# Much of the setup code is common between Y & C
++# Macros that express this - obviously these can't be overlapped
++# so are probably unsuitable for loop code
++
++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma # Writes per-QPU VPM setup to r_vpm and VDW setup-0 base to r_dma; clobbers r0, r1, r2
++ mov r2, qpu_num # r2 = qpu_num, read once and reused by both variants below
++.if v_bit_depth <= 8
++ # 8 bit version
++ asr r1, r2, 2 # r1 = qpu_num >> 2
++ shl r1, r1, 6 # r1 = (qpu_num >> 2) << 6
++ and r0, r2, 3 # r0 = qpu_num & 3
++ or r0, r0, r1 # r0 = per-QPU offset, added to the VPM setup word below
++
++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++ add r_vpm, r0, r1 # VPM 8bit storage
++
++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++ shl r0, r0, 5 # move the per-QPU offset into position for the DMA setup word (added after .endif)
++
++.else
++ # 16 bit version
++ # Limited to 8 QPUs if blk height > 8
++ asr r1, r2, 1 # r1 = qpu_num >> 1
++.if v_blk_height <= 8
++ shl r1, r1, 4 # r1 = (qpu_num >> 1) << 4
++.else
++ shl r1, r1, 5 # r1 = (qpu_num >> 1) << 5 - double the spacing used for blk height <= 8
++.endif
++ and r0, r2, 1 # r0 = qpu_num & 1
++ or r0, r0, r1 # r0 = per-QPU offset, added to the VPM setup word below
++
++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
++ add r_vpm, r0, r1 # VPM 16bit storage
++
++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
++ shl r0, r0, 6 # the 3 + 3 shift described in the comment above
++.endif
++ add r_dma, r0, r1 # DMA out
++.endm
++
++
++.macro m_setup_q0 # Single-instruction setup step; NOTE(review): name suggests it is only for the q0 entry points - confirm at call sites
++ srel -, 12 # semaphore release on semaphore 12, result discarded; presumably pairs with the mc_sync code - TODO confirm
++.endm
++
++# Code start label
++::mc_start
++
++################################################################################
++# mc_setup_c
++#
++# typedef struct qpu_mc_pred_c_s_s {
++# int16_t y;
++# int16_t x;
++# uint32_t base;
++# uint32_t pic_cw; // C Width (== Y width / 2)
++# uint32_t pic_ch; // C Height (== Y Height / 2)
++# uint32_t stride2;
++# uint32_t stride1;
++# uint32_t wdenom;
++# int16_t y2;
++# int16_t x2;
++# uint32_t base2;
++# uint32_t next_fn;
++# } qpu_mc_pred_c_s_t;
++
++.macro m_setup_c, v_bit_depth
++# Chroma setup for a picture of v_bit_depth bits: loads picture limits, pitches and constants from the uniform stream, computes the starting TMU0/TMU1 fetch addresses and this QPU's VPM/DMA bases, queues PREREAD texture requests on each TMU, then branches to the address in the final (link) uniform
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_pmask, 0xff
++.set v_blk_height, C_BLK_HEIGHT_8
++.else
++.set v_x_shift, 2
++.set v_pmask, 0xffff
++.set v_blk_height, C_BLK_HEIGHT_16
++.endif
++# Uniforms are consumed in this order: x_y, base, pic_cw, pic_ch, stride2, stride1, x2_y2, base2, next_fn. NOTE(review): nothing is read for the wdenom field listed in the struct comment above - confirm which is right
++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
++
++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] # per-element 2 bit pattern; the shl by 30 below moves it into the top 2 bits of rb_ef
++ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base
++
++# Read image dimensions
++ sub r0, unif, 1 # pic c width
++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
++ sub rb_max_y, unif, 1 # pic c height
++
++# load constants
++ mov ra_kff800100, 0xff800100
++ mov rb_pmask, v_pmask
++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++
++# get source pitch
++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2
++ mov rb_pitch, unif # stride1
++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
++
++ and r0, 1, elem_num # r0 = elem_num & 1
++ nop ; mul24 r0, r0, 5 # r0 = 5 * (elem_num & 1)
++.if v_bit_depth <= 8
++ add rb_elem_x, r0, elem_num
++.else
++ add r0, r0, elem_num
++ add rb_elem_x, r0, r0
++.endif
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# rb_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
++ min r0, r0, rb_max_x
++
++# Get shift
++# Shift will always calculate as 0 for 9+ bit
++# Ideally we can optimize the shift out of the code in these cases but for now
++# it is tidier to leave it in
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.else
++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++.endif
++
++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch # r1 = -rb_pitch, used as a mask on the byte x offset
++ and r1, r0, r1
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra0, unif # ; next_x2_y2
++ add ra_base, ra_base, r0
++
++# Compute part of VPM to use for DMA output
++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++# And again for L1, but only worrying about frame2 stuff
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# rb_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift
++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
++ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2
++ min r0, r0, rb_max_x
++
++# Get shift (already zero if 9+ bit so ignore)
++.if v_bit_depth <= 8
++ shl rb_xshift2_next, r0, 3
++.endif
++
++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch
++ and r1, r0, r1 ; mov r3, PREREAD
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r2, ra_y2
++ add rb_base2, rb_base2, r0 ; mov r0, ra_y
++
++# Do preloads: each pass clamps y (then y2) to [0, rb_max_y], scales it by rb_pitch and queues one fetch on t0s (then t1s); r3 counts PREREAD passes
++# r0 = ra_y, r2 = ra_y2, r3 = PREREAD
++
++:1
++ sub.setf r3, r3, 1
++ max r1, r0, 0
++ min r1, r1, rb_max_y
++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t0s, ra_base, r1 ; mov ra_y, r0
++
++ max r1, r2, 0
++ brr.anynz -, r:1b
++ min r1, r1, rb_max_y
++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t1s, rb_base2, r1 ; mov ra_y2, r2
++# >>> .anynz 1b
++
++ mov ra_link, unif # link
++# touch registers to keep simulator happy (and fills in delay slots)
++ mov ra4, 0 ; mov rb4, 0
++ bra -, ra_link
++ mov ra5, 0 ; mov rb5, 0
++ mov ra6, 0 ; mov rb6, 0
++ mov ra7, 0 ; mov rb7, 0
++# >>> ra_link
++.endm
++# Instantiate for 8 bit: the _q0 entry runs m_setup_q0 and then falls through into the _qn code
++::mc_setup_c_q0
++ m_setup_q0
++::mc_setup_c_qn
++ m_setup_c 8
++
++################################################################################
++#
++# mc_filter_c_p
++#
++# typedef struct qpu_mc_pred_c_p_s {
++# int16_t y;
++# int16_t x;
++# uint32_t base;
++# uint16_t h;
++# uint16_t w;
++# uint32_t coeffs_x;
++# uint32_t coeffs_y;
++# uint32_t wo_u;
++# uint32_t wo_v;
++# uint32_t dst_addr_c;
++# uint32_t next_fn;
++# } qpu_mc_pred_c_p_t;
++
++.macro m_filter_c_p, v_tmu, v_bit_depth
++# Chroma P (single reference) prediction for one block. v_tmu picks the TMU and position registers used (0: t0s/ldtmu0 with ra_y/ra_base, otherwise t1s/ldtmu1 with ra_y2/rb_base2). Reads the block's uniforms, then loops per row: fetch, 4 tap H filter, 4 tap V filter over a 4 entry FIFO, weight/offset, clamp and VPM write. Each segment of at most ra_blk_height rows is DMAed out; exits through ra_link once no height remains.
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_x_mul, 2
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_x_mul, 4
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_tmu == 0
++.set vrx_xshift, rb_xshift2 # b side more convenient
++.set vrx_xshift_next, ra_xshift_next
++.set vra_y_next, ra_y_next
++.set vrx_base_next, ra_base_next
++.set vra_y, ra_y
++.set vra_base, ra_base
++.set vr_txs, t0s
++.else
++.set vrx_xshift, ra_xshift # a side more convenient
++.set vrx_xshift_next, rb_xshift2_next
++.set vra_y_next, ra_y2_next
++.set vrx_base_next, rb_base2_next
++.set vra_y, ra_y2
++.set vra_base, rb_base2
++.set vr_txs, t1s
++.endif
++
++# denom shift values
++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
++
++# per-channel shifts were calculated on the *previous* invocation
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++
++.if v_bit_depth <= 8
++ shl vrx_xshift_next, r0, 3
++ and r0, r0, -4
++.endif
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
++ add vrx_base_next, r3, r0 ; mov r1, ra_height
++
++# set up VPM write
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
++
++# Misc final setup...
++
++ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr
++ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2)
++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
++ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
++ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4)
++ mov rb11, ra3.8d ; mov ra_link, unif # ; Link
++
++# r5 = -4 (loop counter)
++# ra_wt_mul_l0 = weight L0 + 128 (now unsigned)
++# rb_wt_off = (offset * 2 + 1) << (wt_den + 5)
++# rb31 = FIR value offset
++
++# FIFO: rb4, ra5, rb6, ra7
++# Coeffs in ra3.8a, ra3.8b, rb10, rb11
++
++# We want (r0r1)
++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
++# We fetch (after shift)
++# C0 : C3 : C1 : C4 : C2 : C5 : ...
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++.if v_tmu == 0
++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++.else
++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay]
++.endif
++
++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++ min r3, r3, rb_max_y ; mov.ifnc r0, r2
++
++ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++.if v_tmu == 0
++ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes
++.else
++ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes
++.endif
++
++# apply horizontal filter
++# The filter coeffs for the two halves of this are the same (unlike in the
++# Y case) so it doesn't matter which ra0 we get them from
++# Also as the two halves are locked together we don't need to separate the 1st
++# r0 mul or the last r1 mul as they are valid for all QPUs
++
++ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++
++# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift)
++# We would like to save the r5->r4 shift but we need a delay slot
++# for both r7 & r6 which we can't find anything to put in if we have
++# already multiplied r4 & r5!
++ brr.anyn -, r:1b
++ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post
++ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post
++ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++# >>> .anyn 1b (taken while r5 is still -ve, i.e. the V FIFO is not yet full)
++
++ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay]
++ sub r1, r1, r0 ; mul24 r0, ra7, rb11
++ sub r1, r1, r0
++
++ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop
++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop
++ brr.anyn -, r:1b
++ asr r1, r1, i_wt_den_p6
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++# Instantiations (8 bit): mc_filter_c_p works through TMU0, mc_filter_c_p_l1 through TMU1
++::mc_filter_c_p
++ m_filter_c_p 0, 8
++
++::mc_filter_c_p_l1
++ m_filter_c_p 1, 8
++
++################################################################################
++#
++# mc_filter_c_b
++#
++# typedef struct qpu_mc_pred_c_b_s {
++# int16_t y;
++# int16_t x;
++# uint32_t base;
++# uint16_t h;
++# uint16_t w;
++# uint32_t coeffs_x1;
++# uint32_t coeffs_y1;
++# int16_t weight_u1;
++# int16_t weight_v1;
++# int16_t y2;
++# int16_t x2;
++# uint32_t base2;
++# uint32_t coeffs_x2;
++# uint32_t coeffs_y2;
++# uint32_t wo_u2;
++# uint32_t wo_v2;
++# uint32_t dst_addr_c;
++# uint32_t next_fn;
++# } qpu_mc_pred_c_b_t;
++
++.macro m_filter_c_b, v_bit_depth
++# Chroma B (bi-pred) prediction for one block. Each row is fetched from both TMUs (L0 via ldtmu0/t0s, L1 via ldtmu1/t1s); each side has its own H coeffs (ra0 / ra1), V coeffs (ra2 / ra3) and weight (ra_wt_mul_l0 / ra_wt_mul_l1). The two weighted results are summed with the single combined offset rb_wt_off, shifted by ra_wt_den_p7, clamped and written to the VPM; segment DMA and exit via ra_link follow the same pattern as m_filter_c_p.
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++.set v_x_mul, (1 << v_x_shift)
++
++# denom shift values
++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
++
++# per-channel shifts were calculated on the *previous* invocation
++
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs
++
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.endif
++
++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*v_x_mul (we are working in pel pairs)
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++
++# set up VPM write
++
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight
++
++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
++ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x
++
++# L1 - uniform layout could possibly be optimized
++
++ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<<shift ; L1 H filter coeffs
++ add r0, r0, rb_elem_x ; mov ra3, unif # ; L1 V filter coeffs
++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
++ max r0, r0, r5 ; mov ra9, rb_max_y
++ min r0, r0, rb_max_x ; mov r2, ra_kmul_add
++
++.if v_bit_depth <= 8
++ shl rb_xshift2_next, r0, 3
++.endif
++
++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
++ and r1, r0, r1 ; mov r5rep, -4
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dst_addr
++ add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
++
++ add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++ add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++ add r0, r0, r1 ; mov r1, ra_wt_off_l1 # ; L0 off unset
++ shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
++ sub rb_wt_off, r1, r0 ; mov ra_link, unif # ; link
++
++ mov ra10, rb_xshift2 ; mov rb7, ra2.8d
++
++# r5 loop counter (-4)
++# ra0 H coeffs L0
++# ra1 H coeffs L1
++# ra2 V coeffs L0
++# ra3 V coeffs L1
++# ra9 rb_max_y alias
++# ra10 rb_xshift2 alias
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
++ shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next # [ra_y delay]
++ add ra_y, 1, ra_y ; mov r3, ra_y
++
++ max r3, r3, ra_k0 ; mov r0, r1 << 15
++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # ; masks bytes
++
++# L0 H-filter (-ra4*, +rb5, +rb6, -ra7)
++
++ and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
++ sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
++ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++
++ add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
++
++ shr r2, r4, ra10 ; mov rb5, rb6
++ shr r1, r2, v_v_shift ; mov r3, ra_y2
++ shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 # [r1 << delay]
++
++ add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++ min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
++
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax # ; masks bytes
++
++# L1 H-filter (-r0*, +rb9, +rb10, -ra11)
++
++ add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
++ sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
++ sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++ add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++
++ brr.anyn -, r:1b
++ add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
++ mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
++ shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++# >>> .anyn 1b (taken while r5 is still -ve)
++
++ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0
++ sub.setf -, r5, rb_lcount ; mov r0, ra4
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++ add r1, r1, r0 ; mul24 r0, ra7, rb7
++
++ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1
++ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1
++ sub r2, r2, r0
++
++ shr r1, r1, 6
++ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
++ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
++ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
++ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop
++
++ brr.anyn -, r:1b
++ asr r1, r1, ra_wt_den_p7
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++# Instantiation (8 bit)
++::mc_filter_c_b
++ m_filter_c_b 8
++
++################################################################################
++# Exit code used by both Luma & Chroma so place between them to avoid I-cache
++# conflicts
++# m_exit_drain: issue PREREAD loads on each of TMU0 and TMU1 (presumably consuming the preread requests still queued - confirm) and read vw_wait so the VPM DMA write has finished before the caller ends the thread
++.macro m_exit_drain
++.if PREREAD == 2
++# Special case 2 as loop is wasteful
++ nop ; nop ; ldtmu0
++ nop ; nop ; ldtmu1
++ nop ; nop ; ldtmu0
++ mov -, vw_wait ; nop ; ldtmu1
++.else
++ mov.setf r3, PREREAD - 1
++:1
++ brr.anynz -, r:1b
++ nop ; nop ; ldtmu0
++ nop ; nop ; ldtmu1
++ sub.setf r3, r3, 1
++ # >>> the three instructions above are the branch delay slots, so one ldtmu0/ldtmu1 pair is issued per pass
++ mov -, vw_wait
++.endif
++.endm
++
++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
++# All qpus start at the beginning and after that (group - 1) must have finished
++# before (group) can start
++#
++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
++# lockup otherwise)
++#
++# There is some, currently ill defined, potential lockup if we have the VDM active
++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
++#
++# The code stalled when I had many waiters on a single sem so we have a
++# "ripple" of srels to restart. Unsure why, may have been bug, but this works
++# and we currently have both the memory & sems to support it.
++.macro m_sync_q, n_qpu, n_quads
++# m_sync_q n_qpu, n_quads: sync code for QPU number n_qpu when n_quads groups of 4 QPUs are in use; returns to the address read from the next uniform. Do not generate code for qpu >= quads * 4 - fns should never be called
++.if n_qpu < n_quads * 4
++ mov ra_link, unif # Can only branch to an a reg (not r0)
++ mov -, vw_wait # [ra_link delay]
++# Semaphore numbers: n_sem_sync = first QPU number of this group of 4, n_sem_in = this QPU, n_sem_out = next QPU
++.set n_sem_sync, n_qpu - (n_qpu % 4)
++.set n_sem_in, n_qpu
++.set n_sem_out, n_qpu + 1
++
++.if n_qpu % 4 == 0
++# First QPU of a group: acquire n_sem_sync three times (the other branch releases it once per remaining group member), acquire this group's quad semaphore (12 + group), then release the next QPU and the next group's quad semaphore
++.set n_sem_quad_in, 12 + n_qpu / 4
++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
++
++ sacq -, n_sem_sync
++ sacq -, n_sem_sync
++ sacq -, n_sem_sync
++ bra -, ra_link
++ sacq -, n_sem_quad_in
++ srel -, n_sem_out
++ srel -, n_sem_quad_out
++# Other QPUs: release n_sem_sync, acquire our own semaphore, then release the next QPU unless that QPU starts a new group
++.else
++ bra -, ra_link
++ srel -, n_sem_sync
++ sacq -, n_sem_in
++.if n_sem_out % 4 != 0
++ srel -, n_sem_out
++.else
++ nop
++.endif
++.endif
++.endif
++.endm
++
++.set v_quads8, N_QPU_8 / 4
++# One sync entry point per QPU number 0-11; entries with n_qpu >= v_quads8 * 4 emit no code
++::mc_sync_q0
++ m_sync_q 0, v_quads8
++::mc_sync_q1
++ m_sync_q 1, v_quads8
++::mc_sync_q2
++ m_sync_q 2, v_quads8
++::mc_sync_q3
++ m_sync_q 3, v_quads8
++::mc_sync_q4
++ m_sync_q 4, v_quads8
++::mc_sync_q5
++ m_sync_q 5, v_quads8
++::mc_sync_q6
++ m_sync_q 6, v_quads8
++::mc_sync_q7
++ m_sync_q 7, v_quads8
++::mc_sync_q8
++ m_sync_q 8, v_quads8
++::mc_sync_q9
++ m_sync_q 9, v_quads8
++::mc_sync_q10
++ m_sync_q 10, v_quads8
++::mc_sync_q11
++ m_sync_q 11, v_quads8
++
++# mc_exit()
++# Chroma & Luma the same now
++# m_exit_qn: run m_exit_drain, then signal thrend; the two nops are the instructions that follow the thrend signal. Shared by the chroma and luma exit labels below
++.macro m_exit_qn
++ m_exit_drain
++ nop ; nop ; thrend
++ nop
++ nop
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_qn
++::mc_exit_y_qn
++ m_exit_qn
++
++
++
++# mc_interrupt_exit12()
++# m_exit_q0: like m_exit_qn, but acquires semaphore 12 before thrend and writes 1 to the interrupt register in the first instruction after the thrend signal
++.macro m_exit_q0
++ m_exit_drain
++ sacq -, 12
++ nop ; nop ; thrend
++ mov interrupt, 1
++ nop
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_q0
++::mc_exit_y_q0
++ m_exit_q0
++
++# LUMA CODE
++
++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
++# For P frames we make the second x,y coordinates offset by +8
++
++
++################################################################################
++# mc_setup
++#
++# typedef struct qpu_mc_pred_y_s_s {
++# qpu_mc_src_t next_src1;
++# qpu_mc_src_t next_src2;
++# uint16_t pic_h;
++# uint16_t pic_w;
++# uint32_t stride2;
++# uint32_t stride1;
++# uint32_t wdenom;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_s_t;
++
++.macro m_setup_y, v_bit_depth
++# Luma setup for a picture of v_bit_depth bits (luma counterpart of m_setup_c): loads constants, picture limits and pitches, computes the starting TMU0/TMU1 fetch addresses, queues PREREAD texture requests on each TMU, computes this QPU's VPM/DMA bases and branches to the address in the final (link) uniform
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_pmask, 0xff
++.set v_blk_height, Y_BLK_HEIGHT_8
++.else
++.set v_x_shift, 1
++.set v_pmask, 0xffff
++.set v_blk_height, Y_BLK_HEIGHT_16
++.endif
++# Uniforms are consumed in this order: x_y, ref_y_base, x2_y2, ref_y2_base, width_height, stride2, stride1, next_fn. NOTE(review): nothing is read for the wdenom field listed in the struct comment above - confirm which is right
++
++ # Need to save these because we need to know the frame dimensions before computing texture coordinates
++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
++ mov ra9, unif # ref_y_base
++ mov ra1, unif # x2_y2
++
++
++# load constants
++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base
++
++ mov ra_kff800100, 0xff800100
++ mov rb_pmask, v_pmask
++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++ mov rb_y_coeffs_2, 0x050b0a00
++ mov rb_y_coeffs_3, 0x11283a40
++ mov rb_y_coeffs_5, 0x0a0b0500
++
++# Compute part of VPM to use (actually done by the m_calc_dma_regs call after the preload loop below)
++
++# Read image dimensions
++ mov ra3, unif # width_height
++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2
++.if v_x_shift == 0
++ sub rb_max_x, ra3.16b, 1
++.else
++ sub r0, ra3.16b, 1
++ shl rb_max_x, r0, v_x_shift
++.endif
++ sub rb_max_y, ra3.16a, 1
++ mov r3, elem_num ; mov rb_pitch, unif # stride1
++
++# get destination pitch
++ mov r1, vdw_setup_1(0) # [rb_pitch delay]
++ or rb_dma1_base, r1, rb_pitch
++
++# Compute base address for first and second access
++ add r0, ra0.16b, r3 # Load x + elem_num
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++ shl ra_xshift_next, r0, 3 # Compute shifts
++
++# X is byte offset - we can only load words - mask
++
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch # r2 = -rb_pitch (mask), reused for the second access below
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add ra_base, ra9, r0
++
++ # r3 still contains elem_num
++ add r0, ra1.16b, r3 # Load x
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++
++ # r2 still contains mask
++ and r0, r0, -4
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add rb_base2, ra11, r0
++
++# Do preloads: each pass clamps y (then y2) to [0, rb_max_y], scales it by rb_pitch and queues one fetch on t0s (then t1s); r3 counts PREREAD passes
++ nop ; mov r0, ra0.16a # ; r0 = y
++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
++
++:1
++ sub.setf r3, r3, 1
++ max r1, r0, 0
++ min r1, r1, rb_max_y
++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t0s, ra_base, r1 ; mov ra_y, r0
++
++ max r1, r2, 0
++ brr.anynz -, r:1b
++ min r1, r1, rb_max_y
++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t1s, rb_base2, r1 ; mov ra_y2, r2
++# >>> .anynz 1b
++
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++ mov ra_link, unif # Next fn
++
++# touch vertical context to keep simulator happy
++ mov ra8, 0 ; mov rb8, 0 # [ra_link delay]
++ bra -, ra_link
++ mov ra9, 0 ; mov rb9, 0
++ mov ra10, 0 ; mov rb10, 0
++ mov ra11, 0 ; mov rb11, 0
++# >>> ra_link
++.endm
++# Instantiate for 8 bit: the _q0 entry runs m_setup_q0 and then falls through into the _qn code
++::mc_setup_y_q0
++ m_setup_q0
++::mc_setup_y_qn
++ m_setup_y 8
++
++################################################################################
++#
++# Start of per-block setup code
++# P and B blocks share the same setup code to save on Icache space
++
++# get base addresses and per-channel shifts for *next* invocation
++# per-channel shifts were calculated on the *previous* invocation
++
++# 1st 3 instructions of per_block-setup in branch delay
++#
++# typedef struct qpu_mc_pred_y_p_s {
++# qpu_mc_src_t next_src1;
++# qpu_mc_src_t next_src2;
++# uint16_t h;
++# uint16_t w;
++# uint32_t mymx21;
++# uint32_t wo1;
++# uint32_t wo2;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p_t;
++#
++# m_luma_setup: branch to per_block_setup_<v_bit_depth> with the return address placed in ra_link; the three instructions after the brr are its delay slots and act as the first 3 instructions of the per-block setup. No branch is emitted for depths other than 8 or 10.
++.macro m_luma_setup, v_bit_depth
++# Hack - QASM may well have label pasting but I have no idea how...
++.if v_bit_depth == 8
++ brr ra_link, r:per_block_setup_8
++.elif v_bit_depth == 10
++ brr ra_link, r:per_block_setup_10
++.endif
++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
++ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next # r0 = x + elem_num
++.endm
++
++.macro m_per_block_setup, v_bit_depth
++# Shared per-block luma setup, entered from m_luma_setup with r0 = x + elem_num, r3 = elem_num, r5 = 0 and the return address in ra_link. Computes the next-block source addresses and shifts for both TMUs and the VPM/DMA setup for this block, then unpacks the filter coefficients selected by the packed filter-offset uniform into ra0-ra3, rb4, rb8 and rb11. Returns through ra_link with r5 = -8, r2 = ra_fir_off_val, r3 = ra_kmul_add, after reloading ra_link from the next uniform.
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4
++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base
++ and r1, r0, r2 ; mov ra_y_next, ra0.16a
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
++ add ra_base_next, ra_base_next, r0 # [ra1 delay]
++
++ add r0, ra1.16b, r3 # Load x2
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
++ add rb_base2_next, rb_base2_next, r0
++
++# get width,height of block (unif load above), r1 = width * pel_size
++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
++ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++ add rb_lcount, r0, (7-8)
++ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val
++ add r0, r0, r1 # Combine width and height of destination area
++ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val
++ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
++
++# get filter coefficients and discard unused B frame values
++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
++ shl ra8, r0, 3 ; mov rb5, ra_k255
++
++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
++
++# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val
++# but I can't see a way of doing that that is cheap enough to be worth it
++
++# Picked out in a slightly random order to space out uniform loads
++
++ # 1
++ mov r1, 0x01040400 # [ra8 delay]
++ ror ra2.8b, r1, ra8.8d
++ ror ra0.8b, r1, ra8.8c
++ # 2
++ ror ra2.8c, rb_y_coeffs_2, ra8.8d
++ ror ra0.8c, rb_y_coeffs_2, ra8.8c
++ # 0
++ mov r1,0x00010100 # -ve [ra8 delay]
++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset
++ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
++ # 7
++ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000
++ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address
++ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
++ # 3
++ ror ra2.8d, rb_y_coeffs_3, ra8.8d
++ ror ra0.8d, rb_y_coeffs_3, ra8.8c
++ # 5
++ ror ra3.8b, rb_y_coeffs_5, ra8.8d
++ ror ra1.8b, rb_y_coeffs_5, ra8.8c
++ # 6
++ mov r1,0x04040100
++ ror ra3.8c, r1, ra8.8d
++ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val
++
++ bra -, ra_link
++ # 4
++ mov r1,0x3a281100
++ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val
++ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
++# >>> branch ra_link
++
++# r5 = -8
++# r2 = fir_off_val
++# r3 = 128
++.endm
++# Instantiation (8 bit); m_luma_setup 8 branches to this label
++:per_block_setup_8
++ m_per_block_setup 8
++
++
++
++################################################################################
++#
++# mc_filter_y_pxx
++#
++# Setup (& therefore uniform struct) shared with _bxx
++# Struct in m_luma_setup
++#
++# We can have 2 separate P reqs here as long as they mate to generate a
++# rectangular output block (i.e. h0 = h1, w0 = 8)
++#
++# At this point we have already issued PREREAD pairs of texture requests for the current block
++
++.macro m_filter_y_pxx, v_bit_depth
++
++# denom shift values
++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
++
++ m_luma_setup v_bit_depth
++
++ shl r1, ra_wt_off_l0, i_wt_den_p5
++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul
++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++# This loop is identical to the B loop from here --->
++:1
++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++
++ max r2, ra_y, 0 ; mov r1, 0
++ min r2, r2, rb_max_y ; mov r3, ra_k1
++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++ add t0s, ra_base, r2 ; mov rb5, rb6
++ shr r0, r4, ra_xshift ; mov rb6, rb7
++
++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes
++ shr r1, r4, rb_xshift2 ; mov rb7, ra8
++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++ add t1s, rb_base2, r2 ; mov ra8, ra9
++
++# apply horizontal filter
++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++ brr.anyn -, r:1b
++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++ # >>> .anyn 1b (r5 + r5)
++
++ # apply vertical filter and write to VPM
++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
++
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb8
++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++ add r1, r1, r0 ; mul24 r0, ra11, rb11
++# <--- to here
++ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height
++ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
++ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
++
++ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate)
++
++ brr.anyn -, r:1b
++ asr r1, r1, i_wt_den_p6
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
++# >>> branch.anyn 1b (r5 - rb_lcount)
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height (r3) lines last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_pxx
++ m_filter_y_pxx 8
++
++
++################################################################################
++
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++#
++# Setup (& therefore uniform struct) shared with _pxx
++# Struct in m_luma_setup
++#
++# L0 calc in els 0-7, L1 in 8-15
++# Only els 0-7 write data that is stored back to ram (els 8-15 may write garbage)
++#
++# At this point we have already issued PREREAD pairs of texture requests for the current block
++
++.macro m_filter_y_bxx, v_bit_depth
++
++# denom shift values
++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
++
++ m_luma_setup v_bit_depth
++
++ shl r1, ra_wt_off_l0, i_wt_den_p6
++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++
++# This loop is identical to the P loop from here --->
++:1
++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++
++ max r2, ra_y, 0 ; mov r1, 0
++ min r2, r2, rb_max_y ; mov r3, ra_k1
++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++ add t0s, ra_base, r2 ; mov rb5, rb6
++ shr r0, r4, ra_xshift ; mov rb6, rb7
++
++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes
++ shr r1, r4, rb_xshift2 ; mov rb7, ra8
++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++ add t1s, rb_base2, r2 ; mov ra8, ra9
++
++# apply horizontal filter
++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++ brr.anyn -, r:1b
++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++ # >>> .anyn 1b (r5 + r5)
++
++ # apply vertical filter and write to VPM
++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
++
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb8
++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++ add r1, r1, r0 ; mul24 r0, ra11, rb11
++# <--- to here
++ sub r1, r1, ra4
++ sub r1, r1, r0 ; mov r2, rb_wt_off
++
++ asr r1, r1, 6
++ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
++ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
++ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
++ add r1, r1, r2 ; mov r0, r1 << 8
++ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height
++
++ brr.anyn -, r:1b
++ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate)
++# >>> branch.anyn 1b (r5 - rb_lcount)
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link (ra_height - remaining height)
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_bxx
++ m_filter_y_bxx 8
++
++################################################################################
++#
++# typedef struct qpu_mc_pred_y_p00_s {
++# qpu_mc_src_t next_src1;
++# uint16_t h;
++# uint16_t w;
++# uint32_t wo1;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p00_t;
++
++.macro m_filter_y_p00, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++ mov ra0, unif ; mov r0, elem_num # y_x
++ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0
++ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++
++ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height
++ min r0, r0, rb_max_x ; mov ra_width_height, unif
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4
++ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr
++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
++
++# get width,height of block (unif load above)
++# Compute vdw_setup1(dst_pitch-width)
++ shl r1, ra_width, v_x_shift
++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++ add r0, r0, r1 # Combine width and height of destination area
++ shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link
++ add ra_dma0, r0, rb_dma0_base
++
++:1
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++ shl r1, r1, 8 ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++
++ brr.anyn -, r:1b
++ asr r1, r1, DENOM + 8
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height (r3) lines last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_p00
++ m_filter_y_p00 8
++
++################################################################################
++
++.macro m_filter_y_b00, v_bit_depth
++# luma setup does a fair bit more than we need calculating filter coeffs
++# that we will never use but it saves I-cache to use it (also simple!)
++ m_luma_setup v_bit_depth
++
++# Fix up vals that were expecting a filter (somewhat icky)
++ mov r2, 1
++ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want
++ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero
++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++
++:1
++ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8subs masks out all but bottom byte
++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
++
++ shl r1, r1, 8 ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++
++ brr.anyn -, r:1b
++ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height (r3) lines last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_b00
++ m_filter_y_b00 8
++
++################################################################################
++################################################################################
++# 10 BIT
++
++::mc_setup_c10_q0
++ m_setup_q0
++::mc_setup_c10_qn
++ m_setup_c 10
++
++::mc_filter_c10_p
++ m_filter_c_p 0, 10
++
++::mc_filter_c10_p_l1
++ m_filter_c_p 1, 10
++
++
++::mc_filter_c10_b
++ m_filter_c_b 10
++
++# Even if these fns are the same as for other bit depths we want our own copy
++# to keep the code we are using in a single lump to avoid (direct map) cache
++# thrashing
++.set v_quads10, N_QPU_16 / 4
++
++::mc_sync10_q0
++ m_sync_q 0, v_quads10
++::mc_sync10_q1
++ m_sync_q 1, v_quads10
++::mc_sync10_q2
++ m_sync_q 2, v_quads10
++::mc_sync10_q3
++ m_sync_q 3, v_quads10
++::mc_sync10_q4
++ m_sync_q 4, v_quads10
++::mc_sync10_q5
++ m_sync_q 5, v_quads10
++::mc_sync10_q6
++ m_sync_q 6, v_quads10
++::mc_sync10_q7
++ m_sync_q 7, v_quads10
++::mc_sync10_q8
++ m_sync_q 8, v_quads10
++::mc_sync10_q9
++ m_sync_q 9, v_quads10
++::mc_sync10_q10
++ m_sync_q 10, v_quads10
++::mc_sync10_q11
++ m_sync_q 11, v_quads10
++
++::mc_exit_y10_q0
++::mc_exit_c10_q0
++ m_exit_q0
++
++::mc_exit_y10_qn
++::mc_exit_c10_qn
++ m_exit_qn
++
++::mc_setup_y10_q0
++ m_setup_q0
++::mc_setup_y10_qn
++ m_setup_y 10
++
++:per_block_setup_10
++ m_per_block_setup 10
++
++::mc_filter_y10_pxx
++ m_filter_y_pxx 10
++
++::mc_filter_y10_p00
++ m_filter_y_p00 10
++
++::mc_filter_y10_bxx
++ m_filter_y_bxx 10
++
++::mc_filter_y10_b00
++ m_filter_y_b00 10
++
++
++
++::mc_end
++# Do not add code here because mc_end must appear after all other code.
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_cmd.h
+@@ -0,0 +1,165 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#ifndef RPI_SHADER_CMD_H
++#define RPI_SHADER_CMD_H
++#include <stdint.h> // fixed-width integer types used by the structs below; kept outside the pack(push) region
++#pragma pack(push, 4)
++
++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
++// If mixed then we are just confused and get a lot of warnings....
++typedef const uint8_t * qpu_mc_src_addr_t;
++typedef uint8_t * qpu_mc_dst_addr_t;
++#else
++typedef uint32_t qpu_mc_src_addr_t;
++typedef uint32_t qpu_mc_dst_addr_t;
++#endif
++
++typedef struct qpu_mc_src_s
++{
++ int16_t y;
++ int16_t x;
++ qpu_mc_src_addr_t base;
++} qpu_mc_src_t;
++
++
++typedef struct qpu_mc_pred_c_p_s {
++ qpu_mc_src_t next_src;
++ uint16_t h;
++ uint16_t w;
++ uint32_t coeffs_x;
++ uint32_t coeffs_y;
++ uint32_t wo_u;
++ uint32_t wo_v;
++ qpu_mc_dst_addr_t dst_addr_c;
++ uint32_t next_fn;
++} qpu_mc_pred_c_p_t;
++
++typedef struct qpu_mc_pred_c_b_s {
++ qpu_mc_src_t next_src1;
++ uint16_t h;
++ uint16_t w;
++ uint32_t coeffs_x1;
++ uint32_t coeffs_y1;
++ int16_t weight_u1;
++ int16_t weight_v1;
++ qpu_mc_src_t next_src2;
++ uint32_t coeffs_x2;
++ uint32_t coeffs_y2;
++ uint32_t wo_u2;
++ uint32_t wo_v2;
++ qpu_mc_dst_addr_t dst_addr_c;
++ uint32_t next_fn;
++} qpu_mc_pred_c_b_t;
++
++typedef struct qpu_mc_pred_c_s_s {
++ qpu_mc_src_t next_src1;
++ uint32_t pic_cw; // C Width (== Y width / 2)
++ uint32_t pic_ch; // C Height (== Y Height / 2)
++ uint32_t stride2;
++ uint32_t stride1;
++ qpu_mc_src_t next_src2;
++ uint32_t next_fn;
++} qpu_mc_pred_c_s_t;
++
++typedef struct qpu_mc_pred_c_s {
++ union {
++ qpu_mc_pred_c_p_t p;
++ qpu_mc_pred_c_b_t b;
++ qpu_mc_pred_c_s_t s;
++ };
++} qpu_mc_pred_c_t;
++
++
++typedef struct qpu_mc_pred_y_p_s {
++ qpu_mc_src_t next_src1;
++ qpu_mc_src_t next_src2;
++ uint16_t h;
++ uint16_t w;
++ uint32_t mymx21;
++ uint32_t wo1;
++ uint32_t wo2;
++ qpu_mc_dst_addr_t dst_addr;
++ uint32_t next_fn;
++} qpu_mc_pred_y_p_t;
++
++typedef struct qpu_mc_pred_y_p00_s {
++ qpu_mc_src_t next_src1;
++ uint16_t h;
++ uint16_t w;
++ uint32_t wo1;
++ qpu_mc_dst_addr_t dst_addr;
++ uint32_t next_fn;
++} qpu_mc_pred_y_p00_t;
++
++typedef struct qpu_mc_pred_y_s_s {
++ qpu_mc_src_t next_src1;
++ qpu_mc_src_t next_src2;
++ uint16_t pic_h;
++ uint16_t pic_w;
++ uint32_t stride2;
++ uint32_t stride1;
++ uint32_t next_fn;
++} qpu_mc_pred_y_s_t;
++
++typedef struct qpu_mc_pred_sync_s {
++ uint32_t next_fn;
++} qpu_mc_pred_sync_t;
++
++// Only a useful structure in that it allows us to return something other than a void *
++typedef struct qpu_mc_pred_y_s {
++ union {
++ qpu_mc_pred_y_p_t p;
++ qpu_mc_pred_y_p00_t p00;
++ qpu_mc_pred_y_s_t s;
++ };
++} qpu_mc_pred_y_t;
++
++typedef union qpu_mc_pred_cmd_u {
++ qpu_mc_pred_y_t y;
++ qpu_mc_pred_c_t c;
++ qpu_mc_pred_sync_t sync;
++} qpu_mc_pred_cmd_t;
++
++static inline void qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn)
++{
++ // Link (next_fn) is the last el of the previous cmd, i.e. the 32-bit word just before cmd
++ ((uint32_t *)cmd)[-1] = fn;
++}
++
++#define QPU_MC_PRED_N_Y8 12
++#define QPU_MC_PRED_N_C8 12
++
++#define QPU_MC_PRED_N_Y10 12
++#define QPU_MC_PRED_N_C10 12
++
++#define QPU_MC_DENOM 7
++
++#pragma pack(pop)
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.c
+@@ -0,0 +1,88 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++
++typedef struct shader_track_s
++{
++ const union qpu_mc_pred_cmd_u *qpu_mc_curr;
++ const struct qpu_mc_src_s *last_l0;
++ const struct qpu_mc_src_s *last_l1;
++ uint32_t width; // pic_width * PW
++ uint32_t height;
++ uint32_t stride2;
++ uint32_t stride1;
++} shader_track_t;
++
++static int wtoidx(const unsigned int w) // block width -> table index; widths not listed (or > 64) give 0
++{
++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++ return w < sizeof(pel_weight) ? pel_weight[w] : 0; // uint8_t elements so sizeof == element count; guards the out-of-range read
++}
++
++static int fctom(uint32_t x) // plain int: a const qualifier on a by-value return type is ignored
++{
++ int rv;
++ // As it happens we can take the 2nd filter term & divide it by 8
++ // (dropping fractions) to get the fractional move
++ rv = 8 - ((x >> 11) & 0xf);
++ av_assert2(rv >= 0 && rv <= 7);
++ return rv;
++}
++
++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) // sign-extending field extract: drop the top shl bits, then shift right by shr
++{
++ return (int32_t)((uint32_t)x << shl) >> shr; // left shift done as unsigned: << on a negative/overflowing int is undefined
++}
++
++static inline int woff_p(HEVCRpiContext *const s, int32_t x)
++{
++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int woff_b(HEVCRpiContext *const s, int32_t x)
++{
++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int wweight(int32_t x)
++{
++ return ext(x, 16, 16);
++}
++
++
++#define PW 1
++#include "rpi_hevc_shader_template_fn.h"
++
++#undef PW
++#define PW 2
++#include "rpi_hevc_shader_template_fn.h"
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.h
+@@ -0,0 +1,49 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++#include <stdint.h> // uint8_t is used by the rpi_sand_dump8/16 prototypes
++struct HEVCRpiContext;
++struct HEVCRpiInterPredEnv;
++
++void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s,
++ const struct HEVCRpiInterPredEnv *const ipe_y,
++ const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s,
++ const struct HEVCRpiInterPredEnv *const ipe_y,
++ const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void rpi_sand_dump8(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++void rpi_sand_dump16(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template_fn.h
+@@ -0,0 +1,502 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++#define PATCH_STRIDE (16 * PW)
++
++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{ // For each of h rows: replicate the pixel at src across w bytes starting at dst
++ for (; h != 0; --h, dst += stride, src += stride) {
++ const pixel fill = *(const pixel *)src; // the one pixel duplicated along this row
++ pixel * const out = (pixel *)dst;
++ for (unsigned int n = 0; n * PW < w; ++n) { // w counts bytes, each store writes PW bytes
++ out[n] = fill;
++ }
++ }
++}
++
++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{ // Copy the single w-byte row at src into h consecutive rows starting at dst
++ for (; h != 0; --h, dst += stride) {
++ memcpy(dst, src, w);
++ }
++}
++
++static void FUNC(get_patch_y)(const shader_track_t * const st,
++ uint8_t * dst, const unsigned int dst_stride,
++ const qpu_mc_src_t *src,
++ unsigned int _w, unsigned int _h)
++{
++ // Fill dst with a _w x _h pixel patch at (src->x, src->y); parts outside the picture are made by duplicating edge pixels/rows
++ int x = src->x * PW;
++ int y = src->y;
++ int w = _w * PW;
++ int h = _h;
++ int dl = 0, dr = 0, dt = 0, db = 0; // left/right padding in bytes, top/bottom padding in rows
++ const int width = st->width; // signed copies so the clip tests below do not mix signed & unsigned (as in get_patch_c)
++ const int height = st->height;
++
++ if (x < 0) {
++ if (-x >= w)
++ x = PW - w;
++ dl = -x;
++ w += x;
++ x = 0;
++ }
++ if (x + w > width) {
++ if (x >= width)
++ x = width - PW;
++ dr = (x + w) - width;
++ w = width - x;
++ }
++
++ // Y
++ if (y < 0) {
++ if (-y >= h)
++ y = 1 - h;
++ dt = -y;
++ h += y;
++ y = 0;
++ }
++ if (y + h > height) {
++ if (y >= height)
++ y = height - 1;
++ db = (y + h) - height;
++ h = height - y;
++ }
++
++ dst += dl + dt * dst_stride;
++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++ // Edge dup
++ if (dl != 0)
++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
++ if (dr != 0)
++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
++ w += dl + dr;
++ dst -= dl;
++
++ if (dt != 0)
++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
++ if (db != 0)
++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
++}
++
++
++
++static void FUNC(get_patch_c)(const shader_track_t * const st,
++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
++ const qpu_mc_src_t *src,
++ unsigned int _w, unsigned int _h)
++{
++ int x = src->x * PW;
++ int y = src->y;
++ int w = _w * PW;
++ int h = _h;
++ int dl = 0;
++ int dr = 0;
++ int dt = 0;
++ int db = 0;
++ const int width = st->width;
++ const int height = st->height;
++
++ if (x < 0) {
++ if (-x >= w)
++ x = PW - w;
++ dl = -x;
++ w += x;
++ x = 0;
++ }
++ if (x + w > width) {
++ if (x >= width)
++ x = width - PW;
++ dr = (x + w) - width;
++ w = width - x;
++ }
++
++ // Y
++ if (y < 0) {
++ if (-y >= h)
++ y = 1 - h;
++ dt = -y;
++ h += y;
++ y = 0;
++ }
++ if (y + h > height) {
++ if (y >= height)
++ y = height - 1;
++ db = (y + h) - height;
++ h = height - y;
++ }
++
++ dst_u += dl + dt * dst_stride;
++ dst_v += dl + dt * dst_stride;
++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++ // Edge dup
++ if (dl != 0)
++ {
++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
++ }
++ if (dr != 0)
++ {
++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
++ }
++ w += dl + dr;
++ dst_u -= dl;
++ dst_v -= dl;
++
++ if (dt != 0)
++ {
++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
++ }
++ if (db != 0)
++ {
++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
++ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
++ }
++}
++
++// x, y, w, h in pixels
++// stride1, stride2 in bytes
++void FUNC(rpi_sand_dump)(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++{
++ const int mask = stride2 == 0 ? ~0 : stride1 - 1;
++
++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++
++ if (is_c) {
++ x *= 2;
++ w *= 2;
++ }
++
++ for (int i = y; i != y + h; ++i) {
++ for (int j = x; j != x + w; ++j) {
++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
++ char sep = is_c && (j & 1) == 0 ? ':' : ' ';
++#if PW == 1
++ if (j < 0 || i < 0)
++ printf("..%c", sep);
++ else
++ printf("%02x%c", *(const pixel*)p, sep);
++#else
++ if (j < 0 || i < 0)
++ printf("...%c", sep);
++ else
++ printf("%03x%c", *(const pixel*)p, sep);
++#endif
++ }
++ printf("\n");
++ }
++}
++
++// Walks the QPU motion-compensation command queues on the CPU (ipe_y first, then ipe_c) and runs each command through the hevcdsp prediction functions.
++void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
++ const HEVCRpiInterPredEnv *const ipe_y,
++ const HEVCRpiInterPredEnv *const ipe_c)
++{
++ for (int c_idx = 0; c_idx < 2; ++c_idx)
++ {
++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; // per-queue walk state, zero-initialised
++ unsigned int exit_n = 0;
++
++ if (ipe == NULL || !ipe->used) { // no env, or env not used: nothing to do for this pass
++ continue;
++ }
++
++ do { // one pass advances every queue up to its next sync or exit
++ for (unsigned int i = 0; i != ipe->n; ++i) {
++ const HEVCRpiInterPredQ * const q = ipe->q + i;
++ shader_track_t * const st = tracka + i;
++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; // resume where the previous pass stopped
++
++ for (;;) {
++ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1]; // link word precedes each command; the first command is treated as setup
++
++ if (link == q->code_setup) {
++ if (c_idx == 0) {
++ // Luma
++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
++
++ st->height = c->pic_h;
++ st->width = c->pic_w * PW;
++ st->stride1 = c->stride1;
++ st->stride2 = c->stride2;
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else {
++ // Chroma
++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
++
++ st->height = c->pic_ch;
++ st->width = c->pic_cw * PW;
++ st->stride1 = c->stride1;
++ st->stride2 = c->stride2;
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ }
++ else if (link == s->qpu.y_pxx) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++ const int w1 = FFMIN(c->w, 8);
++ const int w2 = c->w - w1;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++ if (w2 > 0) {
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h + 7);
++ }
++
++ // wo[offset] = offset*2+1
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
++ if (w2 > 0) {
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
++ }
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_bxx) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h + 7);
++
++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
++
++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_p00) {
++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++
++ // wo[offset] = offset*2+1
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
++
++ st->last_l0 = &c->next_src1;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_b00) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ av_assert0(c->w <= 16 && c->h <= 64);
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h);
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h);
++
++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
++ patch_y3, patch_y1, PATCH_STRIDE,
++ c->h, 0, 0, c->w);
++
++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
++ 0, woff_b(s, c->wo2), 0, 0, c->w);
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_pxx) {
++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++ const int mx = fctom(c->coeffs_x);
++ const int my = fctom(c->coeffs_y);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l0 = &c->next_src;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_pxx_l1) {
++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++ const int mx = fctom(c->coeffs_x);
++ const int my = fctom(c->coeffs_y);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l1 = &c->next_src;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_bxx) {
++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
++ const int mx1 = fctom(c->coeffs_x1);
++ const int my1 = fctom(c->coeffs_y1);
++ const int mx2 = fctom(c->coeffs_x2);
++ const int my2 = fctom(c->coeffs_y2);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72];
++ uint8_t patch_v1[PATCH_STRIDE * 72];
++ uint8_t patch_u2[PATCH_STRIDE * 72];
++ uint8_t patch_v2[PATCH_STRIDE * 72];
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++ int16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; // signed intermediate, same type as patch_y3 in the luma bi-pred branches
++ int16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; // signed intermediate, same type as patch_y3 in the luma bi-pred branches
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, mx1, my1, c->w);
++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, mx1, my1, c->w);
++
++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
++ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2),
++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
++ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2),
++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == q->code_sync) {
++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
++ break;
++ }
++ else if (link == q->code_exit) {
++ // We expect exit to occur without other sync
++ av_assert0(i == exit_n);
++ ++exit_n;
++ break;
++ }
++ else {
++ av_assert0(0);
++ }
++ }
++
++ st->qpu_mc_curr = cmd;
++ }
++ } while (exit_n == 0); // repeat passes until a pass reaches code_exit
++ }
++}
++
++#undef FUNC
++#undef pixel
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform.s
+@@ -0,0 +1,444 @@
++# ******************************************************************************
++# Argon Design Ltd.
++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
++#
++# Module : HEVC
++# Author : Peter de Rivaz
++# ******************************************************************************
++
++# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack)
++# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions)
++.set USE_STACK, 0
++
++# Lines that fail to assemble start with #:
++# The script insert_magic_opcodes.sh inserts the machine code directly for these.
++# HEVC VPU Transform
++#
++# Transform matrix can be thought of as
++# output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++# (a b c d) (1 2 2 1)
++# (3 4 -4 -3)
++# (5 6 6 5)
++# (7 8 -8 -7)
++#
++# x=(a c)(1 2) = 1a+5c 2a+6c
++# (5 6)
++#
++# y=(b d)(3 4) = 3b+7d 4b+8d
++# (7 8)
++#
++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++# Final results are (u , v[::-1])
++#
++#
++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
++# Apply the even matrix first and stop before rounding
++# Then apply the odd matrix in a full manner:
++#
++# First step is to compute partial products with the first input (16 cycles)
++# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
++# 2a 4b 6c 8d
++# 2a -4b 6c -8d
++# 1a -3b 5c -7d
++#
++# Second step is to sum partial products into final position (8 cycles)
++# 1a+3b+5c+7d
++# 2a+4b+6c+8d
++# 2a-4b+6c-8d
++# 1a-3b+5c-7d
++#
++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
++#
++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
++#
++# For 8x8 we could compute two in parallel.
++#
++#
++
++# Columns are transformed first
++#
++# Store top left half of transMatrix2 in HX(32,0)
++# Store bottom left half of transMatrix2 in HX(32,32)
++#
++# For 16x16
++# HX(0:15,0) contains input data before transform
++# HY(0:15,0) contains 32bit output data after transform
++# HX(32,0) contains even rows of left half of transMatrix2
++# HX(32,32) contains odd rows of left half of transMatrix2
++# HY(48,0) contains partial products ready for summing
++#
++
++
++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++# coeffs32: address of the transform coefficients for the 32x32 transforms
++# num32: number of 32x32 transforms
++# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
++#
++
++.equ TRANS_SHIFT, 20 - BIT_DEPTH # BIT_DEPTH is not defined in this file; it must be supplied when assembling
++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) # Second-pass rounding constant: half of 1 << TRANS_SHIFT
++.equ TRANS_ASL2, 16 - TRANS_SHIFT # Second-pass left shift (TRANS_ASL2 + TRANS_SHIFT == 16)
++
++
++hevc_trans_16x16:
++ push r6-r15, lr # TODO cut down number of used registers
++ mov r14,r3 # coeffs32
++ mov r15,r4 # num32
++ mov r3, 16*2 # Stride of transMatrix2 in bytes
++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
++
++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ # Now use r0 to describe which matrix we are working on.
++ # Allows us to prefetch the next block of coefficients for efficiency.
++ mov r0,0 # This describes the location where we read our coefficients from
++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
++ mov r7,16*16*2 # Total block size
++ mov r8,64*16 # Value used to swap from current to next VRF location
++ mov r4,64 # Constant used for rounding first pass
++ mov r5,TRANS_RND2 # Constant used for rounding second pass
++
++ sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack
++
++ add r11,sp,64 # Space for 32 bytes before, and rounding
++ lsr r11,5
++ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32
++
++ lsr r10, r2, 16 # Number of compressed blocks stored in top short
++ extu r2,16
++ # At start of block r0,r1 point to the current block (that has already been loaded)
++ # r0 VRF location of current block
++ # r1 address of current block
++ # r2 number of 16*16 transforms to do
++ # r3 Stride of coefficients (==32)
++ # r4 TRANS_RND1 (64)
++ # r5 TRANS_RND2
++ # r6 temporary used inside col_trans16
++ # r7 16*16*2 total bytes in block
++ # r8 64*16 VRF switch locations
++ # r9 temporary in unpack_coeff for index
++ # r10 number of 16x16 transforms using compression
++ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer)
++ # r12 temporary counter in unpack_coeff
++ # r13
++ # r14 Save information for 32 bit transform (coeffs location)
++ # r15 Save information for 32 bit transform (number of transforms)
++ cmp r2,0
++ beq done16x16s
++block_loop:
++ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests
++ cmp r10,0
++ mov r6, r1
++ beq not_compressed
++ sub r10, 1
++ bl unpack16x16
++not_compressed:
++ #mov r6,r1 # DEBUG without compress
++ vldh HX(0++,0)+r0,(r6 += r3) REP 16
++ #eor r0,r8
++ #add r1,r7
++ # Prefetch the next block
++ #bl unpack16x16
++ #vldh HX(0++,0)+r0,(r6 += r3) REP 16
++ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG
++ #eor r0,r8
++ #sub r1,r7
++
++ # Transform the current block
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
++
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Second pass: add on the rounding constant TRANS_RND2 (held in r5)
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
++
++ # Save results - note there has been a transposition during the processing so we save columns
++ vsth VX(0,32++)+r0, (r1 += r3) REP 16
++
++ # Move onto next block
++ eor r0,r8
++ add r1,r7
++
++ addcmpbgt r2,-1,0,block_loop
++done16x16s:
++
++ add sp,sp,64+16*16*2 # Restore stack pointer (undoes the sub made before block_loop)
++ # Now go and do any 32x32 transforms
++ b hevc_trans_32x32
++
++ pop r6-r15, pc # NOTE(review): appears unreachable after the branch above; hevc_trans_32x32 ends with the same pop
++# This returns a value in r6 that says where to load the data from.
++# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it.
++unpack16x16:
++# Clear out destination
++ vmov HX(0,0)+r0,0
++ mov r6, r11
++ vsth HX(0,0)+r0,(r6 += r3) REP 16
++ mov r5, r1 # Moving pointer to input coefficients (r5 is reloaded with TRANS_RND2 at done_unpack)
++unpack_outer_loop:
++ # Loop until we find the end
++ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous?
++ sub r6,r11,32
++ #add r6,pc,packed_data-$ # Packed data
++ vsth HX(0,0)+r0,(r6) # Store into packed data
++ mov r12,0
++unpack_loop:
++ ld r4,(r6)
++ add r6,r6,4
++ lsr r9,r4,16 # r9 is destination value
++ cmp r4,0 # {value,index}; the beq below is meant to act on this compare, so an all-zero word ends the list
++ extu r4,8
++ beq done_unpack
++ sth r9,(r11, r4) # Store the value into the unpacked buffer at index r4
++ addcmpblt r12,1,8,unpack_loop # 8 words (16 shorts) per fetched line
++# # Read next 16
++ add r5,32
++ b unpack_outer_loop
++done_unpack:
++# # Set new load location
++ mov r6, r11
++ #add r6,pc,unpacked_data-$
++# # Restore constants
++ mov r4,64
++ mov r5,TRANS_RND2
++# pop r6-r15, pc
++ b lr
++
++# r1,r2,r3 r7,r8 should be preserved
++# HX(0++,0)+r0 is the block to be transformed
++# HX(32++,0) is the 16x16 matrix of transform coefficients (r6 is clobbered: it holds the loop limit)
++# Use HY(48,0) for intermediate results
++# r0 can be used, but should be returned to its original value at the end
++col_trans_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++# Same loop as col_trans_16, but multiplies by the matrix at VX(32,32++) (the odd 16x16 matrix)
++col_trans_odd_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_odd_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_odd_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++
++# r1/r10 input pointer
++# r0,r4,r5,r6 free
++# r8/r9 output storage (r8 is clobbered below: it is used as the word counter)
++#
++# Store packed coefficients at r9-32
++# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows)
++unpack32x32:
++# Clear out destination
++ vmov HX(0,0),0
++ add r0, r9, 32*32*2 # Unpacked buffer
++ mov r4, 32
++ vsth HX(0,0),(r0 += r4) REP 64 # 64 stores at a 32-byte stride = 32*32*2 bytes cleared
++unpack_outer_loop32:
++ # Loop until we find the end
++ vldh HX(0,0),(r1) # TODO would prefetch help here while unpacking previous?
++ sub r6,r9,32
++ #add r6,pc,packed_data-$ # Packed data
++ vsth HX(0,0),(r6) # Store into packed data
++ mov r8,0
++unpack_loop32:
++ ld r4,(r6)
++ add r6,r6,4
++ lsr r5,r4,16 # r5 is destination value
++ cmp r4,0 # {value,index}
++ extu r4,10 # 10-bit index (32*32 = 1024 entries)
++ beq done_unpack # NOTE(review): targets the 16x16 routine's done_unpack (which also writes r6, r4, r5 before returning), not done_unpack32 - confirm intended
++ sth r5,(r0, r4)
++ addcmpblt r8,1,8,unpack_loop32
++# # Read next 16
++ add r1,32
++ b unpack_outer_loop32
++done_unpack32: # NOTE(review): no branch in this file targets this label
++ b lr
++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 32x32 transforms to be done in low 16, number of packed in high 16
++# Reached by branch from hevc_trans_16x16 with coeffs in r14 and num in r15; returns by popping the registers pushed there
++# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first!
++hevc_trans_32x32:
++ mov r1,r14 # coeffs
++ mov r2,r15 # num
++ lsr r15,r15,16 # Number that are packed
++ extu r2,16 # Total number
++
++ # Fetch odd transform matrix
++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
++ #add r0, 16*16*2
++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
++ mov r7, 16*16*2 # Total block size
++
++.if USE_STACK
++ # Stack base allocation
++ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking
++ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it
++ add r8,sp,63
++ lsr r8,5
++ lsl r8,5
++.else
++#:version r8
++ .half 0x00e8 #AUTOINSERTED
++ btst r8,16
++#:add r8,pc,intermediate_results-$
++ .half 0xbfe8
++ .half intermediate_results-($-2)
++ beq on_vpu1
++ add r8,r8,32*32*2*2+16*2 # Move to secondary storage
++on_vpu1:
++.endif
++ mov r9,r8 # Backup of the temporary storage
++ mov r10,r1 # Backup of the coefficient buffer
++
++ cmp r2,0
++ beq done32x32s
++block_loop32:
++
++ # Transform the first 16 columns
++ mov r1,r10 # Input Coefficient buffer
++ mov r8,r9 # Output temporary storage
++ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed)
++ cmp r2,r15
++ bgt not_compressed_32
++ bl unpack32x32
++ add r1,r9,32*32*2 # Uncompressed into temporary storage
++ mov r8,r9 # Transform into here
++not_compressed_32:
++ # COLUMN TRANSFORM
++ mov r4, 64 # Constant used for rounding first pass
++ mov r5, 9 # left shift used for rounding first pass
++
++ bl trans32
++ # Transform the second 16 columns
++ add r8,32*16*2
++ add r1,32
++ bl trans32
++
++ # ROW TRANSFORM
++ mov r4, TRANS_RND2 # Constant used for rounding second pass
++ mov r5, TRANS_ASL2 # left shift used for rounding second pass
++
++ mov r1,r9 # Input temporary storage
++ mov r8,r10 # Output Coefficient buffer
++ bl trans32
++ # Transform the second 16 columns
++ add r8,32*16*2
++ add r1,32
++ bl trans32
++
++ add r10, 32*32*2 # move onto next block of coefficients
++ addcmpbgt r2,-1,0,block_loop32
++done32x32s:
++
++.if USE_STACK
++ add sp,sp,32*32*4+64# Restore stack
++.endif
++
++ pop r6-r15, pc
++# trans32: one 16-column pass: loads rows from r1 (stride r3), adds rounding r4, shifts left by r5, stores to r8
++trans32:
++ push lr
++ # We can no longer afford the VRF space to do prefetching when doing 32x32
++ # Fetch the even rows
++ vldh HX(0++,0),(r1 += r3) REP 16
++ # Fetch the odd rows
++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
++
++ # Transform the even rows using even matrix
++ mov r0, 0 # Even rows
++ bl col_trans_16
++
++ # Now transform the odd rows using odd matrix
++ mov r0, 64*16 # Odd rows
++ bl col_trans_odd_16
++
++ # Now apply butterfly to compute the first 16 results
++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
++ # 16bit results now in HX(48,32)
++ mov r0,r8
++ mov r6,32*2
++ vsth VX(48,32++),(r0+=r6) REP 16
++
++ # Now apply butterfly to compute the second 16 results (in reverse order)
++ vsub HY(63,0),HY(0 ,0),HY(16,0)
++ vsub HY(62,0),HY(1 ,0),HY(17,0)
++ vsub HY(61,0),HY(2 ,0),HY(18,0)
++ vsub HY(60,0),HY(3 ,0),HY(19,0)
++ vsub HY(59,0),HY(4 ,0),HY(20,0)
++ vsub HY(58,0),HY(5 ,0),HY(21,0)
++ vsub HY(57,0),HY(6 ,0),HY(22,0)
++ vsub HY(56,0),HY(7 ,0),HY(23,0)
++ vsub HY(55,0),HY(8 ,0),HY(24,0)
++ vsub HY(54,0),HY(9 ,0),HY(25,0)
++ vsub HY(53,0),HY(10,0),HY(26,0)
++ vsub HY(52,0),HY(11,0),HY(27,0)
++ vsub HY(51,0),HY(12,0),HY(28,0)
++ vsub HY(50,0),HY(13,0),HY(29,0)
++ vsub HY(49,0),HY(14,0),HY(30,0)
++ vsub HY(48,0),HY(15,0),HY(31,0)
++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate
++ add r0,r8,32 # second 16 results start 32 bytes (16 shorts) into each output row
++ vsth VX(48,32++),(r0+=r6) REP 16
++ pop pc
++
++.if USE_STACK == 0
++ .balign 32
++
++# .space directives generate 0's in the bin so avoid unnecessary padding by
++# just setting to appropriate value
++.equ intermediate_results, $+16*2 # skips the 16*2-byte packed_buffer slot shown in the layout below
++
++# Layout goes:
++#
++#packed_buffer:
++# .space 16*2
++#intermediate_results:
++# .space 32*32*2
++#unpacked_buffer:
++# .space 32*32*2
++#
++#packed_buffer2:
++# .space 16*2
++#intermediate_results2:
++# .space 32*32*2
++#unpacked_buffer2:
++# .space 32*32*2
++.endif
++
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform10.h
+@@ -0,0 +1,94 @@
++static const unsigned char rpi_hevc_transform10 [] = {
++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030
++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090
++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
++0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
++0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8
++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
++};
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform8.h
+@@ -0,0 +1,94 @@
++static const unsigned char rpi_hevc_transform8 [] = {
++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030
++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090
++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
++0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
++0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8
++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
++};
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.c
+@@ -0,0 +1,6134 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Wassim Hamidouche
++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/common.h"
++#include "libavutil/display.h"
++#include "libavutil/internal.h"
++#include "libavutil/mastering_display_metadata.h"
++#include "libavutil/md5.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/stereo3d.h"
++
++#include "decode.h"
++#include "bswapdsp.h"
++#include "bytestream.h"
++#include "golomb.h"
++#include "hevc.h"
++#include "rpi_hevc_data.h"
++#include "rpi_hevc_parse.h"
++#include "rpi_hevcdec.h"
++#include "rpi_hevc_cabac_fns.h"
++#include "profiles.h"
++#include "hwconfig.h"
++
++#include "rpi_zc_frames.h"
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++#include "rpi_zc.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "pthread.h"
++#include <stdatomic.h>
++
++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards
++
++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
++
++#ifndef av_mod_uintp2
++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
++{ // Local fallback for av_mod_uintp2: returns the low p bits of a (p must be less than the width of unsigned)
++ return a & ((1U << p) - 1); // unsigned literal: shifting a signed 1 into the sign bit (p == 31) is undefined
++}
++# define av_mod_uintp2 av_mod_uintp2_c
++#endif
++
++const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first);
++
++#define MC_DUMMY_X (-32)
++#define MC_DUMMY_Y (-32)
++
++// UV & Y both have min 4x4 pred (no 2x2 chroma)
++// Allow for even spread +1 for setup, +1 for rounding
++// As we have load sharing this can (in theory) be exceeded so we have to
++// check after each CTU, but it is a good base size
++
++// Worst case (all 4x4) commands per CTU
++#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
++#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
++
++#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64)
++
++#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP)
++#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS)
++
++#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2)
++#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2)
++
++// Total cmds to allocate - allow for slack & setup
++#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX)
++#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX)
++
++#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2))
++#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2))
++
++// The QPU code for UV blocks only works up to a block width of 8
++#define RPI_CHROMA_BLOCK_WIDTH 8
++
++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
++
++
++// Actual filter goes -ve, +ve, +ve, -ve using these values
++static const uint32_t rpi_filter_coefs[8] = {
++ ENCODE_COEFFS( 0, 64, 0, 0),
++ ENCODE_COEFFS( 2, 58, 10, 2),
++ ENCODE_COEFFS( 4, 54, 16, 2),
++ ENCODE_COEFFS( 6, 46, 28, 4),
++ ENCODE_COEFFS( 4, 36, 36, 4),
++ ENCODE_COEFFS( 4, 28, 46, 6),
++ ENCODE_COEFFS( 2, 16, 54, 4),
++ ENCODE_COEFFS( 2, 10, 58, 2)
++};
++
++// Function arrays by QPU
++
++static const int * const inter_pred_setup_c_qpu[12] = {
++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
++};
++
++static const int * const inter_pred_setup_c10_qpu[12] = {
++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
++};
++
++static const int * const inter_pred_setup_y_qpu[12] = {
++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
++};
++
++static const int * const inter_pred_setup_y10_qpu[12] = {
++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
++};
++
++static const int * const inter_pred_sync_qpu[12] = {
++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
++};
++
++static const int * const inter_pred_sync10_qpu[12] = {
++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
++};
++
++static const int * const inter_pred_exit_c_qpu[12] = {
++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
++};
++
++static const int * const inter_pred_exit_c10_qpu[12] = {
++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
++};
++
++static const int * const inter_pred_exit_y_qpu[12] = {
++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
++};
++
++static const int * const inter_pred_exit_y10_qpu[12] = {
++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
++};
++
++typedef struct ipe_chan_info_s
++{
++ const uint8_t bit_depth;
++ const uint8_t n;
++ const int * const * setup_fns;
++ const int * const * sync_fns;
++ const int * const * exit_fns;
++} ipe_chan_info_t;
++
++typedef struct ipe_init_info_s
++{
++ ipe_chan_info_t luma;
++ ipe_chan_info_t chroma;
++} ipe_init_info_t;
++
++static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a)
++{
++ switch (ln)
++ {
++ default: // normally 0
++ *b = a;
++ break;
++ case 1:
++ a |= a << 8;
++ *(uint16_t *)b = a;
++ b += stride;
++ *(uint16_t *)b = a;
++ break;
++ case 2:
++ a |= a << 8;
++ a |= a << 16;
++ *(uint32_t *)b = a;
++ b += stride;
++ *(uint32_t *)b = a;
++ b += stride;
++ *(uint32_t *)b = a;
++ b += stride;
++ *(uint32_t *)b = a;
++ break;
++ case 3:
++ {
++ unsigned int i;
++ uint64_t d;
++ a |= a << 8;
++ a |= a << 16;
++ d = ((uint64_t)a << 32) | a;
++ for (i = 0; i != 8; ++i, b += stride)
++ *(uint64_t *)b = d;
++ break;
++ }
++ case 4:
++ {
++ unsigned int i;
++ uint64_t d;
++ a |= a << 8;
++ a |= a << 16;
++ d = ((uint64_t)a << 32) | a;
++ for (i = 0; i != 16; ++i, b += stride)
++ {
++ *(uint64_t *)b = d;
++ *(uint64_t *)(b + 8) = d;
++ }
++ break;
++ }
++ }
++}
++
++// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3
++// (4 not required)
++static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
++{
++ switch (ln)
++ {
++ default: // 0 or -1
++ *b_u = a;
++ *b_l = a;
++ break;
++ case 1:
++ a |= a << 8;
++ *(uint16_t *)b_u = a;
++ *(uint16_t *)b_l = a;
++ break;
++ case 2:
++ a |= a << 8;
++ a |= a << 16;
++ *(uint32_t *)b_u = a;
++ *(uint32_t *)b_l = a;
++ break;
++ case 3:
++ a |= a << 8;
++ a |= a << 16;
++ *(uint32_t *)b_u = a;
++ *(uint32_t *)(b_u + 4) = a;
++ *(uint32_t *)b_l = a;
++ *(uint32_t *)(b_l + 4) = a;
++ break;
++ case 4:
++ a |= a << 8;
++ a |= a << 16;
++ *(uint32_t *)b_u = a;
++ *(uint32_t *)(b_u + 4) = a;
++ *(uint32_t *)(b_u + 8) = a;
++ *(uint32_t *)(b_u + 12) = a;
++ *(uint32_t *)b_l = a;
++ *(uint32_t *)(b_l + 4) = a;
++ *(uint32_t *)(b_l + 8) = a;
++ *(uint32_t *)(b_l + 12) = a;
++ break;
++ }
++}
++
++static void zap_cabac_stash(uint8_t * b, const int ln)
++{
++ switch (ln)
++ {
++ default: // 0
++ *b = 0;
++ break;
++ case 1:
++ *(uint16_t *)b = 0;
++ break;
++ case 2:
++ *(uint32_t *)b = 0;
++ break;
++ case 3:
++ *(uint32_t *)b = 0;
++ *(uint32_t *)(b + 4) = 0;
++ break;
++ }
++}
++
++
++
++// Set a small square block of bits in a bitmap
++// Bits must be aligned on their size boundary (which will be true of all split CBs)
++static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln)
++{
++ unsigned int n;
++ const unsigned int sh = (x & 7);
++
++ f += (x >> 3);
++
++ av_assert2(ln <= 3);
++ av_assert2((x & ((1 << ln) - 1)) == 0);
++
++ switch (ln)
++ {
++ default: // 1
++ f[0] |= 1 << sh;
++ break;
++ case 1: // 3 * 2
++ n = 3 << sh;
++ f[0] |= n;
++ f[stride] |= n;
++ break;
++ case 2: // 0xf * 4
++ n = 0xf << sh;
++ f[0] |= n;
++ f[stride] |= n;
++ f[stride * 2] |= n;
++ f[stride * 3] |= n;
++ break;
++ case 3: // 0xff * 8
++ for (n = 0; n != 8; ++n, f += stride)
++ *f = 0xff;
++ break;
++ }
++}
++
++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
++ { // 8
++ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
++ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
++ },
++ { // 9
++ .luma = {0},
++ .chroma = {0}
++ },
++ { // 10
++ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
++ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
++ }
++
++};
++
++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
++{
++ const unsigned int n = ici->n;
++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word
++
++ ipe->n = n;
++ ipe->max_fill = q1_size - ipe->min_gap;
++ for(unsigned int i = 0; i < n; i++) {
++ HEVCRpiInterPredQ * const q = ipe->q + i;
++ q->qpu_mc_curr = q->qpu_mc_base =
++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
++ q->code_setup = qpu_fn(ici->setup_fns[i]);
++ q->code_sync = qpu_fn(ici->sync_fns[i]);
++ q->code_exit = qpu_fn(ici->exit_fns[i]);
++ }
++}
++
++static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
++{
++ av_assert0(bit_depth >= 8 && bit_depth <= 16);
++
++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
++}
++
++// Unsigned Trivial MOD: one conditional subtract, so the result is only in [0, n) when x < 2n
++static inline unsigned int utmod(const unsigned int x, const unsigned int n)
++{
++ return x < n ? x : x - n;
++}
++
++// returns pq->job_n++
++static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
++{
++ unsigned int const x2 = pq->job_n;
++ pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS);
++ return x2;
++}
++
++static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
++{
++ pq->terminate = 0;
++ pq->job_n = 0;
++ pq->context = s;
++ pq->worker = worker;
++ pq->psem_out = psem_out;
++ pq->pass_n = n;
++ pq->started = 0;
++ sem_init(&pq->sem_in, 0, 0);
++}
++
++static void pass_queue_kill(HEVCRpiPassQueue * const pq)
++{
++ sem_destroy(&pq->sem_in);
++}
++
++static inline void rpi_sem_wait(sem_t * const sem)
++{
++ // Keep retrying while sem_wait() reports failure; the only failure tolerated
++ // is an interrupted call (EINTR) - anything else trips the assert
++ while (sem_wait(sem))
++ av_assert0(errno == EINTR);
++}
++
++static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
++{
++ sem_post(&pq->sem_in);
++}
++
++static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ // Do the various passes - common with the worker code
++ for (unsigned int i = 0; i != RPI_PASSES; ++i) {
++ s->passq[i].worker(s, jb);
++ }
++}
++
++
++#if 0
++static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
++{
++ int x;
++ sem_getvalue((sem_t *)&jbc->sem_out, &x);
++ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
++}
++#endif
++
++
++static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJob * jb;
++ HEVCRpiJobGlobal * const jbg = jbc->jbg;
++
++ pthread_mutex_lock(&jbg->lock);
++ // Check local 1st
++ if ((jb = jbc->jb1) != NULL)
++ {
++ // Only 1 - very easy :-)
++ jbc->jb1 = NULL;
++ }
++ else
++ {
++ // Now look for global free chain
++ if ((jb = jbg->free1) != NULL)
++ {
++ // Found one - unlink it
++ jbg->free1 = jb->next;
++ jb->next = NULL;
++ }
++ else
++ {
++ // Out of places to look - wait for one to become free - add to Qs
++
++ // Global
++ // If "good" lc then add after the last "good" el in the chain
++ // otherwise add to the tail
++ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
++ {
++ // Add to end as we had to wait last time or wait Q empty
++ if ((lc->jw_prev = jbg->wait_tail) == NULL)
++ jbg->wait_head = lc;
++ else
++ lc->jw_prev->jw_next = lc;
++ lc->jw_next = NULL;
++ jbg->wait_tail = lc;
++ }
++ else
++ {
++ // This is a "good" lc that we need to poke into the middle
++ // of the Q
++ // We know that the Q isn't empty and there is at least one
++ // !last_progress_good el in it from the previous test
++
++ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
++
++ if (p == NULL)
++ {
++ // No current good els - add to head
++ lc->jw_next = jbg->wait_head;
++ jbg->wait_head = lc;
++ }
++ else
++ {
++ lc->jw_next = p->jw_next;
++ p->jw_next = lc;
++ }
++
++ lc->jw_next->jw_prev = lc;
++ lc->jw_prev = p;
++ }
++
++ // If "good" then we are now the last good waiting el
++ if (lc->last_progress_good)
++ jbg->wait_good = lc;
++
++ // Local
++ if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
++ jbc->lcw_head = lc;
++ else
++ lc->ljw_prev->ljw_next = lc;
++ lc->ljw_next = NULL;
++ jbc->lcw_tail = lc;
++ }
++ }
++
++ pthread_mutex_unlock(&jbg->lock);
++
++ if (jb == NULL) // Need to wait
++ {
++ rpi_sem_wait(&lc->jw_sem);
++ jb = lc->jw_job; // Set by free code
++ }
++
++ return jb;
++}
++
++
++static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
++{
++ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock
++ HEVCRpiJobCtl * jbc = jb->jbc_local;
++ HEVCRpiLocalContext * lc = NULL;
++
++ pthread_mutex_lock(&jbg->lock);
++
++ if (jbc != NULL)
++ {
++ av_assert1(jbc->jb1 == NULL);
++
++ // Release to Local if nothing waiting there
++ if ((lc = jbc->lcw_head) == NULL)
++ jbc->jb1 = jb;
++ }
++ else
++ {
++ // Release to global if nothing waiting there
++ if ((lc = jbg->wait_head) == NULL)
++ {
++ jb->next = jbg->free1;
++ jbg->free1 = jb;
++ }
++ else
++ {
++ // ? seems somehow mildly ugly...
++ jbc = lc->context->jbc;
++ }
++ }
++
++ if (lc != NULL)
++ {
++ // Something was waiting
++
++ // Unlink
++ // Global
++ if (lc->jw_next == NULL)
++ jbg->wait_tail = lc->jw_prev;
++ else
++ lc->jw_next->jw_prev = lc->jw_prev;
++
++ if (lc->jw_prev == NULL)
++ jbg->wait_head = lc->jw_next;
++ else
++ lc->jw_prev->jw_next = lc->jw_next;
++
++ // Local
++ if (lc->ljw_next == NULL)
++ jbc->lcw_tail = lc->ljw_prev;
++ else
++ lc->ljw_next->ljw_prev = lc->ljw_prev;
++
++ if (lc->ljw_prev == NULL)
++ jbc->lcw_head = lc->ljw_next;
++ else
++ lc->ljw_prev->ljw_next = lc->ljw_next;
++
++ // Update good if required
++ if (jbg->wait_good == lc)
++ jbg->wait_good = lc->jw_prev;
++
++ // Prod
++ lc->jw_job = jb;
++ sem_post(&lc->jw_sem);
++ }
++
++ pthread_mutex_unlock(&jbg->lock);
++}
++
++static void job_lc_kill(HEVCRpiLocalContext * const lc)
++{
++ sem_destroy(&lc->jw_sem);
++}
++
++static void job_lc_init(HEVCRpiLocalContext * const lc)
++{
++ lc->jw_next = NULL;
++ lc->jw_prev = NULL;
++ lc->ljw_next = NULL;
++ lc->ljw_prev = NULL;
++ lc->jw_job = NULL;
++ sem_init(&lc->jw_sem, 0, 0);
++}
++
++// Returns:
++// 0 if we have waited for MV or expect to wait for recon
++// 1 if we haven't waited for MV & do not need to wait for recon
++static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
++{
++ if (jb->waited) // reset by rpi_begin
++ return 0;
++ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
++ {
++ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
++ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])
++ return 0;
++ }
++ return 1;
++}
++
++// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
++static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl *const jbc = s->jbc;
++ HEVCRpiJob * const jb = lc->jb0;
++
++ av_assert1(jb != NULL);
++
++ if (jb->ctu_ts_last < 0) {
++ return;
++ }
++
++ lc->last_progress_good = progress_good(s, jb);
++ jb->waited = !lc->last_progress_good;
++ lc->jb0 = NULL;
++
++ if (s->offload_recon)
++ {
++ pthread_mutex_lock(&jbc->in_lock);
++ jbc->offloadq[jbc->offload_in] = jb;
++ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
++ pthread_mutex_unlock(&jbc->in_lock);
++
++ pass_queue_submit_job(s->passq + 0); // Consumes job eventually
++ }
++ else
++ {
++ pass_queue_do_all(s, jb); // Consumes job before return
++ }
++}
++
++
++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
++// available to receive the next job.
++//
++// Now safe against multiple callers - needed for tiles
++// "normal" and WPP will only call here one at a time
++static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl * const jbc = s->jbc;
++
++ // It is legit for us to already have a job allocated - do nothing in this case
++ if (lc->jb0 != NULL)
++ return;
++
++ if (s->offload_recon)
++ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much
++
++ lc->jb0 = job_alloc(jbc, lc);
++
++ rpi_begin(s, lc->jb0, lc->ts);
++}
++
++// Free up a job without submission
++static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl * const jbc = s->jbc;
++ HEVCRpiJob * const jb = lc->jb0;
++
++ if (jb == NULL) {
++ return;
++ }
++
++ lc->jb0 = NULL;
++
++ job_free(jbc, jb);
++
++ // If offload then poke sem_out too
++ if (s->offload_recon) {
++ sem_post(&jbc->sem_out);
++ }
++}
++
++
++// Call this to wait for all jobs to have completed at the end of a frame
++// Slightly icky as there is no clean way to wait for a sem to count up
++// Not reentrant - call on main thread only
++static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl * const jbc = s->jbc;
++ int i = 0;
++
++ // We shouldn't reach here with an unsubmitted job
++ av_assert1(lc->jb0 == NULL);
++
++ // If no offload then there can't be anything to wait for
++ if (!s->offload_recon) {
++ return;
++ }
++
++ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS)
++ {
++ for (i = 0; i != RPI_MAX_JOBS; ++i) {
++ rpi_sem_wait(&jbc->sem_out);
++ }
++ for (i = 0; i != RPI_MAX_JOBS; ++i) {
++ sem_post(&jbc->sem_out);
++ }
++ }
++}
++
++static void * pass_worker(void *arg)
++{
++ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg;
++ HEVCRpiContext *const s = pq->context;
++
++ for (;;)
++ {
++ rpi_sem_wait(&pq->sem_in);
++
++ if (pq->terminate)
++ break;
++
++ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]);
++ // * should really set jb->passes_done here
++
++ sem_post(pq->psem_out);
++ }
++ return NULL;
++}
++
++static void pass_queues_start_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0);
++ pqs[i].started = 1;
++ }
++}
++
++static void pass_queues_term_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ pqs[i].terminate = 1;
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ if (pqs[i].started)
++ sem_post(&pqs[i].sem_in);
++ }
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ if (pqs[i].started) {
++ pthread_join(pqs[i].thread, NULL);
++ pqs[i].started = 0;
++ }
++ }
++}
++
++static void pass_queues_kill_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ pass_queue_kill(pqs + i);
++}
++
++
++static void worker_pic_free_one(HEVCRpiJob * const jb)
++{
++ // Free coeff stuff - allocation not the same for all buffers
++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++
++ if (cf->s[0].buf != NULL)
++ av_freep(&cf->mptr);
++ if (cf->s[2].buf != NULL)
++ gpu_free(&cf->gptr);
++ memset(cf, 0, sizeof(*cf));
++}
++
++static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count)
++{
++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++
++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
++ goto fail;
++ cf->s[2].buf = (int16_t *)cf->gptr.arm;
++ cf->s[3].buf = cf->s[2].buf + coeff_count;
++
++ // Must be 64 byte aligned for our zero zapping code so over-allocate &
++ // round
++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
++ goto fail;
++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
++ return 0;
++
++fail:
++ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
++ worker_pic_free_one(jb);
++ return -1;
++}
++
++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
++{
++ unsigned int i;
++ for (i = 0; i != 4; ++i) {
++ cf->s[i].n = 0;
++#if RPI_COMPRESS_COEFFS
++ cf->s[i].packed = 1;
++ cf->s[i].packed_n = 0;
++#endif
++ }
++}
++
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
++{
++ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
++ cfe->n += n;
++ return coeffs;
++}
++
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCRpiFrame * const ref, const int val, const int field)
++{
++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
++ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
++ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
++ sem_t * sem = NULL;
++
++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++ if (((volatile int *)ref->tf.progress->data)[field] < val) {
++ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
++
++ av_assert1(pwait->req == -1 && pwait->next == NULL);
++ jb->waited = 1; // Remember that we had to wait for later scheduling
++
++ pwait->req = val;
++ pwait->next = NULL;
++ if (pstate->first == NULL)
++ pstate->first = pwait;
++ else
++ pstate->last->next = pwait;
++ pstate->last = pwait;
++ sem = &pwait->sem;
++ }
++ pthread_mutex_unlock(&pstate->lock);
++
++ if (sem != NULL) {
++ rpi_sem_wait(sem);
++ }
++ }
++}
++
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
++{
++ HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
++
++ ((int *)s->ref->tf.progress->data)[field] = val;
++
++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++ {
++ HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
++ HEVCRpiFrameProgressWait * pwait;
++
++ while ((pwait = *ppwait) != NULL) {
++ if (pwait->req > val)
++ {
++ ppwait = &pwait->next;
++ pstate->last = pwait;
++ }
++ else
++ {
++ *ppwait = pwait->next;
++ pwait->req = -1;
++ pwait->next = NULL;
++ sem_post(&pwait->sem);
++ }
++ }
++ }
++ pthread_mutex_unlock(&pstate->lock);
++}
++
++static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
++{
++ pstate->first = NULL;
++ pstate->last = NULL;
++ pthread_mutex_init(&pstate->lock, NULL);
++}
++
++static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++ pwait->req = -1;
++ pwait->next = NULL;
++ sem_init(&pwait->sem, 0, 0);
++}
++
++static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
++{
++ av_assert1(pstate->first == NULL);
++ pthread_mutex_destroy(&pstate->lock);
++}
++
++static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++ sem_destroy(&pwait->sem);
++}
++
++
++/**
++ * NOTE: Each function hls_foo correspond to the function foo in the
++ * specification (HLS stands for High Level Syntax).
++ */
++
++/**
++ * Section 5.7
++ */
++
++// Realloc the entry point arrays
++static int alloc_entry_points(RpiSliceHeader * const sh, const int n)
++{
++ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0)
++ {
++ // Round up alloc to multiple of 32
++ int a = (n + 31) & ~31;
++
++ // We don't care about the previous contents so probably fastest to simply discard
++ av_freep(&sh->entry_point_offset);
++ av_freep(&sh->offset);
++ av_freep(&sh->size);
++
++ if (a != 0)
++ {
++ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned));
++ sh->offset = av_malloc_array(a, sizeof(int));
++ sh->size = av_malloc_array(a, sizeof(int));
++
++ if (!sh->entry_point_offset || !sh->offset || !sh->size) {
++ sh->num_entry_point_offsets = 0;
++ sh->offsets_allocated = 0;
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ sh->offsets_allocated = a;
++ }
++
++ return 0;
++}
++
++/* free everything allocated by pic_arrays_init() */
++static void pic_arrays_free(HEVCRpiContext *s)
++{
++ av_freep(&s->sao);
++ av_freep(&s->deblock);
++
++ av_freep(&s->cabac_stash_up);
++ s->cabac_stash_left = NULL; // freed with _up
++
++ av_freep(&s->mvf_up);
++ av_freep(&s->mvf_left);
++
++ av_freep(&s->is_pcm);
++ av_freep(&s->is_intra_store);
++ s->is_intra = NULL;
++ av_freep(&s->rpl_tab);
++ s->rpl_tab_size = 0;
++
++ av_freep(&s->qp_y_tab);
++ av_freep(&s->tab_slice_address);
++ av_freep(&s->filter_slice_edges);
++
++ av_freep(&s->bs_horizontal);
++ s->bs_vertical = NULL; // freed with H
++ av_freep(&s->bsf_stash_left);
++ av_freep(&s->bsf_stash_up);
++
++ av_freep(&s->rpl_up);
++ av_freep(&s->rpl_left);
++
++ alloc_entry_points(&s->sh, 0);
++
++ av_buffer_pool_uninit(&s->col_mvf_pool);
++}
++
++/* allocate arrays that depend on frame dimensions; returns 0, or AVERROR(ENOMEM) after pic_arrays_free() on any failure */
++static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps)
++{
++ const unsigned int log2_min_cb_size = sps->log2_min_cb_size;
++ const unsigned int width = sps->width;
++ const unsigned int height = sps->height;
++ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) *
++ ((height >> log2_min_cb_size) + 1);
++ const unsigned int ctb_count = sps->ctb_size;
++
++ {
++ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
++ unsigned int h = ((height + 15) & ~15);
++
++ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size
++ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols
++ }
++
++ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly
++ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock));
++ if (!s->sao || !s->deblock)
++ goto fail;
++
++ s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3));
++ if (s->cabac_stash_up == NULL)
++ goto fail;
++ s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3); // _left lives in the tail of the _up allocation
++
++ // Round width up to max ctb size
++ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
++ // * Only needed if we have H tiles
++ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
++
++ // We can overread by 1 line & one byte in deblock so alloc & zero
++ // We don't need to zero the extra @ start of frame as it will never be
++ // written
++ s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
++ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
++ if (s->mvf_up == NULL || s->mvf_left == NULL || s->is_pcm == NULL || s->is_intra_store == NULL) // mvf_* checked here too
++ goto fail;
++
++ s->filter_slice_edges = av_mallocz(ctb_count);
++ s->tab_slice_address = av_malloc_array(ctb_count,
++ sizeof(*s->tab_slice_address));
++ s->qp_y_tab = av_malloc_array(pic_size_in_cb,
++ sizeof(*s->qp_y_tab));
++ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
++ goto fail;
++
++ s->bs_horizontal = av_mallocz(s->bs_size * 2);
++ if (s->bs_horizontal == NULL)
++ goto fail;
++ s->bs_vertical = s->bs_horizontal + s->bs_size; // V strengths are the second half of the H allocation
++
++ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up));
++ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left));
++ if (s->rpl_left == NULL || s->rpl_up == NULL)
++ goto fail;
++
++ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL ||
++ (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL)
++ goto fail;
++
++ s->col_mvf_stride = (width + 15) >> 4;
++ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField),
++ av_buffer_allocz);
++ if (s->col_mvf_pool == NULL)
++ goto fail;
++
++ return 0;
++
++fail:
++ pic_arrays_free(s);
++ return AVERROR(ENOMEM);
++}
++
++/*
++ * Fill the slice header's weighted prediction tables with unit weights
++ * (1 << QPU_MC_DENOM) and zero offsets for every active L0 and L1 ref,
++ * and set both log2 weight denominators to 0.
++ */
++static void default_pred_weight_table(HEVCRpiContext * const s)
++{
++    RpiSliceHeader * const sh = &s->sh;
++    const unsigned int unit_weight = 1 << QPU_MC_DENOM;
++    unsigned int ref;
++    unsigned int c;
++
++    sh->luma_log2_weight_denom = 0;
++    sh->chroma_log2_weight_denom = 0;
++
++    // List 0
++    for (ref = 0; ref < sh->nb_refs[L0]; ref++) {
++        sh->luma_weight_l0[ref] = unit_weight;
++        sh->luma_offset_l0[ref] = 0;
++        for (c = 0; c < 2; c++)
++            sh->chroma_weight_l0[ref][c] = unit_weight;
++        for (c = 0; c < 2; c++)
++            sh->chroma_offset_l0[ref][c] = 0;
++    }
++
++    // List 1
++    for (ref = 0; ref < sh->nb_refs[L1]; ref++) {
++        sh->luma_weight_l1[ref] = unit_weight;
++        sh->luma_offset_l1[ref] = 0;
++        for (c = 0; c < 2; c++)
++            sh->chroma_weight_l1[ref][c] = unit_weight;
++        for (c = 0; c < 2; c++)
++            sh->chroma_offset_l1[ref][c] = 0;
++    }
++}
++
++// Parse the explicit prediction weights & offsets for one reference list.
++//
++// refs is the number of entries to parse; nothing is read when it is 0.
++// For every entry one value is written to luma_weight & luma_offset and two
++// values (one per chroma component) to chroma_weight & chroma_offset.
++// Weights are stored relative to a fixed denominator of QPU_MC_DENOM bits:
++// the coded delta is scaled up by (QPU_MC_DENOM - coded log2 denom) and
++// added to 1 << QPU_MC_DENOM.
++// s->sh.luma_log2_weight_denom & s->sh.chroma_log2_weight_denom must have
++// been set before this is called as they are read here.
++//
++// Returns 0 on success or AVERROR_INVALIDDATA if a coded weight or offset
++// is out of range.
++static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb,
++ const unsigned int refs,
++ int16_t * luma_weight, int16_t * luma_offset,
++ int16_t * chroma_weight, int16_t * chroma_offset)
++{
++ unsigned int luma_flags;
++ unsigned int chroma_flags;
++ unsigned int i;
++ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8);
++ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range;
++ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM;
++ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM;
++ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom);
++ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom);
++
++ if (refs == 0)
++ return 0;
++
++ // All per-ref presence flags are read up front as one refs-bit field each
++ // (chroma flags are not coded when ctx_cfmt(s) is 0)
++ luma_flags = get_bits(gb, refs);
++ chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs);
++ // Walk the flag fields from the most significant bit down - one bit per ref
++ i = 1 << (refs - 1);
++
++ do
++ {
++ if ((luma_flags & i) != 0)
++ {
++ const int delta_weight = get_se_golomb(gb);
++ const int offset = get_se_golomb(gb);
++ if (delta_weight < -128 || delta_weight > 127 ||
++ offset < -wp_offset_half_range || offset >= wp_offset_half_range)
++ {
++ return AVERROR_INVALIDDATA;
++ }
++ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift);
++ *luma_offset++ = offset << wp_offset_bd_shift;
++ }
++ else
++ {
++ // Flag clear: unit weight, no offset
++ *luma_weight++ = luma_weight_base;
++ *luma_offset++ = 0;
++ }
++
++ if ((chroma_flags & i) != 0)
++ {
++ unsigned int j;
++ // One (weight, offset) pair per chroma component
++ for (j = 0; j != 2; ++j)
++ {
++ const int delta_weight = get_se_golomb(gb);
++ const int delta_offset = get_se_golomb(gb);
++
++ if (delta_weight < -128 || delta_weight > 127 ||
++ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range)
++ {
++ return AVERROR_INVALIDDATA;
++ }
++
++ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift);
++ // The chroma offset is derived from the coded delta and the
++ // (unscaled) weight, then clipped to the legal offset range
++ *chroma_offset++ = av_clip(
++ wp_offset_half_range + delta_offset -
++ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom),
++ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift;
++ }
++ }
++ else
++ {
++ // Flag clear: unit weights, no offsets for both components
++ *chroma_weight++ = chroma_weight_base;
++ *chroma_weight++ = chroma_weight_base;
++ *chroma_offset++ = 0;
++ *chroma_offset++ = 0;
++ }
++ } while ((i >>= 1) != 0);
++
++ return 0;
++}
++
++/*
++ * Parse the slice header pred_weight_table: the luma & chroma log2 weight
++ * denominators followed by the explicit weights for L0 and then L1.
++ *
++ * Returns 0 on success, AVERROR_INVALIDDATA if a denominator is > 7, or
++ * the error from get_weights() if a weight / offset is rejected.
++ */
++static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb)
++{
++    RpiSliceHeader * const sh = &s->sh;
++    const unsigned int luma_denom = get_ue_golomb_long(gb);
++    unsigned int chroma_denom = 0;
++    int rv;
++
++    // The chroma denominator is coded as a delta from luma and only when
++    // chroma is present
++    if (ctx_cfmt(s) != 0)
++        chroma_denom = luma_denom + get_se_golomb(gb);
++
++    if (luma_denom > 7 || chroma_denom > 7)
++    {
++        av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n",
++               luma_denom, chroma_denom);
++        return AVERROR_INVALIDDATA;
++    }
++
++    // get_weights() reads these back from the slice header
++    sh->luma_log2_weight_denom = luma_denom;
++    sh->chroma_log2_weight_denom = chroma_denom;
++
++    rv = get_weights(s, gb, sh->nb_refs[L0],
++                     sh->luma_weight_l0, sh->luma_offset_l0,
++                     sh->chroma_weight_l0[0], sh->chroma_offset_l0[0]);
++    // L1 is only parsed if L0 was good
++    if (rv == 0)
++        rv = get_weights(s, gb, sh->nb_refs[L1],
++                         sh->luma_weight_l1, sh->luma_offset_l1,
++                         sh->chroma_weight_l1[0], sh->chroma_offset_l1[0]);
++
++    if (rv != 0)
++    {
++        av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n");
++        return rv;
++    }
++
++    return 0;
++}
++
++// Parse the long term reference picture set of a slice header into rps.
++//
++// rps->nb_refs is always reset; if the SPS has no long term ref pics then
++// nothing is read and 0 is returned. Otherwise the first nb_sps entries are
++// taken from the SPS candidate tables and the remaining nb_sh entries are
++// coded directly in the slice header.
++//
++// Returns 0 on success or AVERROR_INVALIDDATA if the entry counts are out of
++// range or a computed POC does not fit in 32 bits.
++static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb)
++{
++ const HEVCRpiSPS *sps = s->ps.sps;
++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb;
++ int prev_delta_msb = 0;
++ unsigned int nb_sps = 0, nb_sh;
++ int i;
++
++ rps->nb_refs = 0;
++ if (!sps->long_term_ref_pics_present_flag)
++ return 0;
++
++ // The count of SPS-sourced entries is only coded if the SPS has candidates
++ if (sps->num_long_term_ref_pics_sps > 0)
++ nb_sps = get_ue_golomb_long(gb);
++ nb_sh = get_ue_golomb_long(gb);
++
++ if (nb_sps > sps->num_long_term_ref_pics_sps)
++ return AVERROR_INVALIDDATA;
++ // Sum in 64 bits so that two large 32-bit counts cannot wrap past the check
++ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
++ return AVERROR_INVALIDDATA;
++
++ rps->nb_refs = nb_sh + nb_sps;
++
++ for (i = 0; i < rps->nb_refs; i++) {
++ uint8_t delta_poc_msb_present;
++
++ if (i < nb_sps) {
++ // Entry picked from the SPS tables; the index is only coded when
++ // there is more than one candidate
++ uint8_t lt_idx_sps = 0;
++
++ if (sps->num_long_term_ref_pics_sps > 1)
++ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
++
++ rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
++ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
++ } else {
++ // Entry coded directly in the slice header
++ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb);
++ rps->used[i] = get_bits1(gb);
++ }
++
++ delta_poc_msb_present = get_bits1(gb);
++ if (delta_poc_msb_present) {
++ int64_t delta = get_ue_golomb_long(gb);
++ int64_t poc;
++
++ // The delta accumulates on the previous one except for the first
++ // entry overall and the first slice-header-coded entry
++ if (i && i != nb_sps)
++ delta += prev_delta_msb;
++
++ // Computed in 64 bits; rejected below if it does not fit an int32
++ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
++ if (poc != (int32_t)poc)
++ return AVERROR_INVALIDDATA;
++ rps->poc[i] = poc;
++ prev_delta_msb = delta;
++ }
++ }
++
++ return 0;
++}
++
++/*
++ * Copy the stream level properties described by sps (and the VPS it refers
++ * to in ps) into the codec context: geometry, pixel format, reorder depth,
++ * profile / level, sample aspect ratio, colour description and frame rate.
++ */
++static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps,
++                                 const HEVCRpiSPS *sps)
++{
++    const HEVCRpiWindow * const win = &sps->output_window;
++    const HEVCRpiVPS * const vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data;
++    const int full_range = sps->vui.video_signal_type_present_flag &&
++                           sps->vui.video_full_range_flag;
++    const int have_colour_desc = sps->vui.colour_description_present_flag;
++    unsigned int units_in_tick = 0;
++    unsigned int time_scale = 0;
++
++    // Coded size is the SPS size; display size has the output window cropped off
++    avctx->pix_fmt = sps->pix_fmt;
++    avctx->coded_width = sps->width;
++    avctx->coded_height = sps->height;
++    avctx->width = sps->width - win->left_offset - win->right_offset;
++    avctx->height = sps->height - win->top_offset - win->bottom_offset;
++
++    // Reorder depth comes from the highest temporal sub-layer
++    avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics;
++    avctx->profile = sps->ptl.general_ptl.profile_idc;
++    avctx->level = sps->ptl.general_ptl.level_idc;
++
++    ff_set_sar(avctx, sps->vui.sar);
++
++    // Limited (MPEG) range unless the VUI explicitly signals full range
++    avctx->color_range = full_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
++
++    // Colour description is "unspecified" unless the VUI carries one
++    avctx->color_primaries = have_colour_desc ? sps->vui.colour_primaries : AVCOL_PRI_UNSPECIFIED;
++    avctx->color_trc = have_colour_desc ? sps->vui.transfer_characteristic : AVCOL_TRC_UNSPECIFIED;
++    avctx->colorspace = have_colour_desc ? sps->vui.matrix_coeffs : AVCOL_SPC_UNSPECIFIED;
++
++    // VPS timing takes precedence over VUI timing
++    if (vps->vps_timing_info_present_flag) {
++        units_in_tick = vps->vps_num_units_in_tick;
++        time_scale = vps->vps_time_scale;
++    } else if (sps->vui.vui_timing_info_present_flag) {
++        units_in_tick = sps->vui.vui_num_units_in_tick;
++        time_scale = sps->vui.vui_time_scale;
++    }
++
++    // No usable timing info - leave the frame rate alone
++    if (units_in_tick == 0 || time_scale == 0)
++        return;
++
++    // units_in_tick / time_scale is a tick duration so the frame rate is its
++    // inverse - hence den & num destinations are swapped
++    av_reduce(&avctx->framerate.den, &avctx->framerate.num,
++              units_in_tick, time_scale, 1 << 30);
++}
++
++/*
++ * Negotiate the output pixel format. Only the format given by the SPS is
++ * offered (no h/w formats); AV_PIX_FMT_NONE is returned directly if the SPS
++ * has no format.
++ */
++static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps)
++{
++    enum AVPixelFormat pix_fmts[4];
++
++    // Admit to no h/w formats - the list is just the SPS format + terminator
++    pix_fmts[0] = sps->pix_fmt;
++    pix_fmts[1] = AV_PIX_FMT_NONE;
++
++    if (pix_fmts[0] == AV_PIX_FMT_NONE)
++        return AV_PIX_FMT_NONE;
++
++    return ff_thread_get_format(s->avctx, pix_fmts);
++}
++
++/*
++ * Returns non-zero if this decoder can handle the stream described by sps:
++ * the pixel format must be a sand format and the picture must be no bigger
++ * than HEVC_RPI_MAX_WIDTH x HEVC_RPI_MAX_HEIGHT.
++ */
++static int is_sps_supported(const HEVCRpiSPS * const sps)
++{
++    if (!av_rpi_is_sand_format(sps->pix_fmt))
++        return 0;
++
++    if (sps->width > HEVC_RPI_MAX_WIDTH)
++        return 0;
++
++    return sps->height <= HEVC_RPI_MAX_HEIGHT;
++}
++
++/*
++ * Make sps the active SPS for the decoder context.
++ *
++ * The previous per-picture arrays are always freed and the active SPS / VPS
++ * cleared first, so passing sps == NULL simply deactivates the current SPS.
++ * For a non-NULL sps the per-picture arrays and SAO line buffers are
++ * allocated, the stream parameters are exported to the codec context
++ * (with avctx->pix_fmt then overridden by pix_fmt) and the prediction, DSP
++ * and QPU function tables are set up for the SPS bit depth.
++ *
++ * Returns 0 on success (including sps == NULL),
++ * AVERROR_DECODER_NOT_FOUND if the SPS describes a stream this decoder does
++ * not handle, or a negative AVERROR (ENOMEM) if an allocation fails. On
++ * failure no SPS is left active.
++ */
++static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
++                   const enum AVPixelFormat pix_fmt)
++{
++    int ret;
++
++    pic_arrays_free(s);
++    s->ps.sps = NULL;
++    s->ps.vps = NULL;
++
++    if (sps == NULL)
++        return 0;
++
++    if (!is_sps_supported(sps))
++        return AVERROR_DECODER_NOT_FOUND;
++
++    ret = pic_arrays_init(s, sps);
++    if (ret < 0)
++        goto fail;
++
++    export_stream_params(s->avctx, &s->ps, sps);
++
++    s->avctx->pix_fmt = pix_fmt;
++
++    ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth);
++    ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
++
++    // * We don't support cross_component_prediction_enabled_flag but as that
++    //   must be 0 unless we have 4:4:4 there is no point testing for it as we
++    //   only deal with sand which is never 4:4:4
++    //   [support wouldn't be hard]
++
++    rpi_hevc_qpu_set_fns(s, sps->bit_depth);
++
++    // Index 0 owns the allocation; [1] & [2] only point into it
++    av_freep(&s->sao_pixel_buffer_h[0]);
++    av_freep(&s->sao_pixel_buffer_v[0]);
++
++    if (sps->sao_enabled)
++    {
++        const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;
++        unsigned int c_idx;
++        size_t vsize[3] = {0};
++        size_t hsize[3] = {0};
++
++        for(c_idx = 0; c_idx < c_count; c_idx++) {
++            int w = sps->width >> ctx_hshift(s, c_idx);
++            int h = sps->height >> ctx_vshift(s, c_idx);
++            // ctb height & width are a min of 8 so this must a multiple of 16
++            // so no point rounding up!
++            hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
++            vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
++        }
++
++        // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
++        // when we have plaited chroma
++        s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
++        s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
++        if (s->sao_pixel_buffer_h[0] == NULL || s->sao_pixel_buffer_v[0] == NULL)
++        {
++            // Don't leave a half-allocated pair behind
++            av_freep(&s->sao_pixel_buffer_h[0]);
++            av_freep(&s->sao_pixel_buffer_v[0]);
++            ret = AVERROR(ENOMEM);
++            goto fail;
++        }
++        s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
++        s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
++        s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
++        s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
++    }
++
++    // Only publish the SPS / VPS once everything they need exists
++    s->ps.sps = sps;
++    s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++
++    return 0;
++
++fail:
++    pic_arrays_free(s);
++    s->ps.sps = NULL;
++    return ret;
++}
++
++/* Returns 1 if qp_offset lies in the inclusive range [-12, 12], else 0 */
++static inline int qp_offset_valid(const int qp_offset)
++{
++    return !(qp_offset < -12 || qp_offset > 12);
++}
++
++/*
++ * Parse a slice segment header from the local context's bit reader into
++ * s->sh and update the decoder state that depends on it (active PPS / SPS,
++ * POC, slice index, entry points, offload flags).
++ *
++ * If the PPS refers to an SPS other than the active one the refs are
++ * cleared, the new SPS is activated with set_sps() and the output format
++ * renegotiated.
++ *
++ * Returns 0 on success, in which case s->slice_initialized is set to 1.
++ * Returns a negative AVERROR (normally AVERROR_INVALIDDATA) if the header is
++ * malformed or an allocation / SPS activation fails.
++ */
++static int hls_slice_header(HEVCRpiContext * const s)
++{
++    GetBitContext * const gb = &s->HEVClc->gb;
++    RpiSliceHeader * const sh = &s->sh;
++    int i, ret;
++
++    // Coded parameters
++    sh->first_slice_in_pic_flag = get_bits1(gb);
++    if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra = INT_MAX;
++        if (IS_IDR(s))
++            ff_hevc_rpi_clear_refs(s);
++    }
++    sh->no_output_of_prior_pics_flag = 0;
++    if (IS_IRAP(s))
++        sh->no_output_of_prior_pics_flag = get_bits1(gb);
++
++    sh->pps_id = get_ue_golomb_long(gb);
++    if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
++        av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
++        return AVERROR_INVALIDDATA;
++    }
++    // All slices of a picture must use the same PPS
++    if (!sh->first_slice_in_pic_flag &&
++        s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
++        av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
++        return AVERROR_INVALIDDATA;
++    }
++    s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data;
++    if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
++        sh->no_output_of_prior_pics_flag = 1;
++
++    // SPS change: drop refs, rebuild the per-picture state and renegotiate
++    // the output format
++    if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
++        const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
++        const HEVCRpiSPS *last_sps = s->ps.sps;
++        enum AVPixelFormat pix_fmt;
++
++        if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
++            if (sps->width != last_sps->width || sps->height != last_sps->height ||
++                sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
++                last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
++                sh->no_output_of_prior_pics_flag = 0;
++        }
++        ff_hevc_rpi_clear_refs(s);
++
++        ret = set_sps(s, sps, sps->pix_fmt);
++        if (ret < 0)
++            return ret;
++
++        pix_fmt = get_format(s, sps);
++        if (pix_fmt < 0)
++            return pix_fmt;
++
++//        ret = set_sps(s, sps, pix_fmt);
++//        if (ret < 0)
++//            return ret;
++
++        s->avctx->pix_fmt = pix_fmt;
++
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra = INT_MAX;
++    }
++
++    sh->dependent_slice_segment_flag = 0;
++    if (!sh->first_slice_in_pic_flag) {
++        int slice_address_length;
++
++        if (s->ps.pps->dependent_slice_segments_enabled_flag)
++            sh->dependent_slice_segment_flag = get_bits1(gb);
++
++        slice_address_length = av_ceil_log2(s->ps.sps->ctb_size);
++        sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
++        if (sh->slice_segment_addr >= s->ps.sps->ctb_size) {
++            av_log(s->avctx, AV_LOG_ERROR,
++                   "Invalid slice segment address: %u.\n",
++                   sh->slice_segment_addr);
++            return AVERROR_INVALIDDATA;
++        }
++
++        if (!sh->dependent_slice_segment_flag) {
++            sh->slice_addr = sh->slice_segment_addr;
++            s->slice_idx++;
++        }
++    } else {
++        sh->slice_segment_addr = sh->slice_addr = 0;
++        s->slice_idx = 0;
++        s->slice_initialized = 0;
++    }
++
++    // Everything in this block is only coded for independent slice segments;
++    // dependent segments reuse the values already in sh
++    if (!sh->dependent_slice_segment_flag) {
++        s->slice_initialized = 0;
++
++        for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++)
++            skip_bits(gb, 1);  // slice_reserved_undetermined_flag[]
++
++        sh->slice_type = get_ue_golomb_long(gb);
++        if (!(sh->slice_type == HEVC_SLICE_I ||
++              sh->slice_type == HEVC_SLICE_P ||
++              sh->slice_type == HEVC_SLICE_B)) {
++            av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
++                   sh->slice_type);
++            return AVERROR_INVALIDDATA;
++        }
++        if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) {
++            av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n");
++            return AVERROR_INVALIDDATA;
++        }
++
++        // when flag is not present, picture is inferred to be output
++        sh->pic_output_flag = 1;
++        if (s->ps.pps->output_flag_present_flag)
++            sh->pic_output_flag = get_bits1(gb);
++
++        if (s->ps.sps->separate_colour_plane_flag)
++            sh->colour_plane_id = get_bits(gb, 2);
++
++        if (!IS_IDR(s)) {
++            int poc, pos;
++
++            sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
++            poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
++            if (!sh->first_slice_in_pic_flag && poc != s->poc) {
++                av_log(s->avctx, AV_LOG_WARNING,
++                       "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
++                if (s->avctx->err_recognition & AV_EF_EXPLODE)
++                    return AVERROR_INVALIDDATA;
++                poc = s->poc;
++            }
++            s->poc = poc;
++
++            sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
++            // Record how many bits the short term RPS occupies
++            pos = get_bits_left(gb);
++            if (!sh->short_term_ref_pic_set_sps_flag) {
++                ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
++                if (ret < 0)
++                    return ret;
++
++                sh->short_term_rps = &sh->slice_rps;
++            } else {
++                int numbits, rps_idx;
++
++                if (!s->ps.sps->nb_st_rps) {
++                    av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
++                    return AVERROR_INVALIDDATA;
++                }
++
++                numbits = av_ceil_log2(s->ps.sps->nb_st_rps);
++                rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
++                sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
++            }
++            sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++            // Likewise for the long term RPS
++            pos = get_bits_left(gb);
++            ret = decode_lt_rps(s, &sh->long_term_rps, gb);
++            if (ret < 0) {
++                // Only fatal if the caller asked for strict error handling
++                av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
++                if (s->avctx->err_recognition & AV_EF_EXPLODE)
++                    return AVERROR_INVALIDDATA;
++            }
++            sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++            if (s->ps.sps->sps_temporal_mvp_enabled_flag)
++                sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
++            else
++                sh->slice_temporal_mvp_enabled_flag = 0;
++        } else {
++            s->sh.short_term_rps = NULL;
++            s->poc = 0;
++        }
++
++        /* 8.3.1 */
++        if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
++            s->nal_unit_type != HEVC_NAL_TRAIL_N &&
++            s->nal_unit_type != HEVC_NAL_TSA_N &&
++            s->nal_unit_type != HEVC_NAL_STSA_N &&
++            s->nal_unit_type != HEVC_NAL_RADL_N &&
++            s->nal_unit_type != HEVC_NAL_RADL_R &&
++            s->nal_unit_type != HEVC_NAL_RASL_N &&
++            s->nal_unit_type != HEVC_NAL_RASL_R)
++            s->pocTid0 = s->poc;
++
++        if (s->ps.sps->sao_enabled) {
++            sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
++            if (ctx_cfmt(s) != 0) {
++                // One coded flag covers both chroma components
++                sh->slice_sample_adaptive_offset_flag[1] =
++                sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
++            }
++        } else {
++            sh->slice_sample_adaptive_offset_flag[0] = 0;
++            sh->slice_sample_adaptive_offset_flag[1] = 0;
++            sh->slice_sample_adaptive_offset_flag[2] = 0;
++        }
++
++        sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
++        if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
++            int nb_refs;
++
++            sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active;
++            if (sh->slice_type == HEVC_SLICE_B)
++                sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active;
++
++            if (get_bits1(gb)) { // num_ref_idx_active_override_flag
++                sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1;
++                if (sh->slice_type == HEVC_SLICE_B)
++                    sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1;
++            }
++            if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) {
++                av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
++                       sh->nb_refs[L0], sh->nb_refs[L1]);
++                return AVERROR_INVALIDDATA;
++            }
++
++            sh->rpl_modification_flag[0] = 0;
++            sh->rpl_modification_flag[1] = 0;
++            nb_refs = ff_hevc_rpi_frame_nb_refs(s);
++            if (!nb_refs) {
++                av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
++                return AVERROR_INVALIDDATA;
++            }
++
++            if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) {
++                sh->rpl_modification_flag[0] = get_bits1(gb);
++                if (sh->rpl_modification_flag[0]) {
++                    for (i = 0; i < sh->nb_refs[L0]; i++)
++                        sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
++                }
++
++                if (sh->slice_type == HEVC_SLICE_B) {
++                    sh->rpl_modification_flag[1] = get_bits1(gb);
++                    if (sh->rpl_modification_flag[1] == 1)
++                        for (i = 0; i < sh->nb_refs[L1]; i++)
++                            sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
++                }
++            }
++
++            if (sh->slice_type == HEVC_SLICE_B)
++                sh->mvd_l1_zero_flag = get_bits1(gb);
++
++            if (s->ps.pps->cabac_init_present_flag)
++                sh->cabac_init_flag = get_bits1(gb);
++            else
++                sh->cabac_init_flag = 0;
++
++            sh->collocated_ref_idx = 0;
++            if (sh->slice_temporal_mvp_enabled_flag) {
++                sh->collocated_list = L0;
++                if (sh->slice_type == HEVC_SLICE_B)
++                    sh->collocated_list = !get_bits1(gb);
++
++                if (sh->nb_refs[sh->collocated_list] > 1) {
++                    sh->collocated_ref_idx = get_ue_golomb_long(gb);
++                    if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
++                        av_log(s->avctx, AV_LOG_ERROR,
++                               "Invalid collocated_ref_idx: %d.\n",
++                               sh->collocated_ref_idx);
++                        return AVERROR_INVALIDDATA;
++                    }
++                }
++            }
++
++            if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) ||
++                (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B))
++            {
++                if ((ret = pred_weight_table(s, gb)) != 0)
++                    return ret;
++            }
++            else
++            {
++                // Give us unit weights
++                default_pred_weight_table(s);
++            }
++
++            sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
++            if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
++                av_log(s->avctx, AV_LOG_ERROR,
++                       "Invalid number of merging MVP candidates: %d.\n",
++                       sh->max_num_merge_cand);
++                return AVERROR_INVALIDDATA;
++            }
++        }
++
++        sh->slice_qp_delta = get_se_golomb(gb);
++
++        if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
++            sh->slice_cb_qp_offset = get_se_golomb(gb);
++            sh->slice_cr_qp_offset = get_se_golomb(gb);
++            // Both the slice offsets and the combined pps + slice offsets
++            // must be within the legal range
++            if (!qp_offset_valid(sh->slice_cb_qp_offset) ||
++                !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) ||
++                !qp_offset_valid(sh->slice_cr_qp_offset) ||
++                !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset))
++            {
++                // Args are cb/cr in the same order as the labels in the message
++                av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d)\n",
++                       s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset,
++                       sh->slice_cb_qp_offset, sh->slice_cr_qp_offset);
++                return AVERROR_INVALIDDATA;
++            }
++        } else
++        {
++            sh->slice_cb_qp_offset = 0;
++            sh->slice_cr_qp_offset = 0;
++        }
++
++        if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
++            sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
++        else
++            sh->cu_chroma_qp_offset_enabled_flag = 0;
++
++        if (s->ps.pps->deblocking_filter_control_present_flag) {
++            int deblocking_filter_override_flag = 0;
++
++            if (s->ps.pps->deblocking_filter_override_enabled_flag)
++                deblocking_filter_override_flag = get_bits1(gb);
++
++            if (deblocking_filter_override_flag) {
++                sh->disable_deblocking_filter_flag = get_bits1(gb);
++                if (!sh->disable_deblocking_filter_flag) {
++                    int beta_offset_div2 = get_se_golomb(gb);
++                    int tc_offset_div2 = get_se_golomb(gb);
++                    if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
++                        tc_offset_div2 < -6 || tc_offset_div2 > 6) {
++                        av_log(s->avctx, AV_LOG_ERROR,
++                               "Invalid deblock filter offsets: %d, %d\n",
++                               beta_offset_div2, tc_offset_div2);
++                        return AVERROR_INVALIDDATA;
++                    }
++                    sh->beta_offset = beta_offset_div2 * 2;
++                    sh->tc_offset = tc_offset_div2 * 2;
++                }
++            } else {
++                // No override - inherit the PPS deblocking settings
++                sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
++                sh->beta_offset = s->ps.pps->beta_offset;
++                sh->tc_offset = s->ps.pps->tc_offset;
++            }
++        } else {
++            sh->disable_deblocking_filter_flag = 0;
++            sh->beta_offset = 0;
++            sh->tc_offset = 0;
++        }
++
++        if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
++            (sh->slice_sample_adaptive_offset_flag[0] ||
++             sh->slice_sample_adaptive_offset_flag[1] ||
++             !sh->disable_deblocking_filter_flag)) {
++            sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
++        } else {
++            sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
++        }
++        // Boundaries that the loop filters must not cross for this slice
++        sh->no_dblk_boundary_flags =
++            (sh->slice_loop_filter_across_slices_enabled_flag ? 0 :
++                BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) |
++            (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 :
++                BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE);
++
++
++    } else if (!s->slice_initialized) {
++        av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    sh->num_entry_point_offsets = 0;
++    sh->offload_wpp = 0;
++    sh->offload_tiles = 0;
++
++    if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
++        unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
++        // It would be possible to bound this tighter but this here is simpler
++        if (num_entry_point_offsets > get_bits_left(gb)) {
++            av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
++            return AVERROR_INVALIDDATA;
++        }
++
++        sh->num_entry_point_offsets = num_entry_point_offsets;
++        if (sh->num_entry_point_offsets > 0) {
++            int offset_len = get_ue_golomb_long(gb) + 1;
++
++            if (offset_len < 1 || offset_len > 32) {
++                sh->num_entry_point_offsets = 0;
++                av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
++                return AVERROR_INVALIDDATA;
++            }
++
++            if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0)
++            {
++                av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
++                return ret;
++            }
++
++            for (i = 0; i < sh->num_entry_point_offsets; i++) {
++                uint32_t val_minus1 = get_bits_long(gb, offset_len);
++                if (val_minus1 > (1 << 28))
++                {
++                    // We can declare offsets of > 2^28 bad without loss of generality
++                    // Will check actual bounds wrt NAL later, but this keeps
++                    // the values within bounds we can deal with easily
++                    av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1);
++                    return AVERROR_INVALIDDATA;
++                }
++                sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size
++            }
++
++            // Do we want to offload this
++            if (s->threads_type != 0)
++            {
++                sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) &&
++                    s->ps.pps->num_tile_columns > 1;
++                // * We only cope with WPP in a single column
++                //   Probably want to deal with that case as tiles rather than WPP anyway
++                // ?? Not actually sure that the main code deals with WPP + multi-col correctly
++                sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag &&
++                    s->ps.pps->num_tile_columns == 1;
++            }
++        }
++    }
++
++    if (s->ps.pps->slice_header_extension_present_flag) {
++        unsigned int length = get_ue_golomb_long(gb);
++        if (length*8LL > get_bits_left(gb)) {
++            av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
++            return AVERROR_INVALIDDATA;
++        }
++        for (i = 0; i < length; i++)
++            skip_bits(gb, 8);  // slice_header_extension_data_byte
++    }
++
++    // Inferred parameters
++    sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
++    if (sh->slice_qp > 51 ||
++        sh->slice_qp < -s->ps.sps->qp_bd_offset) {
++        av_log(s->avctx, AV_LOG_ERROR,
++               "The slice_qp %d is outside the valid range "
++               "[%d, 51].\n",
++               sh->slice_qp,
++               -s->ps.sps->qp_bd_offset);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (get_bits_left(gb) < 0) {
++        av_log(s->avctx, AV_LOG_ERROR,
++               "Overread slice header by %d bits\n", -get_bits_left(gb));
++        return AVERROR_INVALIDDATA;
++    }
++
++    s->slice_initialized = 1;
++    return 0;
++}
++
++// Decode the SAO parameters for the CTB at CTB coords (rx, ry) into the
++// matching entry of s->sao (row stride sps->ctb_width).
++// If a merge-left or merge-up flag is decoded then the neighbouring entry is
++// copied and nothing else is read for this CTB.
++static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry)
++{
++ RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width;
++ int c_idx, i;
++
++ // Merge flags are only coded if SAO is on for luma or chroma in this slice
++ // and the neighbour in question is available
++ if (s->sh.slice_sample_adaptive_offset_flag[0] ||
++ s->sh.slice_sample_adaptive_offset_flag[1]) {
++ if ((lc->ctb_avail & AVAIL_L) != 0)
++ {
++ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
++ if (sao_merge_left_flag) {
++ *sao = sao[-1];
++ return;
++ }
++ }
++ if ((lc->ctb_avail & AVAIL_U) != 0)
++ {
++ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
++ if (sao_merge_up_flag) {
++ *sao = sao[-(int)s->ps.sps->ctb_width];
++ return;
++ }
++ }
++ }
++
++ // One pass per colour component (luma only if ctx_cfmt(s) is 0)
++ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) {
++ const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
++ s->ps.pps->log2_sao_offset_scale_chroma;
++ int offset_abs[4];
++ char offset_sign[4] = {0};
++
++ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
++ sao->type_idx[c_idx] = SAO_NOT_APPLIED;
++ continue;
++ }
++
++ // The 2nd chroma component shares type & edge class with the 1st
++ if (c_idx == 2) {
++ sao->type_idx[2] = sao->type_idx[1];
++ sao->eo_class[2] = sao->eo_class[1];
++ } else {
++ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc);
++ }
++
++ // ** Could use BY22 here quite plausibly - this is all bypass stuff
++ // though only per CTB so not very timing critical
++
++ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED)
++ continue;
++
++ for (i = 0; i < 4; i++)
++ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc);
++
++ if (sao->type_idx[c_idx] == SAO_BAND) {
++ // Band offset: explicit sign for each non-zero offset + band position
++ for (i = 0; i < 4; i++) {
++ if (offset_abs[i] != 0)
++ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc);
++ }
++ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc);
++ } else if (c_idx != 2) {
++ // Edge class is coded for c_idx 0 & 1 only (c_idx 2 copied it above)
++ sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc);
++ }
++
++ // Inferred parameters
++ // offset_val[][0] is always 0; [1..4] hold the four scaled offsets
++ sao->offset_val[c_idx][0] = 0;
++ for (i = 0; i < 4; i++) {
++ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale;
++ if (sao->type_idx[c_idx] == SAO_EDGE) {
++ // Edge offsets have implied signs: the last two are negative
++ if (i > 1)
++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
++ } else if (offset_sign[i]) {
++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
++ }
++ }
++ }
++}
++
++// Compiled out: cross component prediction is not supported here (see the
++// note on cross_component_prediction_enabled_flag in set_sps()).
++#if 0
++// Decode the residual scale for chroma component idx into lc->tu.res_scale_val:
++// 0 if the coded log2 magnitude is 0, otherwise +/-(1 << (log2 - 1)) with the
++// sign taken from the decoded sign flag. Always returns 0.
++static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) {
++ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4
++
++ if (log2_res_scale_abs_plus1 != 0) {
++ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx);
++ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
++ (1 - 2 * res_scale_sign_flag);
++ } else {
++ lc->tu.res_scale_val = 0;
++ }
++
++
++ return 0;
++}
++#endif
++
++/*
++ * Claim the next free slot in the job's intra prediction command list:
++ * returns a pointer to entry jb->intra.n and then bumps the count.
++ * No bounds check is done here.
++ */
++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb)
++{
++    HEVCPredCmd * const slot = &jb->intra.cmds[jb->intra.n];
++
++    jb->intra.n++;
++    return slot;
++}
++
++// Compile-time construction of tb_flags[], a 16x16 table of AVAIL_* flag sets
++// indexed by [x + y * 16].
++//
++// A0 emits a single designated initialiser for entry (x, y) from five
++// booleans saying whether the Up, Left, Up-Left, Up-Right & Down-Left
++// neighbours are available.
++// Each A<n> covers a square of side 2^n entries and expands to four A<n-1>
++// quadrants (top-left, top-right, bottom-left, bottom-right), deriving the
++// arguments for each quadrant from those of the parent:
++//  top-left:     U, L, UL inherited; UR = parent U; DL = parent L
++//  top-right:    U inherited; L = 1; UL = parent U; UR inherited; DL = 0
++//  bottom-left:  U = 1; L inherited; UL = parent L; UR = 1; DL inherited
++//  bottom-right: U = L = UL = 1; UR = DL = 0
++// The table is built by A4(0, 0, ...) with every outside neighbour marked
++// unavailable, so it only records availability that comes from inside the
++// 16x16 square itself.
++#define A0(x, y, U, L, UL, UR, DL) \
++ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0))
++
++#define A1(x, y, U, L, UL, UR, DL) \
++ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\
++ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 )
++
++#define A2(x, y, U, L, UL, UR, DL) \
++ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\
++ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 )
++
++#define A3(x, y, U, L, UL, UR, DL) \
++ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\
++ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 )
++
++#define A4(x, y, U, L, UL, UR, DL) \
++ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\
++ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 )
++
++static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)};
++
++// Work out which neighbours (AVAIL_U / L / UL / UR / DL) are available for
++// the block at pel position (x, y) with size w x h.
++// Availability inside the current CTB comes from tb_flags[] (indexed in
++// units of 4 pels); availability across the CTB edge comes from
++// lc->ctb_avail.
++// Returns the set of AVAIL_* flags for the block.
++unsigned int ff_hevc_rpi_tb_avail_flags(
++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h)
++{
++ // tb_x, tb_y are the block's position within its CTB
++ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size;
++ const unsigned int tb_x = x & ~ctb_mask;
++ const unsigned int tb_y = y & ~ctb_mask;
++ const unsigned int ctb_avail = lc->ctb_avail;
++
++ // Table entry for the block's top-left 4x4 unit
++ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16;
++
++ // Available if either the CTB's neighbour or the in-CTB neighbour is
++ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL);
++
++ // This deals with both the U & L edges
++ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0)
++ f |= AVAIL_UL;
++
++ // Up-right: if the block stops short of lc->end_of_ctb_x then on the top
++ // row of the CTB UR follows the CTB's U bit (shifted into the UR position -
++ // presumably AVAIL_x == 1 << AVAIL_S_x), otherwise it comes from the table
++ // entry of the block's top-right unit. If the block reaches end_of_ctb_x
++ // then UR is only possible on the top row, from the CTB's own UR bit.
++ if (x + w < lc->end_of_ctb_x)
++ f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR;
++ else if (tb_y == 0)
++ f |= (ctb_avail & AVAIL_UR);
++#if AVAIL_S_U - AVAIL_S_UR < 0
++#error Shift problem
++#endif
++
++ // Never any D if Y beyond eoctb
++ // Down-left: in the left column of the CTB DL follows the CTB's L bit
++ // (shifted into the DL position), otherwise it comes from the table entry
++ // of the block's bottom-left unit
++ if (y + h < lc->end_of_ctb_y)
++ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL;
++#if AVAIL_S_DL - AVAIL_S_L < 0
++#error Shift problem
++#endif
++
++// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h,
++// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16],
++// lc->end_of_ctb_x, lc->end_of_ctb_y);
++
++ return f;
++}
++
++#undef A0
++#undef A1
++#undef A2
++#undef A3
++#undef A4
++
++// Queue one intra-prediction command on the current job (lc->jb0).
++// c_idx 0 uses the luma prediction mode, c_idx 1 the chroma mode; any larger
++// c_idx is ignored (with sand layout U & V are both done on the U call).
++// avail is the AVAIL_* neighbour bitmask stored with the command.
++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx,
++                          unsigned int avail)
++{
++    HEVCPredCmd *pred;
++
++    // Only the luma (0) and first chroma (1) calls generate a command
++    if (c_idx > 1)
++        return;
++
++    pred = rpi_new_intra_cmd(lc->jb0);
++
++    pred->type = RPI_PRED_INTRA + c_idx;
++    pred->size = log2_trafo_size;
++    pred->avail = avail;
++    pred->i_pred.x = x0;
++    pred->i_pred.y = y0;
++
++    if (c_idx != 0)
++        pred->i_pred.mode = lc->tu.intra_pred_mode_c;
++    else
++        pred->i_pred.mode = lc->tu.intra_pred_mode;
++
++//    printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail);
++}
++
++// Bit positions (_S) and masks for the chroma coded-block flags carried in the
++// cbf_c0 / cbf_c1 / cbf_chroma bitmasks of hls_transform_tree and
++// hls_transform_unit. CB0/CR0 are the first Cb/Cr flags decoded; CB1/CR1 are
++// the second ones decoded when hls_transform_tree's wants_c1 is set.
++#define CBF_CB0_S 0
++#define CBF_CB1_S 1 // CB1 must be CB0 + 1
++#define CBF_CR0_S 2
++#define CBF_CR1_S 3
++
++#define CBF_CB0 (1 << CBF_CB0_S)
++#define CBF_CR0 (1 << CBF_CR0_S)
++#define CBF_CB1 (1 << CBF_CB1_S)
++#define CBF_CR1 (1 << CBF_CR1_S)
++
++// Process one transform unit at luma position (x0, y0).
++// For intra CUs this queues the intra prediction commands and picks the
++// coefficient scan order from the prediction modes. It then handles a pending
++// cu_qp_delta and chroma QP offset and decodes the luma and chroma residuals
++// whose coded-block flags are set.
++// cbf_chroma is a bitmask of CBF_* flags; blk_idx is the index of this block
++// within its parent split (0..3).
++// Returns 0 on success or AVERROR_INVALIDDATA if cu_qp_delta is out of range.
++// * Only good for chroma_idx == 1
++static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int log2_cb_size, const unsigned int log2_trafo_size,
++ const unsigned int blk_idx, const int cbf_luma,
++ const unsigned int cbf_chroma)
++{
++ // Chroma transform is one log2 step smaller than luma, but never below 4x4
++ const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1);
++ // Luma position rounded down to a multiple of 8, used for the chroma calls
++ const unsigned int x0_c = x0 & ~7;
++ const unsigned int y0_c = y0 & ~7;
++
++ enum ScanType scan_idx = SCAN_DIAG;
++ enum ScanType scan_idx_c = SCAN_DIAG;
++
++ if (lc->cu.pred_mode == MODE_INTRA)
++ {
++ const unsigned int trafo_size = 1 << log2_trafo_size;
++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size);
++
++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail);
++
++ // Chroma prediction: with a luma block larger than 4x4 it is issued here
++ // using the same availability; for 4x4 luma blocks it is issued once, on
++ // the last block of the group (blk_idx == 3), with availability
++ // recomputed for the 8x8 area at (x0_c, y0_c)
++ if (log2_trafo_size > 2)
++ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail);
++ else if (blk_idx == 3)
++ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1,
++ ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8));
++
++ // For transforms smaller than 16x16 the scan order follows the intra
++ // mode: modes 6..14 use the vertical scan, modes 22..30 the horizontal
++ // scan, anything else stays diagonal
++ if (log2_trafo_size < 4) {
++ if (lc->tu.intra_pred_mode >= 6 &&
++ lc->tu.intra_pred_mode <= 14) {
++ scan_idx = SCAN_VERT;
++ } else if (lc->tu.intra_pred_mode >= 22 &&
++ lc->tu.intra_pred_mode <= 30) {
++ scan_idx = SCAN_HORIZ;
++ }
++
++ if (lc->tu.intra_pred_mode_c >= 6 &&
++ lc->tu.intra_pred_mode_c <= 14) {
++ scan_idx_c = SCAN_VERT;
++ } else if (lc->tu.intra_pred_mode_c >= 22 &&
++ lc->tu.intra_pred_mode_c <= 30) {
++ scan_idx_c = SCAN_HORIZ;
++ }
++ }
++ }
++
++ // No coded residual at all: nothing more to decode for this unit
++ if (!cbf_luma && cbf_chroma == 0)
++ return 0;
++
++ // Decode cu_qp_delta once, on the first unit that needs it
++ if (lc->tu.is_cu_qp_delta_wanted)
++ {
++ const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc);
++ // Mask that rounds a coordinate down to a multiple of the coding block size
++ const unsigned int cb_mask = ~0U << log2_cb_size;
++
++ // Valid range is [-(26 + qp_bd_offset / 2), 25 + qp_bd_offset / 2]
++ if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) ||
++ qp_delta > (25 + (s->ps.sps->qp_bd_offset >> 1)))
++ {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "The cu_qp_delta %d is outside the valid range "
++ "[%d, %d].\n",
++ qp_delta,
++ -(26 + (s->ps.sps->qp_bd_offset >> 1)),
++ (25 + (s->ps.sps->qp_bd_offset >> 1)));
++ return AVERROR_INVALIDDATA;
++ }
++
++ lc->tu.is_cu_qp_delta_wanted = 0;
++ lc->tu.cu_qp_delta = qp_delta;
++ // QP is set for the coding block containing (x0, y0)
++ ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask);
++ }
++
++ // * Not main profile & untested due to no conform streams
++ if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma &&
++ !lc->cu.cu_transquant_bypass_flag) {
++ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc);
++ if (cu_chroma_qp_offset_flag) {
++ int cu_chroma_qp_offset_idx = 0;
++ // The index is only coded when the offset list has more than one entry
++ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
++ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc);
++ }
++ lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
++ lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
++ }
++ lc->tu.cu_chroma_qp_offset_wanted = 0;
++ }
++
++ if (cbf_luma)
++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0);
++
++ // Chroma residuals follow the same rule as chroma prediction above: coded
++ // with this unit unless luma is 4x4, in which case only with blk_idx == 3
++ if (log2_trafo_size > 2 || blk_idx == 3)
++ {
++ if ((cbf_chroma & CBF_CB0) != 0)
++ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
++ log2_trafo_size_c, scan_idx_c, 1);
++ if ((cbf_chroma & CBF_CR0) != 0)
++ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
++ log2_trafo_size_c, scan_idx_c, 2);
++ }
++
++ return 0;
++}
++
++// Mark the coding block at (x0, y0), of size 1 << log2_cb_size, in the
++// s->is_pcm bitmap via set_bits(). Coordinates are converted to 8-pel units
++// (>> 3) and the size to log2_cb_size - 3; rows of the map are pcm_width apart.
++// NOTE(review): exact set_bits() argument semantics are not visible here - confirm.
++static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size)
++{
++ set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, log2_cb_size - 3);
++}
++
++
++// Recursively decode the transform tree rooted at (x0, y0).
++// log2_trafo_size is the log2 size of this node, trafo_depth its depth below
++// the coding block, blk_idx its index (0..3) within the parent split and
++// cbf_c0 the parent's chroma coded-block flags (CBF_* bitmask).
++// Either splits into four children or, at a leaf, decodes cbf_luma and calls
++// hls_transform_unit. Returns 0 on success or a negative error code.
++static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int log2_trafo_size,
++ const unsigned int trafo_depth, const unsigned int blk_idx,
++ const unsigned int cbf_c0)
++{
++ // When log2_trafo_size == 2 hls_transform_unit uses c0 so put in c1
++ unsigned int cbf_c1 = cbf_c0;
++ int split_transform_flag;
++ int ret;
++
++ // Select the intra prediction modes for this node. With an intra split the
++ // modes are (re)loaded only at depth 1, per sub-block for luma and, for
++ // chroma, per sub-block only when ctx_cfmt(s) == 3
++ if (lc->cu.intra_split_flag) {
++ if (trafo_depth == 1) {
++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
++ if (ctx_cfmt(s) == 3) {
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx];
++ } else {
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
++ }
++ }
++ } else {
++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0];
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
++ }
++
++ // split_transform_flag is read from the bitstream only when both splitting
++ // and not splitting are allowed; otherwise it is inferred below
++ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
++ log2_trafo_size > s->ps.sps->log2_min_tb_size &&
++ trafo_depth < lc->cu.max_trafo_depth &&
++ !(lc->cu.intra_split_flag && trafo_depth == 0))
++ {
++ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size);
++ } else {
++ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 &&
++ lc->cu.pred_mode == MODE_INTER &&
++ lc->cu.part_mode != PART_2Nx2N &&
++ trafo_depth == 0;
++
++ // Forced split: node larger than the maximum transform size, intra split
++ // at the root, or the inter_split condition above
++ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size ||
++ (lc->cu.intra_split_flag && trafo_depth == 0) ||
++ inter_split;
++ }
++
++ // Chroma cbf flags are decoded at this node unless it is a 4x4 luma node
++ // with ctx_cfmt(s) != 3, in which case cbf_c1 keeps the parent's flags.
++ // A flag is only decoded if the parent's corresponding flag was set.
++ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3)
++ {
++ // A second flag per component is decoded only when ctx_cfmt(s) == 2 and
++ // this node is a leaf or has log2 size 3
++ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3);
++ cbf_c1 = 0;
++
++ if ((cbf_c0 & CBF_CB0) != 0)
++ {
++ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S;
++ if (wants_c1)
++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S;
++ }
++
++ if ((cbf_c0 & CBF_CR0) != 0)
++ {
++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S;
++ if (wants_c1)
++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S;
++ }
++ }
++
++ if (split_transform_flag) {
++ const int trafo_size_split = 1 << (log2_trafo_size - 1);
++ const int x1 = x0 + trafo_size_split;
++ const int y1 = y0 + trafo_size_split;
++
++// Recurse into one quadrant, returning early on error
++#define SUBDIVIDE(x, y, idx) \
++do { \
++ ret = hls_transform_tree(s, lc, x, y, \
++ log2_trafo_size - 1, trafo_depth + 1, idx, \
++ cbf_c1); \
++ if (ret < 0) \
++ return ret; \
++} while (0)
++
++ // Quadrants are visited top-left, top-right, bottom-left, bottom-right
++ SUBDIVIDE(x0, y0, 0);
++ SUBDIVIDE(x1, y0, 1);
++ SUBDIVIDE(x0, y1, 2);
++ SUBDIVIDE(x1, y1, 3);
++
++#undef SUBDIVIDE
++ } else {
++ // If log2_trafo_size == 2 then we should have cbf_c == 0 here but as we can't have
++ // log2_trafo_size == 2 with depth == 0 the issue is moot
++ // cbf_luma is taken as 1 without reading the bitstream for a non-intra
++ // root node with no chroma cbf; otherwise it is decoded
++ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) ||
++ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth));
++
++ // log2_trafo_size + trafo_depth is passed as the coding block log2 size
++ ret = hls_transform_unit(s, lc, x0, y0,
++ log2_trafo_size + trafo_depth, log2_trafo_size,
++ blk_idx, cbf_luma, cbf_c1);
++ if (ret < 0)
++ return ret;
++
++ if (!s->sh.disable_deblocking_filter_flag) {
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma);
++ }
++ }
++ return 0;
++}
++
++
++// Unpack raw PCM samples for the coding block at (x0, y0) into the frame.
++// pcm / length describe the bit buffer handed to init_get_bits(); cb_size is
++// the luma block size. Luma is written first, then chroma (sized and
++// positioned using the component-1 shifts) from the same bit reader.
++// Returns 0 on success or the negative error from init_get_bits().
++static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
++{
++    GetBitContext bits;
++    const int err = init_get_bits(&bits, pcm, length);
++
++    if (err < 0)
++        return err;
++
++    // Luma plane
++    s->hevcdsp.put_pcm(
++        av_rpi_sand_frame_pos_y(s->frame, x0, y0),
++        frame_stride1(s->frame, 0),
++        cb_size, cb_size,
++        &bits, s->ps.sps->pcm.bit_depth);
++
++    // Chroma, continuing from where the luma read stopped
++    s->hevcdsp.put_pcm_c(
++        av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)),
++        s->frame->linesize[1],
++        cb_size >> ctx_hshift(s, 1),
++        cb_size >> ctx_vshift(s, 1),
++        &bits, s->ps.sps->pcm.bit_depth_chroma);
++
++    return 0;
++}
++
++
++// Returns x * 4^y, i.e. x shifted left by twice y
++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
++{
++    const unsigned int shift = y + y;
++
++    return x << shift;
++}
++
++// Handle a PCM coding block at (x0, y0) of size 1 << log2_cb_size.
++// Skips the raw PCM bytes in the CABAC stream, copies them into the job's
++// coefficient buffer and queues an RPI_PRED_I_PCM command that refers to the
++// copy. Always returns 0.
++static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size)
++{
++ // Length in bits
++ // = luma samples * luma depth + two chroma terms * chroma depth, each
++ // chroma term sized from log2_cb_size reduced by the component's vshift
++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) +
++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2));
++
++ // Skip the PCM payload (length rounded up to whole bytes); pcm is what
++ // ff_hevc_rpi_cabac_skip_bytes returns and is used as the copy source below
++ const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3);
++
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++
++ // Copy coeffs
++ {
++ // Payload length in bytes
++ const int blen = (length + 7) >> 3;
++ // Round allocated bytes up to nearest 32 to avoid alignment confusion
++ // Allocation is in int16_t s
++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per
++ // sample this rounding doesn't affect the total size we need to allocate for
++ // the coeff buffer
++ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1);
++ memcpy(coeffs, pcm, blen);
++
++ // Our coeff stash assumes that any partially allocated 64byte lump
++ // is zeroed so make that true.
++ {
++ uint8_t * const eopcm = (uint8_t *)coeffs + blen;
++ // -(intptr_t)eopcm & 63 is the byte count up to the next 64-byte boundary
++ if ((-(intptr_t)eopcm & 63) != 0)
++ memset(eopcm, 0, -(intptr_t)eopcm & 63);
++ }
++
++ // Add command
++ {
++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
++ cmd->type = RPI_PRED_I_PCM;
++ cmd->size = log2_cb_size;
++ cmd->i_pcm.src = coeffs;
++ cmd->i_pcm.x = x0;
++ cmd->i_pcm.y = y0;
++ // src_len is recorded in bits, not bytes
++ cmd->i_pcm.src_len = length;
++ }
++ return 0;
++ }
++}
++
++
++// Record how far down reference frame `ref` must be decoded before the
++// current job can run. Does nothing when s->threads_type is 0.
++// The required row is the integer part of the MV's Y component plus
++// y0 + height + 9, clamped at 0. The job keeps the maximum requested so far
++// in progress_req[ref->dpb_no].
++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref,
++                                const MvXY xy, const int y0, const int height)
++{
++    int16_t *req;
++    int row;
++
++    if (s->threads_type == 0)
++        return;
++
++    row = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9);
++
++    // Progress has to be attached to current job as the actual wait
++    // is in worker_core which can't use lc
++    req = lc->jb0->progress_req + ref->dpb_no;
++    if (*req < row)
++        *req = row;
++}
++
++// Decode the motion information of a non-merge prediction unit at (x0, y0)
++// of size nPbW x nPbH and store it in *mv.
++// For each list in use this reads the reference index, the motion vector
++// difference and the predictor flag from the bitstream, asks
++// ff_hevc_rpi_luma_mv_mvp_mode for the predictor and adds the difference.
++// The order of the decode calls follows the bitstream and must not change.
++static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0, const int nPbW,
++ const int nPbH,
++ HEVCRpiMvField * const mv)
++{
++ // Default is L0 only; inter_pred_idc is only coded for B slices
++ enum InterPredIdc inter_pred_idc = PRED_L0;
++ int mvp_flag;
++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH);
++
++ mv->pred_flag = 0;
++ if (s->sh.slice_type == HEVC_SLICE_B)
++ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
++
++ // List 0 is used for PRED_L0 and PRED_BI
++ if (inter_pred_idc != PRED_L1) {
++ MvXY mvd;
++
++ // ref_idx is only decoded when the list is non-empty
++ if (s->sh.nb_refs[L0])
++ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
++
++ mv->pred_flag = PF_L0;
++ mvd = ff_hevc_rpi_hls_mvd_coding(lc);
++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
++ mv, mvp_flag, 0);
++ mv->xy[0] = mvxy_add(mv->xy[0], mvd);
++ }
++
++ // List 1 is used for PRED_L1 and PRED_BI
++ if (inter_pred_idc != PRED_L0) {
++ MvXY mvd = 0;
++
++ if (s->sh.nb_refs[L1])
++ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
++
++ // The L1 difference is not coded (stays 0) for bi-pred when
++ // mvd_l1_zero_flag is 1
++ if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI)
++ mvd = ff_hevc_rpi_hls_mvd_coding(lc);
++
++ mv->pred_flag += PF_L1;
++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
++ mv, mvp_flag, 1);
++ mv->xy[1] = mvxy_add(mv->xy[1], mvd);
++ }
++}
++
++
++// Choose the queue that the next inter-prediction command goes to.
++// Scans the ipe->n_grp queues starting at ipe->curr and picks the one with
++// the smallest load (first one wins on ties). load_val is added to the
++// chosen queue's load, the group is marked as used and the queue's current
++// command is linked to QPU function fn. Returns the chosen queue.
++// NOTE(review): yp stays NULL if ipe->n_grp is 0 - callers presumably
++// guarantee at least one queue per group; confirm.
++static HEVCRpiInterPredQ *
++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
++{
++ HEVCRpiInterPredQ * yp = NULL;
++ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr;
++ const unsigned int max_fill = ipe->max_fill;
++ unsigned int load = UINT_MAX;
++
++ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) {
++ // We will always have enough room between the Qs but if we are
++ // running critically low due to poor scheduling then use fill size
++ // rather than load to determine QPU. This has obvious dire
++ // performance implications but (a) it is better than crashing
++ // and (b) it should (almost) never happen
++ // tfill: bytes already written to this queue
++ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base;
++ // Over-full queues are ranked by fill plus a large bias so that any
++ // queue still under max_fill is preferred
++ const unsigned int tload = tfill > max_fill ? tfill + 0x1000000 : ypt->load;
++
++ if (tload < load)
++ {
++ yp = ypt;
++ load = tload;
++ }
++ }
++
++ yp->load += load_val;
++ ipe->used_grp = 1;
++ qpu_mc_link_set(yp->qpu_mc_curr, fn);
++
++ return yp;
++}
++
++
++// Append a sync command to every one of the ipe->n queues.
++// For each queue: link the current command to q->code_sync, step
++// qpu_mc_curr past the sync entry and reseed the load from the queue's fill.
++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
++{
++ for (unsigned int i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const q = ipe->q + i;
++ // Bytes used so far, measured before the sync command is added
++ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
++
++ qpu_mc_link_set(q->qpu_mc_curr, q->code_sync);
++ // Next command starts immediately after the sync entry
++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1);
++ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage
++ }
++}
++
++// Move on to the next group of queues once a CTU has been processed.
++// If the current group was not used nothing changes. Otherwise the group
++// index advances by n_grp; when it runs off the end it wraps to 0 and a sync
++// is added to every queue.
++// Always returns 0 (success).
++// Queue fullness is no longer checked here because there is emergency code in
++// the ctu alloc path.
++// * However it might be an idea to have some means of spotting that we've used it
++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
++{
++    if (ipe->used_grp) {
++        ipe->curr += ipe->n_grp;
++        if (ipe->curr >= ipe->n) {
++            ipe->curr = 0;
++            rpi_inter_pred_sync(ipe);
++        }
++
++        ipe->used = 1;
++        ipe->used_grp = 0;
++    }
++
++    return 0;
++}
++
++// Return the inter-prediction environment to its empty state: clear the group
++// cursor and usage flags, and for every queue rewind the write pointer to the
++// base, zero the load and forget the last L0/L1 source slots.
++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
++{
++    ipe->curr = 0;
++    ipe->used = 0;
++    ipe->used_grp = 0;
++
++    for (unsigned int qno = 0; qno != ipe->n; ++qno) {
++        HEVCRpiInterPredQ * const ipq = &ipe->q[qno];
++
++        ipq->qpu_mc_curr = ipq->qpu_mc_base;
++        ipq->load = 0;
++        ipq->last_l0 = NULL;
++        ipq->last_l1 = NULL;
++    }
++}
++
++// Initialise *ipe and allocate its storage.
++// Zeroes the structure, allocates a zeroed array of n_max queue descriptors,
++// records n_grp and min_gap, then allocates total_size bytes of cached GPU
++// memory into ipe->gptr.
++// Returns 0 on success, AVERROR(ENOMEM) if the queue array cannot be
++// allocated, or the non-zero result of gpu_malloc_cached() (in which case the
++// queue array is freed again).
++static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
++                                const unsigned int n_max, const unsigned int n_grp,
++                                const unsigned int total_size, const unsigned int min_gap)
++{
++    int rv;
++
++    memset(ipe, 0, sizeof(*ipe));
++
++    ipe->q = av_mallocz(n_max * sizeof(*ipe->q));
++    if (ipe->q == NULL)
++        return AVERROR(ENOMEM);
++
++    ipe->n_grp = n_grp;
++    ipe->min_gap = min_gap;
++
++    rv = gpu_malloc_cached(total_size, &ipe->gptr);
++    if (rv != 0)
++        av_freep(&ipe->q);
++
++    return rv;
++}
++
++
++// Base address of a frame's luma plane as used in motion-compensation
++// commands: the frame's data[0] pointer when RPI_QPU_EMU_Y is set, otherwise
++// whatever get_vc_address_y() returns for the frame.
++#if RPI_QPU_EMU_Y
++#define get_mc_address_y(f) ((f)->data[0])
++#else
++#define get_mc_address_y(f) get_vc_address_y(f)
++#endif
++// Same for the chroma plane: data[1] when RPI_QPU_EMU_C is set, otherwise
++// get_vc_address_u().
++#if RPI_QPU_EMU_C
++#define get_mc_address_u(f) ((f)->data[1])
++#else
++#define get_mc_address_u(f) get_vc_address_u(f)
++#endif
++
++// Pack the weight/offset word for mono-directional prediction:
++// PACK2(2 * off + 1, mul).
++static inline uint32_t pack_wo_p(const int off, const int mul)
++{
++    const int off2p1 = off * 2 + 1;
++
++    return PACK2(off2p1, mul);
++}
++
++// Pack the weight/offset word for bi-directional prediction:
++// PACK2(off0 + off1 + 1, mul), i.e. both lists' offsets combined.
++static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul)
++{
++    const int off_sum_p1 = off0 + off1 + 1;
++
++    return PACK2(off_sum_p1, mul);
++}
++
++
++// Queue the luma inter-prediction commands for a mono-directional PU.
++// (x0, y0) / nPbW x nPbH give the destination block in s->frame, mv_xy the
++// motion vector, weight_mul / weight_offset the weighting and src_frame the
++// reference picture.
++// A vector with no fractional part uses the y_p00 QPU function in strips of
++// up to 16 pels; otherwise y_pxx is used, with each command taking two source
++// blocks 8 pels apart.
++static void
++rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const MvXY mv_xy,
++ const int weight_mul,
++ const int weight_offset,
++ AVFrame *const src_frame)
++{
++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++ // Fractional parts of the motion vector (low 2 bits of each component)
++ const unsigned int mx = MV_X(mv_xy) & 3;
++ const unsigned int my = MV_Y(mv_xy) & 3;
++ const unsigned int my_mx = (my << 8) | mx;
++ // The same fraction pair duplicated into both 16-bit halves
++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
++ const uint32_t wo = pack_wo_p(weight_offset, weight_mul);
++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++
++ if (my_mx == 0)
++ {
++ // Whole-pel vector: source position is the destination plus the integer MV
++ const int x1 = x0 + (MV_X(mv_xy) >> 2);
++ const int y1 = y0 + (MV_Y(mv_xy) >> 2);
++ const int bh = nPbH;
++
++ for (int start_x = 0; start_x < nPbW; start_x += 16)
++ {
++ const int bw = FFMIN(nPbW - start_x, 16);
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
++ // Source slot left open by the previous command on this queue
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
++
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++ ++ts->y_pred1_x0y0;
++
++ if (nPbW > 8)
++ ++ts->y_pred1_wgt8;
++ else
++ ++ts->y_pred1_wle8;
++
++ if (nPbH > 16)
++ ++ts->y_pred1_hgt16;
++ else
++ ++ts->y_pred1_hle16;
++ }
++#endif
++
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src_vc_address_y;
++ cmd_y->w = bw;
++ cmd_y->h = bh;
++ cmd_y->wo1 = wo;
++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
++ // This command's next_src1 becomes the slot the following command fills in
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++ else
++ {
++ // Fractional vector: source starts 3 pels up and left of the integer
++ // position (rpi_pred_y_b notes that the filter requires a run-up of 3)
++ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3;
++ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3;
++ const unsigned int bh = nPbH;
++ int start_x = 0;
++
++#if 1
++ // As Y-pred operates on two independent 8-wide src blocks we can merge
++ // this pred with the previous one if the previous one is 8 pel wide,
++ // the same height as the current block, immediately to the left of our
++ // current dest block and mono-pred.
++
++ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
++ {
++ const int bw = FFMIN(nPbW, 8);
++ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1;
++
++ // Use the previous command's second source for our first 8 columns
++ last_y8_src2->x = x1_m3;
++ last_y8_src2->y = y1_m3;
++ last_y8_src2->base = src_vc_address_y;
++ last_y8_p->w += bw;
++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
++ last_y8_p->wo2 = wo;
++
++ // A command can only be merged into once
++ jb->last_y8_p = NULL;
++ jb->last_y8_l1 = NULL;
++ // The first bw columns are now covered by the merged command
++ start_x = bw;
++#if RPI_TSTATS
++ ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge;
++#endif
++ }
++#endif
++
++ for (; start_x < nPbW; start_x += 16)
++ {
++ const int bw = FFMIN(nPbW - start_x, 16);
++ // Load estimate is bh + 7 rows for the filtered path
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++ // mx == 0 && my == 0 cannot occur in this branch (my_mx != 0)
++ if (mx == 0 && my == 0)
++ ++ts->y_pred1_x0y0;
++ else if (mx == 0)
++ ++ts->y_pred1_x0;
++ else if (my == 0)
++ ++ts->y_pred1_y0;
++ else
++ ++ts->y_pred1_xy;
++
++ if (nPbW > 8)
++ ++ts->y_pred1_wgt8;
++ else
++ ++ts->y_pred1_wle8;
++
++ if (nPbH > 16)
++ ++ts->y_pred1_hgt16;
++ else
++ ++ts->y_pred1_hle16;
++ }
++#endif
++ src1->x = x1_m3 + start_x;
++ src1->y = y1_m3;
++ src1->base = src_vc_address_y;
++ if (bw <= 8)
++ {
++ // Only one 8-wide half needed: point the second source at the dummy frame
++ src2->x = MC_DUMMY_X;
++ src2->y = MC_DUMMY_Y;
++#if RPI_QPU_EMU_Y
++ src2->base = s->qpu_dummy_frame_emu;
++#else
++ src2->base = s->qpu_dummy_frame_qpu;
++#endif
++ }
++ else
++ {
++ // Second source covers the right-hand 8 columns of the strip
++ src2->x = x1_m3 + start_x + 8;
++ src2->y = y1_m3;
++ src2->base = src_vc_address_y;
++ }
++ cmd_y->w = bw;
++ cmd_y->h = bh;
++ cmd_y->mymx21 = my2_mx2_my_mx;
++ cmd_y->wo1 = wo;
++ cmd_y->wo2 = wo;
++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++
++ // Remember an exactly-8-wide command so that a following block can
++ // be merged into its unused second source (see above)
++ if (bw == 8) {
++ jb->last_y8_l1 = src2;
++ jb->last_y8_p = cmd_y;
++ }
++ }
++ }
++}
++
++// Queue the luma inter-prediction commands for a bi-directional PU.
++// (x0, y0) / nPbW x nPbH give the destination block in s->frame, mv_field
++// holds both motion vectors and reference indices, src_frame / src_frame2 are
++// the L0 / L1 reference pictures.
++// If neither vector has a fractional part the y_b00 QPU function is used in
++// strips of up to 16 pels, otherwise y_bxx in strips of up to 8.
++static void
++rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const struct HEVCRpiMvField *const mv_field,
++ const AVFrame *const src_frame,
++ const AVFrame *const src_frame2)
++{
++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++ const MvXY mv = mv_field->xy[0];
++ const MvXY mv2 = mv_field->xy[1];
++
++ // Fractional parts of both vectors, packed as (my << 8) | mx per list with
++ // list 1 in the upper 16 bits
++ const unsigned int mx = MV_X(mv) & 3;
++ const unsigned int my = MV_Y(mv) & 3;
++ const unsigned int my_mx = (my<<8) | mx;
++ const unsigned int mx2 = MV_X(mv2) & 3;
++ const unsigned int my2 = MV_Y(mv2) & 3;
++ const unsigned int my2_mx2 = (my2<<8) | mx2;
++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
++ const unsigned int ref_idx0 = mv_field->ref_idx[0];
++ const unsigned int ref_idx1 = mv_field->ref_idx[1];
++ // Both words carry the combined L0 + L1 offset; they differ only in the
++ // weight (L0 weight in wo1, L1 weight in wo2)
++ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]);
++ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]);
++
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++
++ if (my2_mx2_my_mx == 0)
++ {
++ // Both vectors are whole-pel
++ const int x1 = x0 + (MV_X(mv) >> 2);
++ const int y1 = y0 + (MV_Y(mv) >> 2);
++ const int x2 = x0 + (MV_X(mv2) >> 2);
++ const int y2 = y0 + (MV_Y(mv2) >> 2);
++ const int bh = nPbH;
++
++ // Can do chunks a full 16 wide if we don't want the H filter
++ for (int start_x=0; start_x < nPbW; start_x += 16)
++ {
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
++ // Source slots left open by the previous command on this queue
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++ ++ts->y_pred2_x0y0;
++
++ if (nPbH > 16)
++ ++ts->y_pred2_hgt16;
++ else
++ ++ts->y_pred2_hle16;
++ }
++#endif
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src1_base;
++ src2->x = x2 + start_x;
++ src2->y = y2;
++ src2->base = src2_base;
++ cmd_y->w = FFMIN(nPbW - start_x, 16);
++ cmd_y->h = bh;
++ cmd_y->mymx21 = 0;
++ cmd_y->wo1 = wo1;
++ cmd_y->wo2 = wo2;
++ cmd_y->dst_addr = dst + (start_x << xshl);
++ // This command's next_src fields become the slots the next command fills in
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++ else
++ {
++ // Filter requires a run-up of 3
++ const int x1 = x0 + (MV_X(mv) >> 2) - 3;
++ const int y1 = y0 + (MV_Y(mv) >> 2) - 3;
++ const int x2 = x0 + (MV_X(mv2) >> 2) - 3;
++ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3;
++ const int bh = nPbH;
++
++ for (int start_x=0; start_x < nPbW; start_x += 8)
++ { // B blocks work 8 at a time
++ // B weights aren't doubled as the QPU code does the same
++ // amount of work as it does for P
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++ // A direction counts as fractional if either list's vector is
++ const unsigned int mmx = mx | mx2;
++ const unsigned int mmy = my | my2;
++ if (mmx == 0 && mmy == 0)
++ ++ts->y_pred2_x0y0;
++ else if (mmx == 0)
++ ++ts->y_pred2_x0;
++ else if (mmy == 0)
++ ++ts->y_pred2_y0;
++ else
++ ++ts->y_pred2_xy;
++
++ if (nPbH > 16)
++ ++ts->y_pred2_hgt16;
++ else
++ ++ts->y_pred2_hle16;
++ }
++#endif
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src1_base;
++ src2->x = x2 + start_x;
++ src2->y = y2;
++ src2->base = src2_base;
++ cmd_y->w = FFMIN(nPbW - start_x, 8);
++ cmd_y->h = bh;
++ cmd_y->mymx21 = my2_mx2_my_mx;
++ cmd_y->wo1 = wo1;
++ cmd_y->wo2 = wo2;
++ cmd_y->dst_addr = dst + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++}
++
++// Queue the chroma inter-prediction commands for a mono-directional PU.
++// lx selects the reference list (0 = L0, otherwise L1), which decides both the
++// QPU function and which "last source" slot of the queue is chained.
++// (x0_c, y0_c) / nPbW_c x nPbH_c give the destination block in chroma
++// samples, mv the luma motion vector, c_weights / c_offsets the U ([0]) and
++// V ([1]) weighting and src_frame the reference picture.
++// The block is issued in strips of up to RPI_CHROMA_BLOCK_WIDTH.
++// h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++  const unsigned int lx, const int x0_c, const int y0_c,
++  const int nPbW_c, const int nPbH_c,
++  const MvXY mv,
++  const int16_t * const c_weights,
++  const int16_t * const c_offsets,
++  AVFrame * const src_frame)
++{
++    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++    const int hshift = 1; // = s->ps.sps->hshift[1];
++    const int vshift = 1; // = s->ps.sps->vshift[1];
++
++    // Integer source position: the MV has 2 fractional bits in luma units plus
++    // the chroma subsampling shift of the matching direction. The -1 is
++    // presumably the chroma filter run-up - confirm against the QPU code.
++    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
++    // Vertical component uses vshift (was hshift; both are 1 so no change in output)
++    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1;
++    const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
++    // Filter coefficients selected by the fractional part of each MV component
++    const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)];
++    const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)];
++    const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]);
++    const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]);
++    qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++    const unsigned int bh = nPbH_c;
++    const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1;
++
++    for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
++    {
++        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn);
++        qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
++        // Source slot for this list left open by the previous command on the queue
++        qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
++        qpu_mc_src_t * const last_lx = *plast_lx;
++        const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++        last_lx->x = x1_c + start_x;
++        last_lx->y = y1_c;
++        last_lx->base = src_base_u;
++        cmd_c->h = bh;
++        cmd_c->w = bw;
++        cmd_c->coeffs_x = x_coeffs;
++        cmd_c->coeffs_y = y_coeffs;
++        cmd_c->wo_u = wo_u;
++        cmd_c->wo_v = wo_v;
++        cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
++        // This command's next_src becomes the slot the next command fills in
++        *plast_lx = &cmd_c->next_src;
++        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
++    }
++}
++
++// Queue the chroma inter-prediction commands for a bi-directional PU.
++// (x0_c, y0_c) / nPbW_c x nPbH_c give the destination block in chroma
++// samples, mv_field holds both luma motion vectors, c_weights / c_offsets are
++// the L0 U ([0]) and V ([1]) weighting, c_weights2 / c_offsets2 the L1 ones,
++// and src_frame / src_frame2 the L0 / L1 reference pictures.
++// The block is issued in strips of up to RPI_CHROMA_BLOCK_WIDTH.
++// h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++  const int x0_c, const int y0_c,
++  const int nPbW_c, const int nPbH_c,
++  const struct HEVCRpiMvField * const mv_field,
++  const int16_t * const c_weights,
++  const int16_t * const c_offsets,
++  const int16_t * const c_weights2,
++  const int16_t * const c_offsets2,
++  AVFrame * const src_frame,
++  AVFrame * const src_frame2)
++{
++    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++    const int hshift = 1; // s->ps.sps->hshift[1];
++    const int vshift = 1; // s->ps.sps->vshift[1];
++    const MvXY mv = mv_field->xy[0];
++    const MvXY mv2 = mv_field->xy[1];
++
++    // L0: fractional MV part selects the filter, integer part the source position
++    const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift);
++    const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift);
++    const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
++    const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
++    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
++    // Vertical component uses vshift (was hshift; both are 1 so no change in output)
++    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1;
++
++    // L1: same derivation from the second vector
++    const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift);
++    const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift);
++    const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
++    const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
++
++    const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1;
++    const int y2_c = y0_c + (MV_Y(mv2) >> (2 + vshift)) - 1;
++
++    // Combined L0 + L1 offsets packed with the L1 weights
++    const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]);
++    const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]);
++
++    const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++    const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
++    const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
++    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++    const unsigned int bh = nPbH_c;
++
++    for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
++    {
++        const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++        // Load estimate is doubled relative to the mono-pred case
++        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx);
++        qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
++        // Source slots left open by the previous command on this queue
++        qpu_mc_src_t * const src_l0 = cp->last_l0;
++        qpu_mc_src_t * const src_l1 = cp->last_l1;
++
++        src_l0->x = x1_c + start_x;
++        src_l0->y = y1_c;
++        src_l0->base = src1_base;
++        src_l1->x = x2_c + start_x;
++        src_l1->y = y2_c;
++        src_l1->base = src2_base;
++
++        u[0].h = bh;
++        u[0].w = bw;
++        u[0].coeffs_x1 = coefs0_x;
++        u[0].coeffs_y1 = coefs0_y;
++        u[0].weight_u1 = c_weights[0]; // Weight L0 U
++        u[0].weight_v1 = c_weights[1]; // Weight L0 V
++        u[0].coeffs_x2 = coefs1_x;
++        u[0].coeffs_y2 = coefs1_y;
++        u[0].wo_u2 = wo_u2;
++        u[0].wo_v2 = wo_v2;
++        u[0].dst_addr_c = dst_base_u + (start_x << xshl);
++
++        // This command's next_src fields become the slots the next command fills in
++        cp->last_l0 = &u[0].next_src1;
++        cp->last_l1 = &u[0].next_src2;
++        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
++    }
++}
++
++
++// Store the motion field of a prediction unit in the current picture's
++// collocated-MV array (s->ref->col_mvf), which has one entry per 16x16 block.
++// Only grid points (multiples of 16) that fall inside the PU
++// [x0, x0 + w0) x [y0, y0 + h0) are written, so PUs that cover no grid point,
++// or pictures without a col_mvf array, are skipped.
++// Each entry records, per list, either COL_POC_INTRA (list unused) or the
++// reference's long-term flag and list[] value packed by COL_POC_MAKE_INTER,
++// together with the motion vector.
++static inline void
++col_stash(const HEVCRpiContext * const s,
++          const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0,
++          const HEVCRpiMvField * const mvf)
++{
++    ColMvField * const col_mvf = s->ref->col_mvf;
++    // First 16x16 grid column/row at or after the PU origin
++    const unsigned int gx = (x0 + 15) >> 4;
++    const unsigned int gy = (y0 + 15) >> 4;
++    // Number of grid columns/rows covered by the PU
++    const unsigned int gw = ((x0 + 15 + w0) >> 4) - gx;
++    const unsigned int gh = ((y0 + 15 + h0) >> 4) - gy;
++
++    if (col_mvf == NULL || gw == 0 || gh == 0)
++        return;
++
++    {
++        // Only record MV from the top left of the 16x16 block
++
++        const RefPicList * const rpl = s->refPicList;
++        const ColMvField cmv = {
++            .L = {
++                {
++                    .poc = (mvf->pred_flag & PF_L0) == 0 ?
++                        COL_POC_INTRA :
++                        COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]),
++                    .xy = mvf->xy[0]
++                },
++                {
++                    .poc = (mvf->pred_flag & PF_L1) == 0 ?
++                        COL_POC_INTRA :
++                        COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]),
++                    .xy = mvf->xy[1]
++                }
++            }
++        };
++
++        ColMvField * row = col_mvf + gy * s->col_mvf_stride + gx;
++
++        // Fill a gw x gh rectangle of entries with the same value
++        for (unsigned int j = 0; j != gh; ++j)
++        {
++            for (unsigned int i = 0; i != gw; ++i)
++                row[i] = cmv;
++            row += s->col_mvf_stride;
++        }
++    }
++}
++
++// Parse one inter prediction unit and queue its motion compensation.
++//
++// The motion vector field is obtained either by merge (always for a skipped
++// CU, otherwise when merge_flag is decoded as set) or by explicit MVP
++// decoding.  The result is written to the local MV stash and the collocated
++// MV store, the required reference rows are awaited, and finally luma (and
++// chroma, when the stream has chroma) prediction commands are added to the
++// current job.  partIdx is only forwarded to the merge derivation; idx is
++// not used by this function.
++static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                const unsigned int x0, const unsigned int y0,
++                                const unsigned int nPbW, const unsigned int nPbH,
++                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
++{
++    HEVCRpiJob * const jb = lc->jb0;
++    const RefPicList * const rpl = s->refPicList;
++    struct HEVCRpiMvField mv = {{0}};
++    const HEVCRpiFrame *ref_l0 = NULL;
++    const HEVCRpiFrame *ref_l1 = NULL;
++    int merged;
++
++    // merge_flag is only present in the bitstream for non-skipped CUs
++    if (lc->cu.pred_mode == MODE_SKIP) {
++        merged = 1;
++    } else {
++        lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
++        merged = lc->pu.merge_flag != 0;
++    }
++
++    if (merged) {
++        unsigned int merge_idx = 0;
++
++        // With at most one candidate there is nothing to signal
++        if (s->sh.max_num_merge_cand > 1)
++            merge_idx = ff_hevc_rpi_merge_idx_decode(s, lc);
++
++        ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++                                       partIdx, merge_idx, &mv);
++    } else {
++        hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &mv);
++    }
++
++    // Replicate the MV field over every min-PU covered by this PU
++    {
++        HEVCRpiMvField * row = mvf_stash_ptr(s, lc, x0, y0);
++        const unsigned int w_pu = nPbW >> LOG2_MIN_PU_SIZE;
++        unsigned int rows_left = nPbH >> LOG2_MIN_PU_SIZE;
++
++        for (; rows_left != 0; --rows_left, row += MVF_STASH_WIDTH_PU) {
++            unsigned int col;
++            for (col = 0; col != w_pu; ++col)
++                row[col] = mv;
++        }
++    }
++
++    col_stash(s, x0, y0, nPbW, nPbH, &mv);
++
++    // Look up the reference frames; a missing reference abandons the PU
++    if (mv.pred_flag & PF_L0) {
++        ref_l0 = rpl[0].ref[mv.ref_idx[0]];
++        if (!ref_l0)
++            return;
++        hevc_await_progress(s, lc, ref_l0, mv.xy[0], y0, nPbH);
++    }
++    if (mv.pred_flag & PF_L1) {
++        ref_l1 = rpl[1].ref[mv.ref_idx[1]];
++        if (!ref_l1)
++            return;
++        hevc_await_progress(s, lc, ref_l1, mv.xy[1], y0, nPbH);
++    }
++
++    {
++        // Chroma block geometry, shared by all three prediction flavours
++        const int x0_c = x0 >> ctx_hshift(s, 1);
++        const int y0_c = y0 >> ctx_vshift(s, 1);
++        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++        if (mv.pred_flag == PF_L0) {
++            // Uni-prediction from list 0
++            rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, mv.xy[0],
++                       s->sh.luma_weight_l0[mv.ref_idx[0]], s->sh.luma_offset_l0[mv.ref_idx[0]],
++                       ref_l0->frame);
++
++            if (ctx_cfmt(s) != 0)
++                rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, mv.xy[0],
++                           s->sh.chroma_weight_l0[mv.ref_idx[0]], s->sh.chroma_offset_l0[mv.ref_idx[0]],
++                           ref_l0->frame);
++        } else if (mv.pred_flag == PF_L1) {
++            // Uni-prediction from list 1
++            rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, mv.xy[1],
++                       s->sh.luma_weight_l1[mv.ref_idx[1]], s->sh.luma_offset_l1[mv.ref_idx[1]],
++                       ref_l1->frame);
++
++            if (ctx_cfmt(s) != 0)
++                rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, mv.xy[1],
++                           s->sh.chroma_weight_l1[mv.ref_idx[1]], s->sh.chroma_offset_l1[mv.ref_idx[1]],
++                           ref_l1->frame);
++        } else if (mv.pred_flag == PF_BI) {
++            // Bi-prediction from both lists
++            rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &mv, ref_l0->frame, ref_l1->frame);
++
++            if (ctx_cfmt(s) != 0)
++                rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c,
++                             &mv,
++                             s->sh.chroma_weight_l0[mv.ref_idx[0]],
++                             s->sh.chroma_offset_l0[mv.ref_idx[0]],
++                             s->sh.chroma_weight_l1[mv.ref_idx[1]],
++                             s->sh.chroma_offset_l1[mv.ref_idx[1]],
++                             ref_l0->frame,
++                             ref_l1->frame);
++        }
++    }
++}
++
++// Record intra prediction mode `ipm` for the block at (x0, y0) of size
++// 1 << log2_cb_size in the left/up IPM stashes.  For an intra CU in a
++// non-IRAP picture this additionally marks the block in the is_intra bitmap
++// (when allocated), zeroes the matching MV stash entries and, for blocks
++// whose origin is 16-aligned, marks the collocated MV entries as intra.
++static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                    const unsigned int x0, const unsigned int y0,
++                    const unsigned int log2_cb_size,
++                    const unsigned int ipm)
++{
++    const unsigned int pu_in_ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE));
++    const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE;
++    const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE;
++
++    set_stash2(lc->ipm_left + (y_pu & pu_in_ctb_mask), lc->ipm_up + (x_pu & pu_in_ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm);
++
++    // In an IRAP everything is intra and the stashes below are never
++    // consulted, so only intra CUs of non-IRAP pictures need more work
++    if (s->is_irap || lc->cu.pred_mode != MODE_INTRA)
++        return;
++
++    if (s->is_intra != NULL)
++        set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE);
++
++    // Clear the MV stash over the block, one row of min-PUs at a time
++    {
++        HEVCRpiMvField * row = mvf_stash_ptr(s, lc, x0, y0);
++        const unsigned int pus_per_side = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1
++        unsigned int rows_left = pus_per_side;
++
++        do
++        {
++            memset(row, 0, pus_per_side * sizeof(*row));
++            row += MVF_STASH_WIDTH_PU;
++        } while (--rows_left != 0);
++    }
++
++    if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0)
++    {
++        // Only the top-left position of each 16x16 cell is recorded.
++        // Blocks are always aligned on size boundaries so a small block
++        // cannot spill over into a neighbouring cell.
++        ColMvField * cell = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4);
++        const unsigned int cells_per_side = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4));
++        const unsigned int row_skip = s->col_mvf_stride - cells_per_side;
++        unsigned int r, c;
++
++        // cells_per_side is always >= 1 so both loops run at least once
++        for (r = cells_per_side; r != 0; --r)
++        {
++            for (c = cells_per_side; c != 0; --c, ++cell)
++            {
++                cell->L[0].poc = COL_POC_INTRA;
++                cell->L[0].xy = 0;
++                cell->L[1].poc = COL_POC_INTRA;
++                cell->L[1].xy = 0;
++            }
++            cell += row_skip;
++        }
++    }
++}
++
++// Record the default intra prediction mode (INTRA_DC) for a block that does
++// not code its own mode; thin wrapper around set_ipm().
++static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                                       const unsigned int x0, const unsigned int y0,
++                                                       const unsigned int log2_cb_size)
++{
++    const unsigned int default_ipm = INTRA_DC;
++
++    set_ipm(s, lc, x0, y0, log2_cb_size, default_ipm);
++}
++
++
++/**
++ * 8.4.1
++ *
++ * Derive the luma intra prediction mode of one prediction block and record
++ * it via set_ipm().
++ *
++ * Three candidate modes (a, b, c) are built from the modes stashed for the
++ * left and upper neighbours.  When prev_intra_luma_pred_flag is set, idx
++ * selects one of the candidates directly; otherwise idx is a remaining-mode
++ * index that is stepped past each candidate in ascending order.
++ *
++ * @param x0, y0                     luma position of the block
++ * @param log2_pu_size               log2 of the block size
++ * @param prev_intra_luma_pred_flag  non-zero if idx is a candidate index
++ * @param idx                        candidate index or remaining-mode index
++ * @return the derived intra prediction mode
++ */
++static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                int x0, int y0, int log2_pu_size,
++                                int prev_intra_luma_pred_flag,
++                                const unsigned int idx)
++{
++    const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
++    const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
++    const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
++
++    // Up does not cross boundaries so as we always scan 1 slice-tile-line in
++    // an lc we can just keep 1 CTB of stashes.
++    // Left is reset to DC at start of line/tile/slice in fill_job.
++    const unsigned int cand_up = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu];
++    const unsigned int cand_left = lc->ipm_left[yb_pu];
++
++    unsigned int intra_pred_mode;
++    unsigned int a, b, c;
++
++    if (cand_left == cand_up) {
++        if (cand_left < 2) {
++            // Both neighbours are non-angular (planar or DC)
++            a = INTRA_PLANAR;
++            b = INTRA_DC;
++            c = INTRA_ANGULAR_26;
++        } else {
++            // Same angular mode on both sides: use it plus its two angular
++            // neighbours, wrapping within the 32 modes starting at 2
++            a = cand_left;
++            b = 2 + ((cand_left - 2 - 1 + 32) & 31);
++            c = 2 + ((cand_left - 2 + 1) & 31);
++        }
++    } else {
++        // Different neighbours: third candidate is the first of planar, DC,
++        // angular-26 that is not already present
++        a = cand_left;
++        b = cand_up;
++        c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ?
++                INTRA_PLANAR :
++            (cand_left != INTRA_DC && cand_up != INTRA_DC) ?
++                INTRA_DC :
++                INTRA_ANGULAR_26;
++    }
++
++    if (prev_intra_luma_pred_flag) {
++        intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c;
++    } else {
++        // Sort lowest first.  The swap type matches the declared type of
++        // a, b and c (unsigned int) rather than going through int.
++        if (a > b)
++            FFSWAP(unsigned int, a, b);
++        if (a > c)
++            FFSWAP(unsigned int, a, c);
++        if (b > c)
++            FFSWAP(unsigned int, b, c);
++
++        // Step the remaining-mode index past each candidate it reaches
++        intra_pred_mode = idx;
++        if (intra_pred_mode >= a)
++            intra_pred_mode++;
++        if (intra_pred_mode >= b)
++            intra_pred_mode++;
++        if (intra_pred_mode >= c)
++            intra_pred_mode++;
++    }
++
++    /* write the intra prediction units into the mv array */
++    set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode);
++    return intra_pred_mode;
++}
++
++// Mode remapping table indexed by an intra prediction mode in 0..34.
++// intra_prediction_unit() uses it to derive intra_pred_mode_c[0] when the
++// chroma format value is 2.
++static const uint8_t tab_mode_idx[] = {
++     0,  1,  2,  2,  2,  2,  3,  5,  7,  8,
++    10, 12, 13, 15, 17, 18, 19, 20, 21, 22,
++    23, 23, 24, 24, 25, 25, 26, 27, 27, 28,
++    28, 29, 29, 30, 31
++};
++
++// Parse the intra prediction modes of a coding block.
++//
++// Bitstream order is preserved: all prev_intra_luma_pred_flag values first,
++// then one mpm_idx / rem_intra_luma_pred_mode per partition, then the chroma
++// mode(s).  An NxN CU carries four luma partitions, otherwise one.  The
++// number of chroma modes and the final chroma mode mapping depend on the
++// value returned by ctx_cfmt().
++static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                  const unsigned int x0, const unsigned int y0,
++                                  const unsigned int log2_cb_size)
++{
++    static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
++    const int is_nxn = lc->cu.part_mode == PART_NxN;
++    const unsigned int n_parts = is_nxn ? 4 : 1;
++    const unsigned int half = (1 << (log2_cb_size - 1));
++    uint8_t use_mpm[4];
++    unsigned int part;
++    int chroma_mode;
++
++    for (part = 0; part != n_parts; part++)
++        use_mpm[part] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc);
++
++    for (part = 0; part < n_parts; part++) {
++        // Partitions are laid out in Z order: bit 0 = right, bit 1 = lower
++        const unsigned int xp = x0 + ((part & 1) != 0 ? half : 0);
++        const unsigned int yp = y0 + ((part & 2) != 0 ? half : 0);
++        unsigned int idx;
++
++        // Depending on the flag, idx is an mpm index or a remaining mode
++        if (use_mpm[part])
++            idx = ff_hevc_rpi_mpm_idx_decode(lc);
++        else
++            idx = ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc);
++
++        lc->pu.intra_pred_mode[part] =
++            luma_intra_pred_mode(s, lc, xp, yp, log2_cb_size - is_nxn,
++                                 use_mpm[part], idx);
++    }
++
++    switch (ctx_cfmt(s)) {
++    case 0:
++        // No chroma modes are parsed
++        break;
++
++    case 3:
++        // One chroma mode per luma partition
++        for (part = 0; part < n_parts; part++) {
++            chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++            lc->pu.chroma_mode_c[part] = chroma_mode;
++
++            if (chroma_mode == 4)
++                lc->pu.intra_pred_mode_c[part] = lc->pu.intra_pred_mode[part];
++            else if (lc->pu.intra_pred_mode[part] == intra_chroma_table[chroma_mode])
++                lc->pu.intra_pred_mode_c[part] = 34;
++            else
++                lc->pu.intra_pred_mode_c[part] = intra_chroma_table[chroma_mode];
++        }
++        break;
++
++    case 2:
++    {
++        // Single chroma mode, remapped through tab_mode_idx
++        int mode_idx;
++
++        chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++        lc->pu.chroma_mode_c[0] = chroma_mode;
++
++        if (chroma_mode == 4)
++            mode_idx = lc->pu.intra_pred_mode[0];
++        else if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++            mode_idx = 34;
++        else
++            mode_idx = intra_chroma_table[chroma_mode];
++
++        lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
++        break;
++    }
++
++    default:
++        // Any other chroma format: single chroma mode, used directly
++        chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++
++        if (chroma_mode == 4)
++            lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
++        else if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++            lc->pu.intra_pred_mode_c[0] = 34;
++        else
++            lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
++        break;
++    }
++}
++// Parse one coding unit at (x0, y0) of size 1 << log2_cb_size: prediction data, optional PCM samples and the residual transform tree. Returns 0 on success or the negative error code returned by hls_pcm_sample() / hls_transform_tree().
++static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size)
++{
++ const unsigned int cb_size = 1 << log2_cb_size;
++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ const unsigned int min_cb_width = s->ps.sps->min_cb_width;
++ const unsigned int x_cb = x0 >> log2_min_cb_size;
++ const unsigned int y_cb = y0 >> log2_min_cb_size;
++ const unsigned int idx = log2_cb_size - 2; // only forwarded (with per-partition adjustment) to hls_prediction_unit
++ const unsigned int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
++ int skip_flag = 0;
++
++ lc->cu.x = x0;
++ lc->cu.y = y0;
++ lc->cu.x_split = x0;
++ lc->cu.y_split = y0;
++
++ lc->cu.pred_mode = MODE_INTRA; // defaults; overridden below once the actual syntax has been parsed
++ lc->cu.part_mode = PART_2Nx2N;
++ lc->cu.intra_split_flag = 0;
++ lc->cu.cu_transquant_bypass_flag = 0;
++ lc->pu.intra_pred_mode[0] = 1;
++ lc->pu.intra_pred_mode[1] = 1;
++ lc->pu.intra_pred_mode[2] = 1;
++ lc->pu.intra_pred_mode[3] = 1;
++
++ if (s->ps.pps->transquant_bypass_enable_flag) {
++ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc);
++ if (lc->cu.cu_transquant_bypass_flag)
++ set_deblocking_bypass(s, x0, y0, log2_cb_size);
++ }
++
++ if (s->sh.slice_type != HEVC_SLICE_I) { // skip flag is only coded outside I slices
++ lc->cu.pred_mode = MODE_INTER;
++ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb);
++ }
++
++ if (skip_flag) { // skipped CU: a single full-size PU and no residual parsing
++ lc->cu.pred_mode = MODE_SKIP;
++
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); // records INTRA_DC in the intra-mode stashes
++
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++ } else {
++ int pcm_flag = 0;
++
++ if (s->sh.slice_type != HEVC_SLICE_I)
++ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc);
++ if (lc->cu.pred_mode != MODE_INTRA ||
++ log2_cb_size == s->ps.sps->log2_min_cb_size) { // part_mode is parsed for non-intra CUs and for intra CUs of minimum size; otherwise it stays PART_2Nx2N
++ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size);
++ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
++ lc->cu.pred_mode == MODE_INTRA;
++ }
++
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ if (lc->cu.part_mode == PART_2Nx2N &&
++ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled
++ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size &&
++ ff_hevc_rpi_pcm_flag_decode(lc) != 0)
++ {
++ int ret;
++ pcm_flag = 1; // PCM CU: the residual / transform tree section below is skipped
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0)
++ return ret;
++
++ if (s->ps.sps->pcm.loop_filter_disable_flag)
++ set_deblocking_bypass(s, x0, y0, log2_cb_size);
++ } else {
++ intra_prediction_unit(s, lc, x0, y0, log2_cb_size);
++ }
++ } else {
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++ switch (lc->cu.part_mode) { // one hls_prediction_unit call per partition; x_split / y_split record where the CU is divided
++ case PART_2Nx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++ break;
++ case PART_2NxN:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx);
++ lc->cu.y_split = y0 + cb_size / 2;
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
++ break;
++ case PART_Nx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
++ lc->cu.x_split = x0 + cb_size / 2;
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
++ break;
++ case PART_2NxnU:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx);
++ lc->cu.y_split = y0 + cb_size / 4;
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx);
++ break;
++ case PART_2NxnD:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx);
++ lc->cu.y_split = y0 + cb_size / 4 * 3;
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx);
++ break;
++ case PART_nLx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2);
++ lc->cu.x_split = x0 + cb_size / 4;
++ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
++ break;
++ case PART_nRx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2);
++ lc->cu.x_split = x0 + cb_size / 4 * 3;
++ hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2);
++ break;
++ case PART_NxN:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
++ lc->cu.x_split = x0 + cb_size / 2;
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
++ lc->cu.y_split = y0 + cb_size / 2;
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
++ break;
++ }
++ }
++
++ if (!pcm_flag) {
++ int rqt_root_cbf = 1; // assumed set unless the flag is explicitly parsed below
++
++ if (lc->cu.pred_mode != MODE_INTRA &&
++ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) { // the flag is not coded for intra CUs or for 2Nx2N merge CUs
++ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
++ }
++ if (rqt_root_cbf) {
++ const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0);
++ int ret;
++
++ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
++ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
++ s->ps.sps->max_transform_hierarchy_depth_inter;
++ // transform_tree does deblock_boundary_strengths
++ ret = hls_transform_tree(s, lc, x0, y0,
++ log2_cb_size, 0, 0, cbf_c);
++ if (ret < 0)
++ return ret;
++ } else {
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++ }
++ }
++ }
++
++ // If the delta is still wanted then we haven't read the delta & therefore need to set qp here
++ if (lc->tu.is_cu_qp_delta_wanted)
++ ff_hevc_rpi_set_qPy(s, lc, x0, y0);
++
++ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
++ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) { // CU ends on a QP-delta block boundary in both x and y
++ lc->qPy_pred = lc->qp_y;
++ }
++
++ set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff); // low 8 bits of qp_y go into the per-min-CB table starting at (x_cb, y_cb)
++
++ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag); // stashed value packs ct_depth and skip_flag; presumably read back for neighbour CABAC contexts - confirm against the decode helpers
++
++ return 0;
++}
++
++// Recursively parse the coding quadtree node at (x0, y0) of size
++// 1 << log2_cb_size, at depth cb_depth.
++//
++// Returns:
++//  < 0 Error
++//    0 No more data: end_of_slice_segment_flag was read as set, or a split
++//      node's last visited child covers the picture's bottom-right corner
++//  > 0 More data follows in this slice
++//
++// (The value is used as "more_data" by the recursion below, i.e. non-zero
++// means keep parsing.)
++static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++                               const int log2_cb_size, const unsigned int cb_depth)
++{
++    const int cb_size = 1 << log2_cb_size;
++    int split_cu;
++
++    lc->ct_depth = cb_depth;
++
++    // A block larger than the minimum CB size is split by default; the flag
++    // is only coded when the block lies completely inside the picture.
++    split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);
++    if (x0 + cb_size <= s->ps.sps->width &&
++        y0 + cb_size <= s->ps.sps->height &&
++        split_cu)
++    {
++        split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
++    }
++
++    // Qp delta (and offset) need to remain wanted if cb_size < min until
++    // a coded block is found, so the initial state is set at depth 0
++    // (outside this fn) and only reset here
++    if (s->ps.pps->cu_qp_delta_enabled_flag &&
++        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
++    {
++        lc->tu.is_cu_qp_delta_wanted = 1;
++        lc->tu.cu_qp_delta = 0;
++    }
++    if (s->sh.cu_chroma_qp_offset_enabled_flag &&
++        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
++    {
++        lc->tu.cu_chroma_qp_offset_wanted = 1;
++    }
++
++    lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0];
++    lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset;
++    lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset;
++
++    if (split_cu) {
++        const int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
++        const int cb_size_split = cb_size >> 1;
++        const int x1 = x0 + cb_size_split;
++        const int y1 = y0 + cb_size_split;
++        int more_data;
++
++        // Children are visited in Z order; a child is skipped if it starts
++        // outside the picture or if an earlier child reported end of data
++        more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
++        if (more_data < 0)
++            return more_data;
++
++        if (more_data && x1 < s->ps.sps->width) {
++            more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
++            if (more_data < 0)
++                return more_data;
++        }
++        if (more_data && y1 < s->ps.sps->height) {
++            more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
++            if (more_data < 0)
++                return more_data;
++        }
++        if (more_data && x1 < s->ps.sps->width &&
++            y1 < s->ps.sps->height) {
++            more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
++            if (more_data < 0)
++                return more_data;
++        }
++
++        // Node ends on a QP-delta block boundary in both x and y
++        if (((x0 + cb_size) & qp_block_mask) == 0 &&
++            ((y0 + cb_size) & qp_block_mask) == 0)
++            lc->qPy_pred = lc->qp_y;
++
++        if (!more_data)
++            return 0;
++
++        return ((x1 + cb_size_split) < s->ps.sps->width ||
++                (y1 + cb_size_split) < s->ps.sps->height);
++    }
++
++    // Leaf: parse the coding unit itself
++    {
++        const int ctb_size = 1 << (s->ps.sps->log2_ctb_size);
++        const int ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
++
++        if (ret < 0)
++            return ret;
++
++        // end_of_slice_segment_flag is only read after the last CU of a CTB,
++        // i.e. when the CU reaches the CTB or picture edge in both x and y
++        if ((!((x0 + cb_size) % ctb_size) ||
++             (x0 + cb_size >= s->ps.sps->width)) &&
++            (!((y0 + cb_size) % ctb_size) ||
++             (y0 + cb_size >= s->ps.sps->height))) {
++            const int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc);
++            return !end_of_slice_flag;
++        }
++
++        return 1;
++    }
++}
++
++// Set up per-CTB neighbour information in lc for the CTB at (x_ctb, y_ctb)
++// with tile-scan address ctb_addr_ts: records the slice address of the CTB,
++// the clipped CTB end coordinates, the slice/tile boundary flags and the
++// neighbour availability mask.
++static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                 const int x_ctb, const int y_ctb, const int ctb_addr_ts)
++{
++    const unsigned int ts_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
++    const unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++    const unsigned int rs_in_slice = rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice
++    const unsigned int ctbs_per_row = s->ps.sps->ctb_width;
++    const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
++    unsigned int bflags = 0;
++    unsigned int avail = 0;
++
++    s->tab_slice_address[rs] = s->sh.slice_addr;
++
++    lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width);
++    lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
++
++    // Tile boundaries come from the precomputed per-CTB flags, slice
++    // boundaries from comparing recorded slice addresses with the neighbour
++    if ((ts_flags & CTB_TS_FLAGS_SOTL) != 0)
++        bflags |= BOUNDARY_LEFT_TILE;
++    if (x_ctb > 0 && s->tab_slice_address[rs] != s->tab_slice_address[rs - 1])
++        bflags |= BOUNDARY_LEFT_SLICE;
++    if ((ts_flags & CTB_TS_FLAGS_TOT) != 0)
++        bflags |= BOUNDARY_UPPER_TILE;
++    if (y_ctb > 0 && s->tab_slice_address[rs] != s->tab_slice_address[rs - ctbs_per_row])
++        bflags |= BOUNDARY_UPPER_SLICE;
++
++    lc->boundary_flags = bflags;
++
++    // Line width rather than tile width is used for the addr_in_slice tests
++    // as addr_in_slice is in raster units
++    if ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0)
++        avail |= AVAIL_L;
++    if ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0)
++        avail |= AVAIL_U;
++    if ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
++        rs_in_slice > ctbs_per_row)
++        avail |= AVAIL_UL;
++    if ((ts_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
++        rs_in_slice + 1 >= ctbs_per_row)
++        avail |= AVAIL_UR;
++
++    // Down-left never avail at CTB level
++    lc->ctb_avail = avail;
++}
++
++
++// Run the loop filter over the job's bounding block, signal reconstruction
++// progress for the rows it reports as finished, then release the job.
++static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    // Non-zero when the job's last CTU carries the EOT flag
++    const int last_is_eot = (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0;
++    const int done_y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds, last_is_eot);
++
++    if (done_y > 0) {
++        // Progress is held in s, so const has to be cast away here; this
++        // really shouldn't confuse anything
++        ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, done_y - 1);
++    }
++
++    // Job done now
++    // ? Move outside this fn
++    job_free(s->jbc, jb);
++}
++
++// Execute the job's queued prediction commands on the ARM: intra prediction,
++// residual / DC addition for every block type, and PCM extraction.  The
++// queue is marked empty afterwards.  An unknown command type is fatal.
++static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    HEVCRpiIntraPredEnv * const iap = &jb->intra;
++    const HEVCPredCmd *cmd = iap->cmds;
++    unsigned int remaining;
++
++#if !RPI_WORKER_WAIT_PASS_0
++    rpi_sem_wait(&jb->sem);
++    rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1
++#endif
++
++    remaining = iap->n;
++    while (remaining > 0)
++    {
++        switch (cmd->type)
++        {
++            // Intra prediction, luma then chroma
++            case RPI_PRED_INTRA:
++                s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
++                break;
++            case RPI_PRED_INTRA_C:
++                s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
++                break;
++
++            // Residual addition; the DSP tables are indexed by size - 2
++            case RPI_PRED_ADD_RESIDUAL:
++                s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++                break;
++            case RPI_PRED_ADD_DC:
++                s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++                break;
++            case RPI_PRED_ADD_RESIDUAL_U:
++                s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++                break;
++            case RPI_PRED_ADD_RESIDUAL_V:
++                s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++                break;
++            case RPI_PRED_ADD_RESIDUAL_C:
++                s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++                break;
++            case RPI_PRED_ADD_DC_U:
++            case RPI_PRED_ADD_DC_V:
++                s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++                break;
++
++            case RPI_PRED_I_PCM:
++                pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
++                break;
++
++            default:
++                av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
++                abort();
++        }
++
++        ++cmd;
++        --remaining;
++    }
++
++    // Mark done
++    iap->n = 0;
++}
++
++
++// Prepare a job for a new run of CTUs starting at ctu_ts_first.
++//
++// If the SPS differs from the one the job was last set up for, the job's
++// coefficient buffers are reallocated and the inter-pred environments are
++// re-initialised for the new bit depth.  The per-queue inter prediction
++// uniforms are then reset to their initial values, all progress requests
++// are cleared and the coefficient buffers are reset.
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
++{
++    const HEVCRpiSPS * const sps = s->ps.sps;
++    HEVCRpiInterPredEnv * const chroma_env = &jb->chroma_ip;
++    HEVCRpiInterPredEnv * const luma_env = &jb->luma_ip;
++
++    const uint16_t luma_w = sps->width;
++    const uint16_t luma_h = sps->height;
++    const uint16_t chroma_w = sps->width >> ctx_hshift(s, 1);
++    const uint16_t chroma_h = sps->height >> ctx_vshift(s, 1);
++
++    unsigned int n;
++
++    // The pointer is expected to change whenever another sps is used
++    if (jb->sps != sps)
++    {
++        const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
++        int coefs_per_chroma;
++
++        worker_pic_free_one(jb);
++
++        set_ipe_from_ici(chroma_env, &ipe_init_infos[sps->bit_depth - 8].chroma);
++        set_ipe_from_ici(luma_env, &ipe_init_infos[sps->bit_depth - 8].luma);
++
++        coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));
++        worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
++
++        jb->sps = sps;
++    }
++
++    jb->waited = 0;
++    jb->ctu_ts_first = ctu_ts_first;
++    jb->ctu_ts_last = -1;
++
++    // Chroma queues: initial setup uniforms, commands start right after them
++    rpi_inter_pred_reset(chroma_env);
++    for (n = 0; n < chroma_env->n; n++) {
++        HEVCRpiInterPredQ * const q = chroma_env->q + n;
++        qpu_mc_pred_c_s_t * const setup = &q->qpu_mc_base->c.s;
++
++        setup->next_src1.x = 0;
++        setup->next_src1.y = 0;
++        setup->next_src1.base = 0;
++        setup->pic_cw = chroma_w;
++        setup->pic_ch = chroma_h;
++        setup->stride2 = av_rpi_sand_frame_stride2(s->frame);
++        setup->stride1 = av_rpi_sand_frame_stride1(s->frame);
++        q->last_l0 = &setup->next_src1;
++
++        setup->next_fn = 0;
++        setup->next_src2.x = 0;
++        setup->next_src2.y = 0;
++        setup->next_src2.base = 0;
++        q->last_l1 = &setup->next_src2;
++
++        q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(setup + 1);
++    }
++
++    // Luma queues, same scheme
++    rpi_inter_pred_reset(luma_env);
++    for (n = 0; n < luma_env->n; n++) {
++        HEVCRpiInterPredQ * const q = luma_env->q + n;
++        qpu_mc_pred_y_s_t * const setup = &q->qpu_mc_base->y.s;
++
++        setup->next_src1.x = 0;
++        setup->next_src1.y = 0;
++        setup->next_src1.base = 0;
++        setup->next_src2.x = 0;
++        setup->next_src2.y = 0;
++        setup->next_src2.base = 0;
++        setup->pic_h = luma_h;
++        setup->pic_w = luma_w;
++        setup->stride2 = av_rpi_sand_frame_stride2(s->frame);
++        setup->stride1 = av_rpi_sand_frame_stride1(s->frame);
++        setup->next_fn = 0;
++        q->last_l0 = &setup->next_src1;
++        q->last_l1 = &setup->next_src2;
++
++        q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(setup + 1);
++    }
++
++    jb->last_y8_p = NULL;
++    jb->last_y8_l1 = NULL;
++
++    // -1 = no progress requirement recorded for that slot
++    for (n = 0; n != FF_ARRAY_ELEMS(jb->progress_req); ++n)
++        jb->progress_req[n] = -1;
++
++    worker_pic_reset(&jb->coeffs);
++}
++
++
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++// Terminate every command queue of an inter-pred environment and add the
++// set to the QPU job `vqj`.  Each queue is linked to its exit code, the
++// pending L0/L1 source slots are pointed at the dummy frame, and the
++// uniform memory is queued for cache writeback.
++// Returns 0 if the environment was not used (nothing added), otherwise 1.
++static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s,
++                                         const vpu_qpu_job_h vqj,
++                                         rpi_cache_flush_env_t * const rfe,
++                                         HEVCRpiInterPredEnv * const ipe)
++{
++    uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
++    unsigned int deepest = 0;
++    unsigned int qn;
++
++    if (!ipe->used)
++        return 0;
++
++    if (ipe->curr != 0)
++        rpi_inter_pred_sync(ipe);
++
++    // Add final commands to each Q
++    for (qn = 0; qn != ipe->n; ++qn) {
++        HEVCRpiInterPredQ * const q = ipe->q + qn;
++        qpu_mc_src_t * const src0 = q->last_l0;
++        qpu_mc_src_t * const src1 = q->last_l1;
++        const unsigned int bytes_used = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
++
++        // Track the fullest queue for the writeback below
++        if (bytes_used > deepest)
++            deepest = bytes_used;
++
++        qpu_mc_link_set(q->qpu_mc_curr, q->code_exit);
++
++        // The srcs for L0 & L1 need to be set to something that can be
++        // (pointlessly) prefetched
++        src0->x = MC_DUMMY_X;
++        src0->y = MC_DUMMY_Y;
++        src0->base = s->qpu_dummy_frame_qpu;
++        src1->x = MC_DUMMY_X;
++        src1->y = MC_DUMMY_Y;
++        src1->base = s->qpu_dummy_frame_qpu;
++
++        q->last_l0 = NULL;
++        q->last_l1 = NULL;
++
++        // Mailbox entry: address of the queue's uniforms, then setup code
++        mail[qn][0] = ipe->gptr.vc + ((uint8_t *)q->qpu_mc_base - ipe->gptr.arm);
++        mail[qn][1] = q->code_setup;
++    }
++
++    // We don't need invalidate here as the uniforms aren't changed by the QPU
++    // and leaving them in ARM cache avoids (pointless) pre-reads when writing
++    // new values which seems to give us a small performance advantage
++    //
++    // In most cases we will not have a completely packed set of uniforms and as
++    // we have a 2d invalidate we writeback all uniform Qs to the depth of the
++    // fullest
++    rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
++                                  (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, deepest,
++                                  ipe->n, ipe->max_fill + ipe->min_gap);
++    vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
++
++    return 1;
++}
++#endif
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++// Emulation counterpart of the queue terminator: writes the exit code into
++// the last data word before each queue's current position and points the
++// pending L0/L1 source slots at the emulation dummy frame.  vqj and rfe are
++// not used here.
++// Returns 0 if the environment was not used, otherwise 1.
++static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s,
++                                         const vpu_qpu_job_h vqj,
++                                         rpi_cache_flush_env_t * const rfe,
++                                         HEVCRpiInterPredEnv * const ipe)
++{
++    unsigned int qn;
++
++    if (!ipe->used)
++        return 0;
++
++    if (ipe->curr != 0)
++        rpi_inter_pred_sync(ipe);
++
++    // Add final commands to each Q
++    for (qn = 0; qn != ipe->n; ++qn) {
++        HEVCRpiInterPredQ * const q = ipe->q + qn;
++        qpu_mc_src_t * const src0 = q->last_l0;
++        qpu_mc_src_t * const src1 = q->last_l1;
++
++        q->qpu_mc_curr->data[-1] = q->code_exit;
++
++        // The srcs for L0 & L1 need to be set to something that can be
++        // (pointlessly) prefetched
++        src0->x = MC_DUMMY_X;
++        src0->y = MC_DUMMY_Y;
++        src0->base = s->qpu_dummy_frame_emu;
++        src1->x = MC_DUMMY_X;
++        src1->y = MC_DUMMY_Y;
++        src1->base = s->qpu_dummy_frame_emu;
++
++        q->last_l0 = NULL;
++        q->last_l1 = NULL;
++    }
++
++    return 1;
++}
++#endif
++
++// Bind the luma (_y) and chroma (_c) queue terminators to the emulated or the real-QPU implementation according to RPI_QPU_EMU_Y / RPI_QPU_EMU_C.
++#if RPI_QPU_EMU_Y
++#define mc_terminate_add_y mc_terminate_add_emu
++#else
++#define mc_terminate_add_y mc_terminate_add_qpu
++#endif
++#if RPI_QPU_EMU_C
++#define mc_terminate_add_c mc_terminate_add_emu
++#else
++#define mc_terminate_add_c mc_terminate_add_qpu
++#endif
++
++
++// Write back and invalidate the cache over a whole frame.
++// The context argument s is not used.
++static void flush_frame(HEVCRpiContext *s,AVFrame *frame)
++{
++    rpi_cache_buf_t flush_storage;
++    rpi_cache_flush_env_t * const env = rpi_cache_flush_init(&flush_storage);
++
++    rpi_cache_flush_add_frame(env, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++    rpi_cache_flush_finish(env);
++}
++
++// Compute the pixel rectangle (jb->bounds) covered by the job's CTUs, from
++// the raster addresses of its first and last CTU, clipped to the picture.
++static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    const unsigned int first_rs = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first];
++    const unsigned int last_rs = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last];
++    const unsigned int ctbs_per_row = s->ps.sps->ctb_width;
++    const unsigned int rs_span = last_rs - first_rs;
++    RpiBlk * const blk = &jb->bounds;
++
++    av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last);
++
++    // Origin: CTB column / row of the first CTU, scaled to pixels
++    blk->x = (first_rs % ctbs_per_row) << s->ps.sps->log2_ctb_size;
++    blk->y = (first_rs / ctbs_per_row) << s->ps.sps->log2_ctb_size;
++
++    // Extent: CTB columns / rows spanned, inclusive, scaled to pixels
++    blk->w = (rs_span % ctbs_per_row + 1) << s->ps.sps->log2_ctb_size;
++    blk->h = (rs_span / ctbs_per_row + 1) << s->ps.sps->log2_ctb_size;
++
++    // Clip to the picture
++    blk->w = FFMIN(blk->w, s->ps.sps->width - blk->x);
++    blk->h = FFMIN(blk->h, s->ps.sps->height - blk->y);
++}
++
++#if RPI_PASSES == 2
++// Second worker stage (only built when RPI_PASSES == 2): runs the queued
++// prediction commands and then the deblock stage for the same job.
++static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    // Intra prediction and residual reconstruction first ...
++    rpi_execute_pred_cmds(s, jb);
++
++    // ... then deblocking for the CTBs of this job (this also frees the job)
++    rpi_execute_dblk_cmds(s, jb);
++}
++#endif
++// Core execution tasks for one job: queue the VPU coefficient job (if any coefficients are held in buffers 2/3), terminate and queue the chroma and luma inter-pred command lists,
++// add the completion sync, flush caches, wait for any required reference progress, submit the job, and run the emulated shaders when those are compiled in.
++static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ int pred_y, pred_c;
++ vpu_qpu_job_env_t qvbuf;
++ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf);
++#if RPI_WORKER_WAIT_PASS_0
++ int do_wait;
++#endif
++
++ {
++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++ if (cf->s[3].n + cf->s[2].n != 0) // only add a VPU job when buffer 2 or 3 holds coefficients
++ {
++ const unsigned int csize = sizeof(cf->s[3].buf[0]);
++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; // byte offset, relative to s[2].buf, of the element s[3].n entries below s[3].buf - presumably buffer 3 fills downwards; TODO confirm
++ unsigned int n16 = (cf->s[2].n >> 8); // count expressed in units of 256 coefficients
++ unsigned int n32 = (cf->s[3].n >> 10); // count expressed in units of 1024 coefficients
++#if RPI_COMPRESS_COEFFS
++ if (cf->s[2].packed) {
++ n16 = n16 | (n16<<16);
++ } else {
++ const unsigned int npack16 = (cf->s[2].packed_n>>8);
++ n16 = n16 | (npack16<<16);
++ }
++ if (cf->s[3].packed) {
++ n32 = n32 | (n32<<16);
++ } else {
++ const unsigned int npack32 = (cf->s[3].packed_n>>10);
++ n32 = n32 | (npack32<<16);
++ }
++#endif
++ vpu_qpu_job_add_vpu(vqj,
++ vpu_get_fn(s->ps.sps->bit_depth),
++ vpu_get_constants(),
++ cf->gptr.vc,
++ n16,
++ cf->gptr.vc + offset32,
++ n32,
++ 0);
++
++ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
++ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
++ }
++ }
++
++ pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip); // non-zero if chroma inter prediction was used in this job
++
++// We could take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++
++ pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip); // non-zero if luma inter prediction was used in this job
++
++ // Returns 0 if nothing to do, 1 if sync added
++#if RPI_WORKER_WAIT_PASS_0
++ do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem);
++#else
++ if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0)
++ sem_post(&jb->sem); // no sync was queued, so post the semaphore here for the later rpi_sem_wait
++#endif
++
++ rpi_cache_flush_execute(jb->rfe);
++
++ // Await progress as required
++ // jb->waited will only be clear if we have already tested the progress values
++ // (in worker_submit_job) and found we don't have to wait
++ if (jb->waited)
++ {
++ unsigned int i;
++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++ if (jb->progress_req[i] >= 0) { // negative means no requirement on DPB entry i (rpi_begin resets entries to -1)
++ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
++ }
++ }
++ }
++
++ vpu_qpu_job_finish(vqj);
++
++ // We always work on a rectangular block
++ if (pred_y || pred_c)
++ {
++ rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
++ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
++ ctx_vshift(s, 1), pred_y, pred_c);
++ }
++
++ // If we have emulated VPU ops - do it here
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ if (av_rpi_is_sand8_frame(s->frame))
++ {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
++#else
++ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
++#endif
++ }
++ else
++ {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
++#else
++ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
++#endif
++ }
++#endif
++
++#if RPI_WORKER_WAIT_PASS_0
++ if (do_wait)
++ rpi_sem_wait(&jb->sem);
++ rpi_cache_flush_execute(jb->rfe);
++#endif
++}
++
++
++// Release what an inter-prediction environment owns: the q array (av_freep
++// also resets ipe->q to NULL) and the buffer described by ipe->gptr.
++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
++{
++    av_freep(&ipe->q);
++    gpu_free(&ipe->gptr);
++}
++
++// Allocate and initialise a new job.
++//
++// Sets up the job semaphore (count 0), the cache-flush environment, the
++// progress-wait structure, the intra command array and the chroma & luma
++// inter-prediction environments.
++//
++// Returns the new job, or NULL if any allocation fails. On failure
++// everything allocated so far, including the job itself, is released.
++static HEVCRpiJob * job_new(void)
++{
++    HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
++
++    if (jb == NULL)
++        return NULL;
++
++    sem_init(&jb->sem, 0, 0);
++    jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
++    ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
++
++    jb->intra.n = 0;
++    if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL)
++        goto fail1;
++
++    // * Sizeof the union structure might be overkill but at the moment it
++    //   is correct (it certainly isn't going to be too small)
++    //   Set max fill to slack/2 from the end of the Q
++    //   If we exceed this in any Q then we will schedule by size (which should
++    //   mean that we never use that Q again apart from syncs)
++    // * Given how aggressive the overflow response is we could maybe put the
++    //   threshold even nearer the end, but I don't expect us to ever hit
++    //   it on any real stream anyway.
++
++    if (rpi_inter_pred_alloc(&jb->chroma_ip,
++                             QPU_N_MAX, QPU_N_GRP,
++                             QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t),
++                             QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0)
++        goto fail2;
++    if (rpi_inter_pred_alloc(&jb->luma_ip,
++                             QPU_N_MAX, QPU_N_GRP,
++                             QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t),
++                             QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0)
++        goto fail3;
++
++    return jb;
++
++fail3:
++    // Luma alloc failed. The luma free is kept from the original code;
++    // chroma was allocated successfully and must be released here too
++    // (it was previously leaked).
++    rpi_free_inter_pred(&jb->luma_ip);
++    rpi_free_inter_pred(&jb->chroma_ip);
++fail2:
++    av_freep(&jb->intra.cmds);
++fail1:
++    ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
++    rpi_cache_flush_finish(jb->rfe);
++    sem_destroy(&jb->sem);
++    // The job structure itself was previously leaked on every failure path
++    av_free(jb);
++    return NULL;
++}
++
++// Tear down a job built by job_new and free it.
++// jb must be non-NULL; callers in this file check before calling.
++static void job_delete(HEVCRpiJob * const jb)
++{
++    worker_pic_free_one(jb);
++    ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
++
++    // Inter-prediction environments and the intra command array
++    rpi_free_inter_pred(&jb->chroma_ip);
++    rpi_free_inter_pred(&jb->luma_ip);
++    av_freep(&jb->intra.cmds);
++
++    // Not really needed - should do nothing
++    rpi_cache_flush_finish(jb->rfe);
++
++    sem_destroy(&jb->sem);
++    av_free(jb);
++}
++
++// Destroy a global job context: delete every job on its free1 list, destroy
++// its lock and free the structure. A NULL jbg is ignored.
++static void jbg_delete(HEVCRpiJobGlobal * const jbg)
++{
++    HEVCRpiJob * cur;
++    HEVCRpiJob * following;
++
++    if (jbg == NULL)
++        return;
++
++    // Pick up the next pointer before the current job is freed
++    for (cur = jbg->free1; cur != NULL; cur = following)
++    {
++        following = cur->next;
++        job_delete(cur);
++    }
++
++    pthread_mutex_destroy(&jbg->lock);
++    av_free(jbg);
++}
++
++// Create a global job context holding job_count freshly allocated jobs on
++// its free1 list.
++// Returns NULL if the context or any job cannot be allocated; jobs created
++// before the failure are deleted again.
++static HEVCRpiJobGlobal * jbg_new(unsigned int job_count)
++{
++    unsigned int made;
++    HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal));
++
++    if (jbg == NULL)
++        return NULL;
++
++    pthread_mutex_init(&jbg->lock, NULL);
++
++    for (made = 0; made != job_count; ++made)
++    {
++        HEVCRpiJob * const fresh = job_new();
++
++        if (fresh == NULL)
++        {
++            jbg_delete(jbg);
++            return NULL;
++        }
++
++        // Push onto the head of the free list
++        fresh->next = jbg->free1;
++        jbg->free1 = fresh;
++    }
++
++    return jbg;
++}
++
++// Destroy a per-context job control: delete its local job (if any), destroy
++// its lock and output semaphore, free it and then drop its reference on the
++// global job context, deleting that too when the last reference goes.
++// A NULL jbc is ignored.
++static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc)
++{
++    if (jbc != NULL)
++    {
++        // Remember the global context as jbc is freed before the deref
++        HEVCRpiJobGlobal * const shared = jbc->jbg;
++
++        if (jbc->jb1 != NULL)
++            job_delete(jbc->jb1);
++
++        pthread_mutex_destroy(&jbc->in_lock);
++        sem_destroy(&jbc->sem_out);
++        av_free(jbc);
++
++        // Deref the global job context
++        if (shared == NULL)
++            return;
++        if (atomic_fetch_add(&shared->ref_count, -1) == 1)
++            jbg_delete(shared);
++    }
++}
++
++// Create a job control attached to the global job context jbg (taking a
++// reference on it) with one local job of its own.
++// Returns NULL on allocation failure. If the local job cannot be created the
++// partly built control is passed to rpi_job_ctl_delete, which also drops the
++// reference just taken on jbg.
++static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
++{
++    HEVCRpiJob * local_job;
++    HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl));
++
++    if (jbc == NULL)
++        return NULL;
++
++    jbc->jbg = jbg;
++    atomic_fetch_add(&jbg->ref_count, 1);
++
++    sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS);
++    pthread_mutex_init(&jbc->in_lock, NULL);
++
++    local_job = job_new();
++    jbc->jb1 = local_job;
++    if (local_job == NULL)
++    {
++        rpi_job_ctl_delete(jbc);
++        return NULL;
++    }
++
++    // Mark the job as belonging to this control
++    local_job->jbc_local = jbc;
++    return jbc;
++}
++
++
++
++// Set up the worker pass queues and start them.
++// Each pass is given the semaphore it signals when done: the final pass
++// signals s->jbc->sem_out, earlier passes signal the sem_in of the pass that
++// follows them. Pass 0 is always worker_core.
++static av_cold void hevc_init_worker(HEVCRpiContext * const s)
++{
++#if RPI_PASSES == 2
++    // Pass 1 does prediction + deblock together
++    pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
++#elif RPI_PASSES == 3
++    // Pass 2 = deblock, pass 1 = prediction
++    pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
++    pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
++#else
++#error Passes confused
++#endif
++    pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);
++
++    pass_queues_start_all(s);
++}
++
++// Shut the worker down: terminate and kill all pass queues, then delete the
++// job control and clear the context's pointer to it.
++static av_cold void hevc_exit_worker(HEVCRpiContext *s)
++{
++    pass_queues_term_all(s);
++    pass_queues_kill_all(s);
++
++    rpi_job_ctl_delete(s->jbc);
++    s->jbc = NULL;
++}
++
++
++// Validate the current slice header against the PPS and prepare lc for
++// decoding the slice segment.
++//
++// Returns 0 on success or AVERROR_INVALIDDATA if
++//  - a dependent slice segment starts at ts 0,
++//  - the segment preceding a dependent slice segment belongs to another slice,
++//  - (no entropy coding sync) the entry points run past the last tile, or
++//  - (no entropy coding sync) there are entry points but the slice does not
++//    start at the start of its tile.
++static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
++{
++    const HEVCRpiPPS * const pps = s->ps.pps;
++    const int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const int tiles = pps->num_tile_rows * pps->num_tile_columns;
++    const unsigned int tile_id = pps->tile_id[ctb_addr_ts];
++
++    if (s->sh.dependent_slice_segment_flag)
++    {
++        // Check for obvious disasters
++        if (ctb_addr_ts == 0) {
++            av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
++            return AVERROR_INVALIDDATA;
++        }
++
++        // ctb_addr_ts != 0 from the check above so ts - 1 is valid
++        if (s->tab_slice_address[pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1]] != s->sh.slice_addr) {
++            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    if (!pps->entropy_coding_sync_enabled_flag)
++    {
++        if (tile_id + s->sh.num_entry_point_offsets >= tiles)
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
++            return AVERROR_INVALIDDATA;
++        }
++
++        // Tiled stuff must start at start of tile if it has multiple entry points
++        if (s->sh.num_entry_point_offsets != 0 &&
++            ctb_addr_ts != pps->tile_pos_ts[tile_id])
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    ff_hevc_rpi_cabac_init_decoder(lc);
++
++    // Decode state for the new segment: a cabac init is requested unless
++    // this is a dependent segment
++    lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
++    lc->qp_y = s->sh.slice_qp;
++
++    // General setup
++    lc->bt_line_no = 0;
++    lc->ts = ctb_addr_ts;
++    return 0;
++}
++
++// Convert the slice header's entry_point_offset[] table into per-section
++// byte offsets and sizes (sh->offset[], sh->size[]) relative to nal->data.
++//
++// The offsets in the slice header include emulation prevention bytes.
++// Those have been removed by the time we get here so each section is
++// shortened by the number of skipped bytes that fell inside it; the nal
++// layer records where they were in skipped_bytes_pos[].
++//
++// With no entry points s->data is set to NULL and 0 is returned.
++// Returns 0 on success or AVERROR_INVALIDDATA for an inconsistent table.
++static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++    const GetBitContext * const gb = &s->HEVClc->gb;
++    RpiSliceHeader * const sh = &s->sh;
++    const unsigned int length = nal->size;
++    // We have a bit & align still to come = +1 byte
++    unsigned int offset = ((gb->index) >> 3) + 1;
++    unsigned int cmpt = 0;
++    unsigned int startheader;
++    int i, j;
++
++    if (sh->num_entry_point_offsets == 0) {
++        s->data = NULL;
++        return 0;
++    }
++
++    // Count the skipped bytes inside the first section
++    startheader = offset + sh->entry_point_offset[0];
++    for (j = 0; j < nal->skipped_bytes; j++) {
++        if (nal->skipped_bytes_pos[j] < offset || nal->skipped_bytes_pos[j] >= startheader)
++            continue;
++        startheader--;
++        cmpt++;
++    }
++
++    for (i = 1; i < sh->num_entry_point_offsets; i++) {
++        // Step over the previous section (less its skipped bytes) then
++        // count the skipped bytes inside this one
++        offset += (sh->entry_point_offset[i - 1] - cmpt);
++        cmpt = 0;
++        startheader = offset + sh->entry_point_offset[i];
++        for (j = 0; j < nal->skipped_bytes; j++) {
++            if (nal->skipped_bytes_pos[j] < offset || nal->skipped_bytes_pos[j] >= startheader)
++                continue;
++            startheader--;
++            cmpt++;
++        }
++
++        if (sh->entry_point_offset[i] <= cmpt) {
++            av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n");
++            return AVERROR_INVALIDDATA;
++        }
++        sh->size[i - 1] = sh->entry_point_offset[i] - cmpt;
++        sh->offset[i - 1] = offset;
++    }
++
++    // The final section runs to the end of the NAL
++    offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt;
++    if (length < offset) {
++        av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
++        return AVERROR_INVALIDDATA;
++    }
++    sh->size[sh->num_entry_point_offsets - 1] = length - offset;
++    sh->offset[sh->num_entry_point_offsets - 1] = offset;
++
++    // Remember data start pointer as we won't have nal later
++    s->data = nal->data;
++    return 0;
++}
++
++
++// Decode CTBs from the bitstream into the current job (lc->jb0), starting
++// at ts address lc->ts.
++//
++// Decoding stops when the slice data ends, when the end of a tile line is
++// reached or an inter-prediction Q overflows (the job is then marked ready),
++// or after max_blocks CTBs. max_blocks == 0 means no limit.
++//
++// On exit lc->ts is the next ts address to decode and lc->unit_done is set
++// if the slice data has ended.
++//
++// Return
++// < 0 Error
++// 0 OK
++//
++// jb->ctu_ts_last < 0 Job still filling
++// jb->ctu_ts_last >= 0 Job ready
++
++static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks)
++{
++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
++ const unsigned int ctb_size = (1 << log2_ctb_size);
++ HEVCRpiJob * const jb = lc->jb0;
++ int more_data = 1;
++ unsigned int ctb_addr_ts = lc->ts;
++ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size;
++ // y is constant: the loop always breaks at the end of a tile line (see q_full)
++ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size;
++
++ lc->unit_done = 0;
++
++ // N.B. s->ps.sps->ctb_size is the bound on ts addresses here - it is not
++ // the local ctb_size (CTB edge length in pels)
++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size)
++ {
++ int q_full;
++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
++
++ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
++
++ ff_hevc_rpi_cabac_init(s, lc, ctb_flags);
++
++ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size);
++
++ // Record per-CTB deblock parameters from the slice header
++ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
++ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset;
++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
++
++ // Zap stashes if navail
++ if ((lc->ctb_avail & AVAIL_U) == 0)
++ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3);
++ if ((lc->ctb_avail & AVAIL_L) == 0)
++ {
++ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE);
++ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3);
++ }
++#if MVF_STASH_WIDTH > 64
++ // Restore left mvf stash at start of tile if not at start of line
++ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap)
++ {
++ unsigned int i;
++ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0);
++ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
++ {
++ *dst = *src++;
++ dst += MVF_STASH_WIDTH_PU;
++ }
++ }
++#endif
++
++ // Set initial tu states
++ lc->tu.cu_qp_delta = 0;
++ lc->tu.is_cu_qp_delta_wanted = 0;
++ lc->tu.cu_chroma_qp_offset_wanted = 0;
++
++ // Decode
++ // Result: < 0 error, 0 no more data, > 0 more data follows
++ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0);
++
++ if (ff_hevc_rpi_cabac_overflow(lc))
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n ");
++ more_data = AVERROR_INVALIDDATA;
++ }
++
++ if (more_data < 0) {
++ s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken
++ return more_data;
++ }
++
++ // At end of tile (or end of tile line with entropy coding sync) and with
++ // more data to come: consume the terminate element and realign the decoder
++ if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
++ (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
++ {
++ if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 ||
++ ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL)
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n ");
++ // NOTE(review): unlike the error path above this returns a bare -1 and
++ // does not mark the slice as broken - confirm that is intended
++ return -1;
++ }
++ }
++
++ // --- Post CTB processing
++
++ // Stash rpl top/left for deblock that needs to remember such things cross-slice
++ s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList;
++ s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList;
++
++ if (!s->is_irap)
++ {
++ // Copy MVF up to up-left & stash to up
++ {
++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
++ HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
++
++ // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
++
++ // Save the old last "up" entry before it is overwritten by the copy
++ lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
++ memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
++ }
++ // Stash sideways if end of tile line but not end of line (no point)
++ // ** Could/should do this @ end of fn
++ // (If MVF_STASH_WIDTH <= 64 the stash is done unconditionally)
++#if MVF_STASH_WIDTH > 64
++ if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
++#endif
++ {
++ unsigned int i;
++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
++ HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
++ {
++ *dst++ = *src;
++ src += MVF_STASH_WIDTH_PU;
++ }
++ }
++ }
++
++ if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
++ ff_hevc_rpi_save_states(s, lc);
++
++ // Report progress so we can use our MVs in other frames
++ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
++ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
++
++ // End of line || End of tile line || End of tile
++ // (EoL covers end of frame for our purposes here)
++ q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
++
++ // Allocate QPU chunks on fixed size 64 pel boundaries rather than
++ // whatever ctb_size is today.
++ // * We might quite like to continue to 64 pel vertical too but that
++ // currently confuses WPP
++ if (((x_ctb + ctb_size) & 63) == 0 || q_full)
++ {
++ int overflow = 0;
++ // Both Qs must be advanced even if the first reports overflow
++ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
++ overflow = 1;
++ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
++ overflow = 1;
++ if (overflow)
++ {
++ // * This is very annoying (and slow) to cope with in WPP so
++ // we treat it as an error there (no known stream triggers this
++ // with the current buffer sizes). Non-wpp should cope fine.
++ av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__);
++ q_full = 1;
++ }
++ }
++
++ // Inc TS to next.
++ // NOTE(review): rs is stepped rather than re-derived from ts; this looks
++ // safe only because q_full breaks the loop at every end of tile line
++ ctb_addr_ts++;
++ ctb_addr_rs++;
++ x_ctb += ctb_size;
++
++ if (q_full)
++ {
++ // Do job
++ // Prep for submission
++ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-inced
++ job_gen_bounds(s, jb);
++ break;
++ }
++
++ // If max_blocks started as 0 then this will never be true
++ if (--max_blocks == 0)
++ break;
++ }
++
++ // more_data can't be < 0 here (we return early on error) so this is set
++ // exactly when the quadtree decode reported no more data
++ lc->unit_done = (more_data <= 0);
++ lc->ts = ctb_addr_ts;
++ return 0;
++}
++
++// Initialise the bit-thread part of local context lc as thread number n of
++// decoder s: no job, not terminating, no output semaphore linked yet and an
++// input semaphore with a count of 0.
++static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
++{
++    // Identity
++    lc->context = s;
++    lc->lc_n = n;
++
++    // Run state
++    lc->jb0 = NULL;
++    lc->bt_terminate = 0;
++
++    // Semaphores - the out link is set up later when the ring is built
++    lc->bt_psem_out = NULL;
++    sem_init(&lc->bt_sem_in, 0, 0);
++}
++
++#define TRACE_WPP 0
++#if RPI_EXTRA_BIT_THREADS > 0
++// Width of the tile column containing ts address ts, as given by the PPS
++// column_width table.
++static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
++{
++    const unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
++    const unsigned int x_in_ctbs = rs % s->ps.sps->ctb_width;
++
++    return s->ps.pps->column_width[s->ps.pps->col_idxX[x_in_ctbs]];
++}
++
++// Hand decode state from an aux bit thread's local context (src_lc) back to
++// the main thread's (dst_lc) at the end of a slice, as processing is going
++// to continue there. Does nothing if both are the same context.
++//
++// is_dep  Non-zero if a dependent slice segment may follow, in which case
++//         the prediction/cabac state is copied as well
++static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
++{
++    if (src_lc != dst_lc)
++    {
++        // Move the job
++        // We will still have an active job if the final line terminates early
++        // Dest should always be null by now
++        av_assert1(dst_lc->jb0 == NULL);
++        dst_lc->jb0 = src_lc->jb0;
++        src_lc->jb0 = NULL;
++
++        // Always need to store where we are in the bitstream
++        // (Cabac init request will be built at start of next slice)
++        dst_lc->ts = src_lc->ts;
++        dst_lc->gb = src_lc->gb;
++
++        if (!is_dep)
++            return;
++
++        // Need to store context if we might have a dependent seg
++        dst_lc->qPy_pred = src_lc->qPy_pred;
++        memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
++        memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
++        memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
++    }
++}
++
++// Block on lc's input semaphore.
++// Returns lc->bt_terminate as read after waking, i.e. non-zero if the thread
++// has been asked to terminate.
++static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
++{
++    int terminate_req;
++
++    rpi_sem_wait(&lc->bt_sem_in);
++    terminate_req = lc->bt_terminate;
++    return terminate_req;
++}
++
++// Do one WPP line
++// Will not work correctly over horizontal tile boundaries - vertical should be OK
++//
++// Decodes the line (WPP) or tile described by lc's bt_* fields in chunks of
++// partial_size CTBs. Between chunks it waits on lc->bt_sem_in (posted by the
++// thread doing the previous line) and posts lc->bt_psem_out (the thread
++// doing the next line) so lines stay staggered.
++//
++// is_first  Non-zero if this is the first line of the set (called from
++//           rpi_decode_entry) - no waits on bt_sem_in are done while decoding
++//
++// Returns 0 on success, a decode error (after which signalling still
++// completes so other threads keep running) or AVERROR_EXIT if termination
++// was requested while waiting.
++static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
++{
++ const int is_tile = lc->bt_is_tile;
++ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
++ const unsigned int line = lc->bt_line_no;
++ const unsigned int line_inc = lc->bt_line_inc;
++ const int is_last = (line >= lc->bt_last_line);
++
++ // ts address one past the end of this line / tile
++ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
++ // ts address at which this thread's next line / tile starts
++ // (INT_MAX if there is no such line within the entry points)
++ const unsigned int ts_next =
++ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
++ INT_MAX :
++ is_tile ?
++ s->ps.pps->tile_pos_ts[tile_id + line_inc] :
++ lc->ts + lc->bt_line_width * line_inc;
++ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
++ const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
++ unsigned int ts_prev;
++ int loop_n = 0;
++ int err = 0;
++
++ av_assert1(line <= s->sh.num_entry_point_offsets);
++
++#if TRACE_WPP
++ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
++ lc->lc_n, is_tile ? "Tile" : "WPP", tile_id,
++ line, lc->bt_last_line, s->sh.num_entry_point_offsets,
++ lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
++#endif
++ // Lines other than 0 start at an entry point so the bit & cabac readers
++ // must be pointed at that section of the slice data
++ if (line != 0)
++ {
++ const uint8_t * const data = s->data + s->sh.offset[line - 1];
++ const unsigned int len = s->sh.size[line - 1];
++ if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
++ return err;
++
++ ff_init_cabac_decoder(&lc->cc, data, len);
++ }
++
++ // We should never be processing a dependent slice here so reset is good
++ // ?? These probably shouldn't be needed (as they should be set by later
++ // logic) but do seem to be required
++ lc->qp_y = s->sh.slice_qp;
++
++ do
++ {
++ // Signal the next line for the chunk before last; the remaining posts
++ // are made after the loop
++ if (!is_last && loop_n > 1) {
++#if TRACE_WPP
++ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ }
++ // The wait for loop_n == 0 has been done in bit_thread
++ if (!is_first && loop_n != 0)
++ {
++#if TRACE_WPP
++ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
++#endif
++ if (wait_bt_sem_in(lc) != 0)
++ return AVERROR_EXIT;
++ }
++
++#if TRACE_WPP
++ {
++ int n;
++ sem_getvalue(&lc->bt_sem_in, &n);
++ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
++ }
++#endif
++
++ ts_prev = lc->ts;
++
++ // If we have had an error - do no further decode but do continue
++ // moving signals around so the other threads continue to operate
++ // correctly (or at least as correctly as they can with this line missing)
++ //
++ // Errors in WPP/Tile are less fatal than normal as we have a good idea
++ // of how to restart on the next line so there is no need to give up totally
++ if (err != 0)
++ {
++ lc->unit_done = 0;
++ lc->ts += partial_size;
++ }
++ else
++ {
++ worker_pass0_ready(s, lc);
++
++ // Error if the decode fails, or if (not on the last line and before
++ // end of line) the chunk came up short or the slice data ended
++ if ((err = fill_job(s, lc, partial_size)) < 0 ||
++ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
++ {
++ if (err == 0) {
++ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
++ err = AVERROR_INVALIDDATA;
++ }
++ worker_free(s, lc);
++ lc->ts = ts_prev + partial_size; // Pretend we did all that
++ lc->unit_done = 0;
++ }
++ else if (is_tile)
++ {
++ // Tiles submit per chunk; WPP submits once after the loop
++ worker_submit_job(s, lc);
++ }
++ }
++
++ ++loop_n;
++ } while (lc->ts < ts_eol && !lc->unit_done);
++
++ // If we are on the last line & we didn't get a whole line we must wait for
++ // and sink the sem_posts from the line above / tile to the left.
++ while ((ts_prev += partial_size) < ts_eol)
++ {
++#if TRACE_WPP
++ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
++#endif
++ if (wait_bt_sem_in(lc) != 0)
++ return AVERROR_EXIT;
++ }
++
++ lc->bt_line_no += line_inc;
++
++ if (!is_tile && err == 0)
++ worker_submit_job(s, lc);
++
++ if (!is_last) {
++ lc->ts = ts_next;
++
++ // Make the posts not made inside the loop: one always, and a second if
++ // the loop ran more than once
++#if TRACE_WPP
++ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ if (loop_n > 1) {
++#if TRACE_WPP
++ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ }
++ }
++ else
++ {
++ // Last line: hand the decode state back to the main thread's context
++ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT
++#if MVF_STASH_WIDTH > 64
++ // Horrid calculations to work out what we want but luckily this should almost never execute
++ // **** Move to movlc
++ if (!s->is_irap)
++ {
++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
++ if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf
++ {
++ // Copy the stash column just left of the current position to lc[0]
++ const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
++ unsigned int i;
++ const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
++ HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
++
++ for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
++ {
++ *d_mvf = *s_mvf;
++ d_mvf += MVF_STASH_WIDTH_PU;
++ s_mvf += MVF_STASH_WIDTH_PU;
++ }
++
++ }
++ }
++#endif
++ // When all done poke the thread 0 sem_in one final time
++#if TRACE_WPP
++ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
++#endif
++ sem_post(&s->HEVClcList[0]->bt_sem_in);
++ }
++
++#if TRACE_WPP
++ printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
++#endif
++ return err;
++}
++
++// Prime the local contexts for WPP decode of the current slice.
++// Context i is given line i, starting i line-widths after the slice start,
++// and will advance RPI_BIT_THREADS lines at a time. Only as many contexts as
++// there are lines (entry points + 1) are set up, up to RPI_BIT_THREADS.
++static void wpp_setup_lcs(HEVCRpiContext * const s)
++{
++    const unsigned int slice_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const unsigned int line_width = line_ts_width(s, slice_ts);
++    unsigned int line_ts = slice_ts;
++    int i = 0;
++
++    while (i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS)
++    {
++        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++
++        lc->bt_is_tile = 0;
++        lc->ts = line_ts;
++        lc->bt_line_no = i;
++        lc->bt_line_inc = RPI_BIT_THREADS;
++        lc->bt_line_width = line_width;
++        lc->bt_last_line = s->sh.num_entry_point_offsets;
++
++        line_ts += line_width;
++        ++i;
++    }
++}
++
++
++// Prime the local contexts for decoding one row of tiles of the current
++// slice (only a single tile row can be processed at once).
++//
++// slice_row  Row of tiles counted from the row the slice starts in. Row 0
++//            begins at the slice's own tile column, later rows at column 0.
++//
++// "Lines" here are tiles numbered from the slice's first tile. Up to
++// RPI_BIT_THREADS tiles are run in parallel; each context advances by that
++// number of tiles per step.
++static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
++{
++    const HEVCRpiPPS * const pps = s->ps.pps;
++    const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const unsigned int tile0 = pps->tile_id[ts0];
++    const unsigned int col0 = tile0 % pps->num_tile_columns;
++    unsigned int col = 0;
++    unsigned int line;
++    unsigned int last_line;
++    unsigned int par;
++    unsigned int i;
++
++    if (slice_row == 0)
++        col = col0;
++
++    line = slice_row * pps->num_tile_columns - col0 + col;
++    // Last tile of this row, limited by the number of entry points
++    last_line = FFMIN(
++        line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
++    // Number of tiles run in parallel
++    par =
++        FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
++#if TRACE_WPP
++    printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
++           pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
++#endif
++    for (i = 0; i != par; ++i)
++    {
++        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++        const unsigned int this_line = line + i;
++
++        lc->ts = pps->tile_pos_ts[tile0 + this_line];
++        lc->bt_is_tile = 1;
++        lc->bt_line_no = this_line;
++        lc->bt_last_line = last_line;
++        lc->bt_line_inc = par;
++        lc->bt_line_width = line_ts_width(s, lc->ts);
++    }
++}
++
++
++// Thread function for an extra bit thread; v is the thread's local context.
++// Each time its input semaphore is posted it runs one line / tile. The loop
++// ends when termination has been requested. A decode failure is only logged
++// unless termination was requested, in which case the loop ends.
++// Always returns NULL.
++static void * bit_thread(void * v)
++{
++    HEVCRpiLocalContext * const lc = v;
++    HEVCRpiContext *const s = lc->context;
++
++    for (;;)
++    {
++        int err;
++
++        if (wait_bt_sem_in(lc) != 0)
++            break;
++
++        err = rpi_run_one_line(s, lc, 0);  // Never first tile/wpp
++        if (err >= 0)
++            continue;
++
++        if (lc->bt_terminate) {
++            av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
++            break;
++        }
++        av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
++    }
++
++    return NULL;
++}
++
++// Create the extra bit threads if they are not already running.
++//
++// Allocates (if needed) and initialises local contexts 1..RPI_BIT_THREADS-1,
++// links every context's output semaphore to the next context's input
++// semaphore in a ring, and then starts one thread per extra context.
++//
++// Returns 0 on success (or if already started), -1 if a context cannot be
++// allocated or a thread cannot be created. If thread creation fails part way
++// the threads already started are terminated and joined, so no thread is
++// left running while s->bt_started is still 0.
++static int bit_threads_start(HEVCRpiContext * const s)
++{
++    if (s->bt_started)
++        return 0;
++
++    for (int i = 1; i < RPI_BIT_THREADS; ++i)
++    {
++        // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS-1]
++        if (s->HEVClcList[i] == NULL) {
++            if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
++                return -1;
++        }
++
++        bt_lc_init(s, s->HEVClcList[i], i);
++        job_lc_init(s->HEVClcList[i]);
++    }
++
++    // Link the sems in a circle
++    for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
++        s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
++    s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
++
++    // Init all lc before starting any threads
++    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++    {
++        // pthread_create returns 0 on success and a positive error number on
++        // failure - it never returns a negative value - so test against 0
++        if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) != 0)
++        {
++            // Stop the threads we have already started (in reverse order)
++            while (i-- > 0)
++            {
++                HEVCRpiLocalContext * const lc = s->HEVClcList[i + 1];
++
++                lc->bt_terminate = 1;
++                sem_post(&lc->bt_sem_in);
++                pthread_join(s->bit_threads[i], NULL);
++            }
++            return -1;
++        }
++    }
++
++    s->bt_started = 1;
++    return 0;
++}
++
++// Stop the extra bit threads if they are running.
++// Each thread in turn is flagged to terminate, woken, and joined; its input
++// semaphore is then destroyed and its job state released. Processing stops
++// at the first missing local context. Always returns 0.
++static int bit_threads_kill(HEVCRpiContext * const s)
++{
++    int n;
++
++    if (!s->bt_started)
++        return 0;
++    s->bt_started = 0;
++
++    for (n = 0; n < RPI_EXTRA_BIT_THREADS; ++n)
++    {
++        // Thread n uses local context n + 1 (context 0 is the main thread's)
++        HEVCRpiLocalContext *const lc = s->HEVClcList[n + 1];
++
++        if (lc == NULL)
++            break;
++
++        // Ask it to stop, wake it, and wait for it to exit
++        lc->bt_terminate = 1;
++        sem_post(&lc->bt_sem_in);
++        pthread_join(s->bit_threads[n], NULL);
++
++        sem_destroy(&lc->bt_sem_in);
++        job_lc_kill(lc);
++    }
++
++    return 0;
++}
++#endif
++
++
++// If we are at EoT and the row is shorter than the number of jobs
++// we can Q we have to wait for it to finish otherwise we risk cache/QPU
++// disasters
++//
++// n  ts address of the CTB just completed (may be negative: never waits)
++//
++// Returns 1 if a wait is needed: the PPS has tile_wpp_inter_disable >= 2,
++// the slice is not an I slice, and CTB n is flagged end of tile but not end
++// of line. Returns 0 otherwise.
++static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n)
++{
++    if (s->ps.pps->tile_wpp_inter_disable < 2)
++        return 0;
++    if (s->sh.slice_type == HEVC_SLICE_I)
++        return 0;
++    if (n < 0)
++        return 0;
++
++    return (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT;
++}
++
++// Decode the slice data of the current slice segment.
++//
++// Depending on the slice header this runs in one of three modes:
++//  - offload_tiles: tiles are decoded in parallel by the bit threads, one
++//    row of tiles at a time
++//  - offload_wpp:   WPP lines are decoded in parallel by the bit threads
++//  - otherwise:     everything is decoded on the calling thread
++// (the first two only exist if RPI_EXTRA_BIT_THREADS > 0)
++//
++// avctxt          Codec context; priv_data is the HEVCRpiContext
++// isFilterThread  Unused
++//
++// Returns the ts address reached (>= 0) on success or a negative error code.
++// Errors reported by individual lines in the tile / WPP modes are not
++// propagated from here.
++static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++{
++    HEVCRpiContext * const s = avctxt->priv_data;
++    HEVCRpiLocalContext * const lc = s->HEVClc;
++    int err;
++
++    // Start of slice
++    if ((err = slice_start(s, lc)) != 0)
++        return err;
++
++#if RPI_EXTRA_BIT_THREADS > 0
++
++    if (s->sh.offload_tiles)
++    {
++        unsigned int slice_row = 0;
++
++#if TRACE_WPP
++        printf("%s: Do Tiles\n", __func__);
++#endif
++        // Generate & start extra bit threads if they aren't already running
++        // If this fails the extra local contexts may not exist so we must
++        // not go on to set them up. No job has been started yet so we can
++        // simply return.
++        if ((err = bit_threads_start(s)) != 0)
++            return err;
++
++        do
++        {
++            // Reset lc lines etc.
++            tile_one_row_setup_lcs(s, slice_row);
++
++#if TRACE_WPP
++            printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
++                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++            rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
++#if TRACE_WPP
++            printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
++                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++            while (lc->bt_line_no <= lc->bt_last_line) {
++                rpi_sem_wait(&lc->bt_sem_in);
++                rpi_run_one_line(s, lc, 0);
++            }
++#if TRACE_WPP
++            printf("%s: Done body\n", __func__);
++#endif
++
++            // Wait for everything else to finish
++            rpi_sem_wait(&lc->bt_sem_in);
++
++            ++slice_row;
++        } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
++
++
++#if TRACE_WPP
++        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++    else if (s->sh.offload_wpp)
++    {
++#if TRACE_WPP
++        printf("%s: Do WPP\n", __func__);
++#endif
++        // Generate & start extra bit threads if they aren't already running
++        // (see the tile case above for why failure must stop us here)
++        if ((err = bit_threads_start(s)) != 0)
++            return err;
++
++        // Reset lc lines etc.
++        wpp_setup_lcs(s);
++
++        rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
++#if TRACE_WPP
++        printf("%s: Done 1st\n", __func__);
++#endif
++
++        while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
++            rpi_sem_wait(&lc->bt_sem_in);
++            rpi_run_one_line(s, lc, 0);
++        }
++#if TRACE_WPP
++        printf("%s: Done body\n", __func__);
++#endif
++
++        // Wait for everything else to finish
++        rpi_sem_wait(&lc->bt_sem_in);
++
++#if TRACE_WPP
++        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++    else
++#endif
++    {
++#if TRACE_WPP
++        printf("%s: Single start: ts=%d\n", __func__, lc->ts);
++#endif
++        // Single bit thread
++        do {
++            // Make sure we have space to prepare the next job
++            worker_pass0_ready(s, lc);
++
++            // max_blocks = 0: fill until the job is ready or the slice ends
++            if ((err = fill_job(s, lc, 0)) < 0)
++                goto fail;
++
++            worker_submit_job(s, lc);
++
++            // lc->ts is the next CTB so ts - 1 is the one just completed
++            if (tile_needs_wait(s, lc->ts - 1))
++                worker_wait(s, lc);
++
++        } while (!lc->unit_done);
++
++#if TRACE_WPP
++        printf("%s: Single end: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++
++    // If we have reached the end of the frame then wait for the worker to
++    // finish all its jobs
++    if (lc->ts >= s->ps.sps->ctb_size)
++        worker_wait(s, lc);
++
++#if RPI_TSTATS
++    {
++        HEVCRpiStats *const ts = &s->tstats;
++
++        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
++               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
++               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
++               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
++               ts->y_pred2_hgt16, ts->y_pred2_hle16);
++        memset(ts, 0, sizeof(*ts));
++    }
++#endif
++
++    return lc->ts;
++
++fail:
++    // Cleanup
++    av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
++    // Free our job & wait for termination
++    worker_free(s, lc);
++    worker_wait(s, lc);
++    return err;
++}
++
++
++// Set s->no_backward_pred_flag for the current slice.
++// The flag is cleared first and stays 0 unless this is a B slice with
++// temporal MVP enabled; in that case it is set to 1 as soon as any entry in
++// either reference picture list has a value greater than the current POC.
++static void set_no_backward_pred(HEVCRpiContext * const s)
++{
++    const RefPicList *const refPicList = s->refPicList;
++    int list_idx;
++
++    s->no_backward_pred_flag = 0;
++
++    if (s->sh.slice_type == HEVC_SLICE_B && s->sh.slice_temporal_mvp_enabled_flag)
++    {
++        for (list_idx = 0; list_idx < 2; list_idx++)
++        {
++            const RefPicList *const rpl = refPicList + list_idx;
++            int ref_idx = 0;
++
++            while (ref_idx < rpl->nb_refs)
++            {
++                if (rpl->list[ref_idx] > s->poc) {
++                    s->no_backward_pred_flag = 1;
++                    return;
++                }
++                ref_idx++;
++            }
++        }
++    }
++}
++
++// Decode the slice data in nal: build the entry point tables, set the
++// backward prediction flag and run the decode.
++// Returns the result of rpi_decode_entry, or the (negative) error from
++// gen_entry_points if the entry point table is bad.
++static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++    const int err = gen_entry_points(s, nal);
++
++    if (err < 0)
++        return err;
++
++    set_no_backward_pred(s);
++    return rpi_decode_entry(s->avctx, NULL);
++}
++
++// Attach SEI-derived side data to the current output frame (s->ref->frame).
++//
++// Handles, in order: stereo 3D frame packing, display orientation, mastering
++// display metadata, content light level, A53 closed captions and the
++// alternative transfer characteristics override.
++//
++// Returns 0 on success or AVERROR(ENOMEM) if a side-data allocation fails.
++// (A failed A53 side-data allocation is not treated as an error - see below.)
++static int set_side_data(HEVCRpiContext *s)
++{
++ AVFrame *out = s->ref->frame;
++
++ // Only arrangement types 3..5 with content interpretation 1 or 2 are exported
++ if (s->sei.frame_packing.present &&
++ s->sei.frame_packing.arrangement_type >= 3 &&
++ s->sei.frame_packing.arrangement_type <= 5 &&
++ s->sei.frame_packing.content_interpretation_type > 0 &&
++ s->sei.frame_packing.content_interpretation_type < 3) {
++ AVStereo3D *stereo = av_stereo3d_create_side_data(out);
++ if (!stereo)
++ return AVERROR(ENOMEM);
++
++ switch (s->sei.frame_packing.arrangement_type) {
++ case 3:
++ if (s->sei.frame_packing.quincunx_subsampling)
++ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
++ else
++ stereo->type = AV_STEREO3D_SIDEBYSIDE;
++ break;
++ case 4:
++ stereo->type = AV_STEREO3D_TOPBOTTOM;
++ break;
++ case 5:
++ stereo->type = AV_STEREO3D_FRAMESEQUENCE;
++ break;
++ }
++
++ if (s->sei.frame_packing.content_interpretation_type == 2)
++ stereo->flags = AV_STEREO3D_FLAG_INVERT;
++
++ // For frame-sequential packing also record which view this frame carries
++ if (s->sei.frame_packing.arrangement_type == 5) {
++ if (s->sei.frame_packing.current_frame_is_frame0_flag)
++ stereo->view = AV_STEREO3D_VIEW_LEFT;
++ else
++ stereo->view = AV_STEREO3D_VIEW_RIGHT;
++ }
++ }
++
++ // Export a display matrix only if the SEI actually rotates or flips
++ if (s->sei.display_orientation.present &&
++ (s->sei.display_orientation.anticlockwise_rotation ||
++ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
++ // anticlockwise_rotation is scaled so that 1 << 16 corresponds to 360 degrees
++ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
++ AVFrameSideData *rotation = av_frame_new_side_data(out,
++ AV_FRAME_DATA_DISPLAYMATRIX,
++ sizeof(int32_t) * 9);
++ if (!rotation)
++ return AVERROR(ENOMEM);
++
++ av_display_rotation_set((int32_t *)rotation->data, angle);
++ av_display_matrix_flip((int32_t *)rotation->data,
++ s->sei.display_orientation.hflip,
++ s->sei.display_orientation.vflip);
++ }
++
++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
++ // so the side data persists for the entire coded video sequence.
++ if (s->sei.mastering_display.present > 0 &&
++ IS_IRAP(s) && s->no_rasl_output_flag) {
++ s->sei.mastering_display.present--;
++ }
++ if (s->sei.mastering_display.present) {
++ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
++ const int mapping[3] = {2, 0, 1};
++ // Denominators used to express the integer SEI values as rationals
++ const int chroma_den = 50000;
++ const int luma_den = 10000;
++ int i;
++ AVMasteringDisplayMetadata *metadata =
++ av_mastering_display_metadata_create_side_data(out);
++ if (!metadata)
++ return AVERROR(ENOMEM);
++
++ for (i = 0; i < 3; i++) {
++ const int j = mapping[i];
++ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
++ metadata->display_primaries[i][0].den = chroma_den;
++ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
++ metadata->display_primaries[i][1].den = chroma_den;
++ }
++ metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
++ metadata->white_point[0].den = chroma_den;
++ metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
++ metadata->white_point[1].den = chroma_den;
++
++ metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
++ metadata->max_luminance.den = luma_den;
++ metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
++ metadata->min_luminance.den = luma_den;
++ metadata->has_luminance = 1;
++ metadata->has_primaries = 1;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
++ av_q2d(metadata->display_primaries[0][0]),
++ av_q2d(metadata->display_primaries[0][1]),
++ av_q2d(metadata->display_primaries[1][0]),
++ av_q2d(metadata->display_primaries[1][1]),
++ av_q2d(metadata->display_primaries[2][0]),
++ av_q2d(metadata->display_primaries[2][1]),
++ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "min_luminance=%f, max_luminance=%f\n",
++ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
++ }
++ // Decrement the content light flag when IRAP frame has no_rasl_output_flag=1
++ // so the side data persists for the entire coded video sequence.
++ if (s->sei.content_light.present > 0 &&
++ IS_IRAP(s) && s->no_rasl_output_flag) {
++ s->sei.content_light.present--;
++ }
++ if (s->sei.content_light.present) {
++ AVContentLightMetadata *metadata =
++ av_content_light_metadata_create_side_data(out);
++ if (!metadata)
++ return AVERROR(ENOMEM);
++ metadata->MaxCLL = s->sei.content_light.max_content_light_level;
++ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
++ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
++ metadata->MaxCLL, metadata->MaxFALL);
++ }
++
++ // Closed captions: the pending caption buffer is consumed (freed and its
++ // size zeroed) whether or not the side-data allocation succeeded, and the
++ // codec property flag is set in both cases.
++ if (s->sei.a53_caption.a53_caption) {
++ AVFrameSideData* sd = av_frame_new_side_data(out,
++ AV_FRAME_DATA_A53_CC,
++ s->sei.a53_caption.a53_caption_size);
++ if (sd)
++ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
++ av_freep(&s->sei.a53_caption.a53_caption);
++ s->sei.a53_caption.a53_caption_size = 0;
++ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
++ }
++
++ // Override the transfer characteristic on both the codec context and the
++ // frame, but only with a value that has a known name and is not UNSPECIFIED
++ if (s->sei.alternative_transfer.present &&
++ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
++ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
++ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
++ }
++
++ return 0;
++}
++
++// Set up decoder state for the first slice of a new picture.
++//
++// Clears the per-picture tables (boundary strengths, PCM map, slice address
++// table, optional intra map), makes sure rpl_tab has room for one entry per
++// NAL in the packet, allocates the new reference frame, builds the frame RPS,
++// attaches SEI side data and fetches any frame that is ready for output into
++// s->output_frame.
++//
++// Returns 0 on success or a negative AVERROR code. On failure the current
++// reference (if any) is unreferenced and s->ref is reset to NULL.
++static int hevc_frame_start(HEVCRpiContext * const s)
++{
++    int ret;
++
++    memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too
++    memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
++    memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address));
++
++    // Only need to remember intra for CIP
++    if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap)
++        s->is_intra = NULL;
++    else
++    {
++        s->is_intra = s->is_intra_store;
++        memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
++    }
++
++    s->is_decoded = 0;
++    s->first_nal_type = s->nal_unit_type;
++
++    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
++
++    // Grow rpl_tab so that it can hold one entry per NAL in this packet
++    if (s->pkt.nb_nals > s->rpl_tab_size)
++    {
++        // In most cases it will be faster to free & realloc as that doesn't
++        // require (an unwanted) copy
++        av_freep(&s->rpl_tab);
++        s->rpl_tab_size = 0;
++        if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) {
++            // ret must be set here: the fail path returns it and nothing
++            // has assigned it yet
++            ret = AVERROR(ENOMEM);
++            goto fail;
++        }
++        s->rpl_tab_size = s->pkt.nb_nals;
++    }
++    memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab));
++
++    ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
++    if (ret < 0)
++        goto fail;
++
++    ret = ff_hevc_rpi_frame_rps(s);
++    if (ret < 0) {
++        av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
++        goto fail;
++    }
++
++    s->ref->frame->key_frame = IS_IRAP(s);
++
++    ret = set_side_data(s);
++    if (ret < 0)
++        goto fail;
++
++    // Derives the picture type directly from the slice type value
++    s->frame->pict_type = 3 - s->sh.slice_type;
++
++    if (!IS_IRAP(s))
++        ff_hevc_rpi_bump_frame(s);
++
++    av_frame_unref(s->output_frame);
++    ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
++    if (ret < 0)
++        goto fail;
++
++    ff_thread_finish_setup(s->avctx);
++
++    return 0;
++
++fail:
++    if (s->ref)
++        ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++    s->ref = NULL;
++    return ret;
++}
++
++// Returns 1 for the even NAL unit types below 16 (0, 2, ..., 14) and 0 for
++// every other value. Per the original comment these are taken from Table 7-1.
++static inline int is_non_ref_unit_type(const unsigned int nal_unit_type)
++{
++    const int below_16 = nal_unit_type < 16;
++    const int is_even = (nal_unit_type & 1) == 0;
++
++    return below_16 && is_even;
++}
++
++// Decode a single NAL unit.
++//
++// Parameter sets (VPS/SPS/PPS) and SEI messages are parsed into s->ps and
++// s->sei. Slice NALs have their header parsed, are checked against the
++// skip_frame setting and the RASL skipping rules, start a new frame when they
++// are the first slice of a picture, and then have their slice data decoded.
++// EOS/EOB NALs bump the sequence counter and reset max_ra; AUD and filler
++// data are ignored; anything else is logged and skipped.
++//
++// Returns 0 on success or when an error is being tolerated, and a negative
++// AVERROR code otherwise. Errors routed through the fail label are reported
++// to the caller only when AV_EF_EXPLODE is set; slice header errors,
++// hevc_frame_start() errors and mismatching VCL NAL types are always
++// returned.
++static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
++{
++    GetBitContext * const gb = &s->HEVClc->gb;
++    int ctb_addr_ts, ret;
++
++    *gb = nal->gb;
++    s->nal_unit_type = nal->type;
++    s->temporal_id = nal->temporal_id;
++
++    switch (s->nal_unit_type) {
++    case HEVC_NAL_VPS:
++        ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_SPS:
++        ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
++                                         s->apply_defdispwin);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_PPS:
++        ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_SEI_PREFIX:
++    case HEVC_NAL_SEI_SUFFIX:
++        ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_TRAIL_R:
++    case HEVC_NAL_TRAIL_N:
++    case HEVC_NAL_TSA_N:
++    case HEVC_NAL_TSA_R:
++    case HEVC_NAL_STSA_N:
++    case HEVC_NAL_STSA_R:
++    case HEVC_NAL_BLA_W_LP:
++    case HEVC_NAL_BLA_W_RADL:
++    case HEVC_NAL_BLA_N_LP:
++    case HEVC_NAL_IDR_W_RADL:
++    case HEVC_NAL_IDR_N_LP:
++    case HEVC_NAL_CRA_NUT:
++    case HEVC_NAL_RADL_N:
++    case HEVC_NAL_RADL_R:
++    case HEVC_NAL_RASL_N:
++    case HEVC_NAL_RASL_R:
++        ret = hls_slice_header(s);
++        if (ret < 0)
++            return ret;
++
++        // The definition of _N unit types is "non-reference for other frames
++        // with the same temporal_id" so they may/will be ref frames for pics
++        // with a higher temporal_id.
++        s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
++            !is_non_ref_unit_type(s->nal_unit_type);
++        s->offload_recon = s->threads_type != 0 && s->used_for_ref;
++        s->is_irap = IS_IRAP(s);
++
++#if DEBUG_DECODE_N
++        {
++            // Debug aid: stop decoding once DEBUG_DECODE_N slice NALs have
++            // been seen after the first IDR
++            static int z = 0;
++            if (IS_IDR(s)) {
++                z = 1;
++            }
++            if (z != 0 && z++ > DEBUG_DECODE_N) {
++                s->is_decoded = 0;
++                break;
++            }
++        }
++#endif
++        // Honour the user's skip_frame setting
++        if (
++            (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) ||
++            (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
++            (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
++            (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s)))
++        {
++            s->is_decoded = 0;
++            break;
++        }
++
++        if (s->sh.first_slice_in_pic_flag) {
++            // max_ra == INT_MAX means that no random access point has been
++            // seen yet since init / flush / end of sequence
++            if (s->max_ra == INT_MAX) {
++                if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
++                    s->max_ra = s->poc;
++                } else {
++                    if (IS_IDR(s))
++                        s->max_ra = INT_MIN;
++                }
++            }
++
++            // Skip RASL pictures whose POC is not beyond max_ra
++            if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
++                s->poc <= s->max_ra) {
++                s->is_decoded = 0;
++                break;
++            } else {
++                if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
++                    s->max_ra = INT_MIN;
++            }
++
++            ret = hevc_frame_start(s);
++            if (ret < 0)
++                return ret;
++        } else if (!s->ref) {
++            av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
++            // ret still holds the (non-negative) slice header result here,
++            // so set a real error code before taking the fail path
++            ret = AVERROR_INVALIDDATA;
++            goto fail;
++        }
++
++        if (s->nal_unit_type != s->first_nal_type) {
++            av_log(s->avctx, AV_LOG_ERROR,
++                   "Non-matching NAL types of the VCL NALUs: %d %d\n",
++                   s->first_nal_type, s->nal_unit_type);
++            return AVERROR_INVALIDDATA;
++        }
++
++        if (!s->sh.dependent_slice_segment_flag &&
++            s->sh.slice_type != HEVC_SLICE_I) {
++            ret = ff_hevc_rpi_slice_rpl(s);
++            if (ret < 0) {
++                av_log(s->avctx, AV_LOG_WARNING,
++                       "Error constructing the reference lists for the current slice.\n");
++                goto fail;
++            }
++        }
++
++        // The picture counts as decoded once the returned CTB address
++        // reaches sps->ctb_size
++        ctb_addr_ts = hls_slice_data(s, nal);
++        if (ctb_addr_ts >= s->ps.sps->ctb_size) {
++            s->is_decoded = 1;
++        }
++
++        if (ctb_addr_ts < 0) {
++            ret = ctb_addr_ts;
++            goto fail;
++        }
++        break;
++    case HEVC_NAL_EOS_NUT:
++    case HEVC_NAL_EOB_NUT:
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra = INT_MAX;
++        break;
++    case HEVC_NAL_AUD:
++    case HEVC_NAL_FD_NUT:
++        break;
++    default:
++        av_log(s->avctx, AV_LOG_INFO,
++               "Skipping NAL unit %d\n", s->nal_unit_type);
++    }
++
++    return 0;
++fail:
++    // Errors are fatal only if the user asked for AV_EF_EXPLODE
++    if (s->avctx->err_recognition & AV_EF_EXPLODE)
++        return ret;
++    return 0;
++}
++
++// Split one input packet into NAL units and decode them in order.
++//
++// buf/length describe the packet payload. Returns the (negative) error from
++// the packet split or from the first NAL that fails, otherwise the result of
++// the last decode_nal_unit() call (0 if the packet held no NALs).
++static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length)
++{
++ int i, ret = 0;
++ int eos_at_start = 1;
++
++ s->ref = NULL;
++ s->last_eos = s->eos;
++ s->eos = 0;
++
++ /* split the input packet into NAL units, so we know the upper bound on the
++ * number of slices in the frame */
++ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
++ s->nal_length_size, s->avctx->codec_id, 0, 0);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Error splitting the input into NAL units.\n");
++ return ret;
++ }
++
++ // Pre-scan for EOS/EOB NALs: those seen before any other NAL type set
++ // last_eos (end of the previous sequence), later ones set eos for this packet
++ for (i = 0; i < s->pkt.nb_nals; i++) {
++ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT ||
++ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) {
++ if (eos_at_start) {
++ s->last_eos = 1;
++ } else {
++ s->eos = 1;
++ }
++ } else {
++ eos_at_start = 0;
++ }
++ }
++
++ /* decode the NAL units */
++ for (i = 0; i < s->pkt.nb_nals; i++) {
++ ret = decode_nal_unit(s, &s->pkt.nals[i]);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_WARNING,
++ "Error parsing NAL unit #%d.\n", i);
++ goto fail;
++ }
++ }
++
++fail: // Also success path
++ // If a frame was started, either signal completion to waiting frame threads
++ // (reference frame with threading active) or flush the frame to memory
++ if (s->ref != NULL) {
++ if (s->used_for_ref && s->threads_type != 0) {
++ ff_hevc_rpi_progress_signal_all_done(s);
++ }
++ else {
++ // Flush frame to real memory as we expect to be able to pass
++ // it straight on to mmal
++ flush_frame(s, s->frame);
++ }
++ }
++ return ret;
++}
++
++// Log the 16 bytes of an MD5 digest as 32 hex digits at the given log level.
++// No separator or trailing newline is emitted.
++static void print_md5(void *log_ctx, int level, uint8_t md5[16])
++{
++    const uint8_t *byte = md5;
++    const uint8_t *const end = md5 + 16;
++
++    while (byte != end) {
++        av_log(log_ctx, level, "%02"PRIx8, *byte);
++        ++byte;
++    }
++}
++
++// Compare the MD5 of each plane of frame with the digest from the picture
++// hash SEI (s->sei.picture_hash.md5[]).
++//
++// Returns 0 if every plane matches, AVERROR(EINVAL) if the pixel format has
++// no descriptor, AVERROR(ENOMEM) if the byte-swap buffer cannot be allocated
++// (big-endian builds only) and AVERROR_INVALIDDATA on the first mismatch.
++static int verify_md5(HEVCRpiContext *s, AVFrame *frame)
++{
++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
++ int pixel_shift;
++ int i, j;
++
++ if (!desc)
++ return AVERROR(EINVAL);
++
++ // 1 if samples are wider than 8 bits (i.e. stored in 2 bytes), else 0
++ pixel_shift = desc->comp[0].depth > 8;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
++ s->poc);
++
++ /* the checksums are LE, so we have to byteswap for >8bpp formats
++ * on BE arches */
++#if HAVE_BIGENDIAN
++ if (pixel_shift && !s->checksum_buf) {
++ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
++ FFMAX3(frame->linesize[0], frame->linesize[1],
++ frame->linesize[2]));
++ if (!s->checksum_buf)
++ return AVERROR(ENOMEM);
++ }
++#endif
++
++ // One digest per plane; the loop stops at the first NULL data pointer
++ for (i = 0; frame->data[i]; i++) {
++ int width = s->avctx->coded_width;
++ int height = s->avctx->coded_height;
++ // Planes 1 and 2 are chroma: scale by the format's chroma subsampling
++ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width;
++ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
++ uint8_t md5[16];
++
++ av_md5_init(s->md5_ctx);
++ for (j = 0; j < h; j++) {
++ // NOTE(review): frame_stride1(frame, 1) is used as the row stride for
++ // every plane, not just plane 1 - confirm this is intended
++ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
++#if HAVE_BIGENDIAN
++ if (pixel_shift) {
++ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf,
++ (const uint16_t *) src, w);
++ src = s->checksum_buf;
++ }
++#endif
++ av_md5_update(s->md5_ctx, src, w << pixel_shift);
++ }
++ av_md5_final(s->md5_ctx, md5);
++
++ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
++ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
++ print_md5(s->avctx, AV_LOG_DEBUG, md5);
++ av_log (s->avctx, AV_LOG_DEBUG, "; ");
++ } else {
++ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
++ print_md5(s->avctx, AV_LOG_ERROR, md5);
++ av_log (s->avctx, AV_LOG_ERROR, " != ");
++ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]);
++ av_log (s->avctx, AV_LOG_ERROR, "\n");
++ return AVERROR_INVALIDDATA;
++ }
++ }
++
++ av_log(s->avctx, AV_LOG_DEBUG, "\n");
++
++ return 0;
++}
++
++// Returns 1 if every SPS currently stored in s->ps.sps_list passes
++// is_sps_supported() (empty slots are ignored), 0 as soon as one does not.
++static int all_sps_supported(const HEVCRpiContext * const s)
++{
++    unsigned int slot;
++
++    for (slot = 0; slot != FF_ARRAY_ELEMS(s->ps.sps_list); ++slot) {
++        if (s->ps.sps_list[slot] == NULL)
++            continue;
++
++        if (!is_sps_supported((const HEVCRpiSPS*)s->ps.sps_list[slot]->data))
++            return 0;
++    }
++
++    return 1;
++}
++
++// Parse codec extradata into the parameter set / SEI state.
++//
++// When first is non-zero the stream parameters of the first SPS present in
++// s->ps.sps_list are additionally exported to the codec context.
++// Returns 0 on success or the negative error from the extradata parser.
++static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first)
++{
++    const int rv = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
++                                                &s->nal_length_size, s->avctx->err_recognition,
++                                                s->apply_defdispwin, s->avctx);
++
++    if (rv < 0)
++        return rv;
++
++    if (first) {
++        int slot = 0;
++
++        /* export stream parameters from the first SPS */
++        while (slot < FF_ARRAY_ELEMS(s->ps.sps_list) && !s->ps.sps_list[slot])
++            ++slot;
++
++        if (slot < FF_ARRAY_ELEMS(s->ps.sps_list))
++            export_stream_params(s->avctx, &s->ps,
++                                 (const HEVCRpiSPS*)s->ps.sps_list[slot]->data);
++    }
++
++    return 0;
++}
++
++// Decoder entry point (the .decode callback of ff_hevc_rpi_decoder).
++//
++// An empty packet drains one buffered frame. Otherwise any new extradata
++// attached to the packet is applied, the packet's NAL units are decoded, the
++// picture hash SEI is optionally verified, and a frame that became ready for
++// output is moved into data.
++//
++// *got_output is set when a frame was written to data. Returns the packet
++// size on success (0 for the drain case) or a negative AVERROR code.
++static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
++ AVPacket *avpkt)
++{
++ int ret;
++ int new_extradata_size;
++ uint8_t *new_extradata;
++ HEVCRpiContext *s = avctx->priv_data;
++
++ // Empty packet: drain mode - try to output one more buffered frame
++ if (!avpkt->size) {
++ ret = ff_hevc_rpi_output_frame(s, data, 1);
++ if (ret < 0)
++ return ret;
++
++ *got_output = ret;
++ return 0;
++ }
++
++ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
++ &new_extradata_size);
++ if (new_extradata && new_extradata_size > 0) {
++ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0);
++ if (ret < 0)
++ return ret;
++ }
++
++ s->ref = NULL;
++ ret = decode_nal_units(s, avpkt->data, avpkt->size);
++ if (ret < 0)
++ return ret;
++
++ /* verify the SEI checksum */
++ // NOTE(review): relies on s->ref being non-NULL whenever is_decoded is set
++ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
++ s->sei.picture_hash.is_md5) {
++ ret = verify_md5(s, s->ref->frame);
++ // A mismatch is fatal only with AV_EF_EXPLODE; otherwise it is just logged
++ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
++ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++ return ret;
++ }
++ }
++ // The hash applies to one picture only - clear it for the next packet
++ s->sei.picture_hash.is_md5 = 0;
++
++ if (s->is_decoded) {
++ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
++ s->is_decoded = 0;
++ }
++
++ // output_frame is filled in hevc_frame_start() when a frame is ready
++ if (s->output_frame->buf[0]) {
++ av_frame_move_ref(data, s->output_frame);
++ *got_output = 1;
++ }
++
++ return avpkt->size;
++}
++
++// Make dst a new reference to the frame held by src.
++//
++// References the thread frame and, if present, the collocated MV buffer,
++// then copies col_mvf, poc, flags and sequence. Returns 0 on success, the
++// error from ff_thread_ref_frame(), or AVERROR(ENOMEM) if the MV buffer
++// cannot be referenced (in which case dst is unreferenced again).
++static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src)
++{
++    const int rv = ff_thread_ref_frame(&dst->tf, &src->tf);
++
++    if (rv < 0)
++        return rv;
++
++    if (src->col_mvf_buf != NULL &&
++        (dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf)) == NULL)
++    {
++        // Undo the frame reference taken above
++        ff_hevc_rpi_unref_frame(s, dst, ~0);
++        return AVERROR(ENOMEM);
++    }
++
++    dst->col_mvf  = src->col_mvf;
++    dst->poc      = src->poc;
++    dst->flags    = src->flags;
++    dst->sequence = src->sequence;
++
++    return 0;
++}
++
++
++// Release everything owned by the decoder context (the .close callback).
++//
++// Also used as the cleanup path of hevc_init_context() and
++// hevc_decode_init(), so it can run on a partially initialised context.
++// NOTE(review): in that case s->HEVClc may still be NULL when
++// hevc_exit_worker() / job_lc_kill() are called - confirm they tolerate it.
++// Always returns 0.
++static av_cold int hevc_decode_free(AVCodecContext *avctx)
++{
++ HEVCRpiContext * const s = avctx->priv_data;
++ int i;
++
++ pic_arrays_free(s);
++
++ av_freep(&s->md5_ctx);
++
++ av_freep(&s->cabac_save);
++
++#if RPI_EXTRA_BIT_THREADS
++ bit_threads_kill(s);
++#endif
++
++ // Stop the worker and tear down the per-context synchronisation state
++ hevc_exit_worker(s);
++ for (i = 0; i != 2; ++i) {
++ ff_hevc_rpi_progress_kill_state(s->progress_states + i);
++ }
++ job_lc_kill(s->HEVClc);
++
++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0]
++ av_freep(&s->sao_pixel_buffer_v[0]);
++ av_frame_free(&s->output_frame);
++
++ // Drop every DPB reference before freeing the frame structures themselves
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++ av_frame_free(&s->DPB[i].frame);
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
++ av_buffer_unref(&s->ps.vps_list[i]);
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
++ av_buffer_unref(&s->ps.sps_list[i]);
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
++ av_buffer_unref(&s->ps.pps_list[i]);
++ // The active-set pointers are cleared once the lists have been released
++ s->ps.sps = NULL;
++ s->ps.pps = NULL;
++ s->ps.vps = NULL;
++
++ // Free separately from sLists as used that way by RPI WPP
++ // (the list is treated as NULL-terminated unless all MAX_NB_THREADS slots are used)
++ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
++ av_freep(s->HEVClcList + i);
++ }
++ s->HEVClc = NULL; // Allocated as part of HEVClcList
++
++ ff_h2645_packet_uninit(&s->pkt);
++
++ // Only undo vpu_qpu_init() if it actually succeeded
++ if (s->qpu_init_ok)
++ vpu_qpu_term();
++ s->qpu_init_ok = 0;
++
++ return 0;
++}
++
++
++// Allocate and initialise the per-context state common to the main decoder
++// context and its frame-thread copies.
++//
++// Returns 0 on success. On any failure everything is released through
++// hevc_decode_free() and AVERROR(ENOMEM) is returned - note that this is
++// also the code reported when vpu_qpu_init() fails.
++static av_cold int hevc_init_context(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int i;
++
++ s->avctx = avctx;
++
++ // The first local context doubles as entry 0 of HEVClcList
++ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
++ if (!s->HEVClc)
++ goto fail;
++ s->HEVClcList[0] = s->HEVClc;
++
++ if (vpu_qpu_init() != 0)
++ goto fail;
++ // Remembered so that hevc_decode_free() knows to call vpu_qpu_term()
++ s->qpu_init_ok = 1;
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ {
++ static const uint32_t dframe[1] = {0x80808080};
++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
++ }
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++ s->qpu_dummy_frame_qpu = qpu_dummy();
++#endif
++
++ bt_lc_init(s, s->HEVClc, 0);
++ job_lc_init(s->HEVClc);
++
++ for (i = 0; i != 2; ++i) {
++ ff_hevc_rpi_progress_init_state(s->progress_states + i);
++ }
++
++ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL)
++ goto fail;
++
++ if ((s->output_frame = av_frame_alloc()) == NULL)
++ goto fail;
++
++ // Pre-allocate the AVFrame shell of every DPB slot
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ s->DPB[i].frame = av_frame_alloc();
++ if (!s->DPB[i].frame)
++ goto fail;
++ s->DPB[i].tf.f = s->DPB[i].frame;
++ s->DPB[i].dpb_no = i;
++ }
++
++ // INT_MAX is the "no random access point seen yet" value tested in decode_nal_unit()
++ s->max_ra = INT_MAX;
++
++ if ((s->md5_ctx = av_md5_alloc()) == NULL)
++ goto fail;
++
++ s->context_initialized = 1;
++ s->eos = 0;
++
++ ff_hevc_rpi_reset_sei(&s->sei);
++
++ return 0;
++
++fail:
++ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__);
++ hevc_decode_free(avctx);
++ return AVERROR(ENOMEM);
++}
++
++#if HAVE_THREADS
++// Frame-threading callback: copy the decoder state that the next frame
++// thread needs from the src context into dst.
++//
++// Re-references the DPB frames and all parameter sets, re-applies the active
++// SPS if it differs, copies sequence / POC / NAL-format state and the
++// persistent SEI messages, and lazily creates dst's job control and worker
++// on first use. Returns 0 on success or a negative AVERROR code.
++static int hevc_update_thread_context(AVCodecContext *dst,
++ const AVCodecContext *src)
++{
++ HEVCRpiContext *s = dst->priv_data;
++ HEVCRpiContext *s0 = src->priv_data;
++ int i, ret;
++
++ av_assert0(s->context_initialized);
++
++ // dst == src can happen according to the comments and in that case
++ // there is nothing to do here
++ if (dst == src)
++ return 0;
++
++ // Replace every DPB slot with a reference to the source's frame (if it has one)
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++ if (s0->DPB[i].frame->buf[0]) {
++ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
++ if (ret < 0)
++ return ret;
++ }
++ }
++
++ // Forget a differing active SPS before the lists below are re-referenced;
++ // it is re-applied through set_sps() once they are up to date
++ if (s->ps.sps != s0->ps.sps)
++ s->ps.sps = NULL;
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
++ av_buffer_unref(&s->ps.vps_list[i]);
++ if (s0->ps.vps_list[i]) {
++ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
++ if (!s->ps.vps_list[i])
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++ av_buffer_unref(&s->ps.sps_list[i]);
++ if (s0->ps.sps_list[i]) {
++ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
++ if (!s->ps.sps_list[i])
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
++ av_buffer_unref(&s->ps.pps_list[i]);
++ if (s0->ps.pps_list[i]) {
++ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
++ if (!s->ps.pps_list[i])
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ if (s->ps.sps != s0->ps.sps)
++ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
++ return ret;
++
++ s->seq_decode = s0->seq_decode;
++ s->seq_output = s0->seq_output;
++ s->pocTid0 = s0->pocTid0;
++ s->max_ra = s0->max_ra;
++ s->eos = s0->eos;
++ s->no_rasl_output_flag = s0->no_rasl_output_flag;
++
++ s->is_nalff = s0->is_nalff;
++ s->nal_length_size = s0->nal_length_size;
++
++ s->threads_type = s0->threads_type;
++
++ // The source context ended a sequence: start a new one here, using the
++ // same updates as the EOS/EOB case in decode_nal_unit()
++ if (s0->eos) {
++ s->seq_decode = (s->seq_decode + 1) & 0xff;
++ s->max_ra = INT_MAX;
++ }
++
++ // SEI state that is carried over from one frame to the next
++ s->sei.frame_packing = s0->sei.frame_packing;
++ s->sei.display_orientation = s0->sei.display_orientation;
++ s->sei.mastering_display = s0->sei.mastering_display;
++ s->sei.content_light = s0->sei.content_light;
++ s->sei.alternative_transfer = s0->sei.alternative_transfer;
++
++ // * We do this here as it allows us to easily locate our parents
++ // global job pool, but there really should be a less nasty way
++ if (s->jbc == NULL)
++ {
++ // Allocation failure aborts via the assert rather than returning an error
++ av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
++ hevc_init_worker(s);
++ }
++
++ return 0;
++}
++#endif
++
++#include <sys/stat.h>
++// Decide whether this decoder may be used on the current machine.
++//
++// Returns 1 when stat() on "/dev/rpivid-intcmem" fails (device node absent)
++// and 0 when it succeeds. The probe runs once; the result is cached in a
++// function-local static.
++static int qpu_ok(void)
++{
++    static int cached_result = -1;  // -1 = not probed yet
++
++    if (cached_result == -1)
++    {
++        struct stat st;
++        const int stat_rv = stat("/dev/rpivid-intcmem", &st);
++
++        cached_result = (stat_rv != 0);
++    }
++
++    return cached_result;
++}
++
++// Decoder init callback.
++//
++// Refuses to initialise (AVERROR_DECODER_NOT_FOUND) if qpu_ok() says the
++// decoder is not usable here or if the extradata contains an SPS that
++// is_sps_supported() rejects. For the main context it also creates the
++// global job pool and job control, starts the worker and parses extradata;
++// frame-thread copies stop after hevc_init_context().
++// Returns 0 on success or a negative AVERROR code.
++static av_cold int hevc_decode_init(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int ret;
++
++ if (!qpu_ok())
++ return AVERROR_DECODER_NOT_FOUND;
++
++ // On failure hevc_init_context() has already cleaned up after itself
++ if ((ret = hevc_init_context(avctx)) < 0)
++ return ret;
++
++ // If we are a child context then stop now
++ // Everything after this point is either 1st decode setup or global alloc
++ // that must not be repeated
++ // Global info will be copied into children in update_thread_context (we
++ // can't do it here as we have no way of finding the parent context)
++ if (avctx->internal->is_copy)
++ return 0;
++
++ // Job allocation requires VCSM alloc to work so ensure that we have it
++ // initialised by this point
++ {
++ // Pool size: three jobs per thread, but never fewer than five
++ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
++ if (jbg == NULL) {
++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
++
++ // NOTE(review): if this fails jbg is not referenced by s yet, so the
++ // fail path may not release it - confirm ownership in rpi_job_ctl_new()
++ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) {
++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
++ }
++
++ hevc_init_worker(s);
++
++ s->eos = 1;
++
++ if (avctx->extradata_size > 0 && avctx->extradata) {
++ if ((ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1)) < 0)
++ goto fail;
++
++ if (!all_sps_supported(s)) {
++ ret = AVERROR_DECODER_NOT_FOUND;
++ goto fail;
++ }
++ }
++
++ // Frame threading is the only threading mode recorded; anything else is treated as none
++ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
++ s->threads_type = FF_THREAD_FRAME;
++ else
++ s->threads_type = 0;
++
++ return 0;
++
++fail:
++ hevc_decode_free(avctx);
++ return ret;
++}
++
++// Flush callback: empty the DPB and put the random-access / end-of-sequence
++// state back to the values used at the start of a stream.
++static void hevc_decode_flush(AVCodecContext *avctx)
++{
++    HEVCRpiContext *const ctx = avctx->priv_data;
++
++    ff_hevc_rpi_flush_dpb(ctx);
++
++    ctx->eos = 1;
++    ctx->max_ra = INT_MAX;
++}
++
++// Private data of the "Pi3 QPU" hwaccel (see hwaccel_rpi3_qpu.priv_data_size)
++typedef struct hwaccel_rpi3_qpu_env_s {
++ const AVClass *av_class;
++ // Zero-copy environment: allocated in hwaccel_rpi3_qpu_init(), released in
++ // hwaccel_rpi3_qpu_free(), used by hwaccel_alloc_frame()
++ AVZcEnvPtr zc;
++} hwaccel_rpi3_qpu_env_t;
++
++// hwaccel frame allocator.
++//
++// If av_rpi_zc_in_use() reports true for the context, the user's
++// get_buffer2() callback supplies the frame; otherwise the frame comes from
++// the hwaccel's own zero-copy environment and is resolved immediately. On
++// success decode data is attached; if that fails the frame is unreferenced.
++// Returns 0 on success or the error from whichever step failed.
++static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame)
++{
++    hwaccel_rpi3_qpu_env_t * const env = s->internal->hwaccel_priv_data;
++    int rv;
++
++    if (av_rpi_zc_in_use(s))
++        rv = s->get_buffer2(s, frame, 0);
++    else if ((rv = av_rpi_zc_get_buffer(env->zc, frame)) == 0)
++        rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); // actually do the alloc
++
++    if (rv != 0)
++        return rv;
++
++    rv = ff_attach_decode_data(frame);
++    if (rv < 0)
++        av_frame_unref(frame);
++
++    return rv;
++}
++
++// hwaccel uninit callback: release the zero-copy environment. Always returns 0.
++static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx)
++{
++    hwaccel_rpi3_qpu_env_t * const env = avctx->internal->hwaccel_priv_data;
++
++    av_rpi_zc_int_env_freep(&env->zc);
++
++    return 0;
++}
++
++// hwaccel init callback: allocate the zero-copy environment.
++// Returns 0 on success; on failure logs, cleans up and returns AVERROR(ENOMEM).
++static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx)
++{
++    hwaccel_rpi3_qpu_env_t * const env = avctx->internal->hwaccel_priv_data;
++
++    env->zc = av_rpi_zc_int_env_alloc(avctx);
++    if (env->zc != NULL)
++        return 0;
++
++    av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n");
++    hwaccel_rpi3_qpu_free(avctx);
++    return AVERROR(ENOMEM);
++}
++
++
++// Helpers for the option table: field offset within HEVCRpiContext and the
++// flag set shared by every option (decoding + video parameter)
++#define OFFSET(x) offsetof(HEVCRpiContext, x)
++#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
++
++
++// Private decoder options. Both entries are booleans (default 0) that write
++// to the same field, apply_defdispwin - i.e. "strict-displaywin" is an alias
++// of "apply_defdispwin".
++static const AVOption options[] = {
++ { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin),
++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++ { NULL },
++};
++
++// AVClass of the decoder's private context; exposes the option table above
++static const AVClass hevc_rpi_decoder_class = {
++ .class_name = "HEVC RPI decoder",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++};
++
++// Pixel formats advertised by the decoder (AV_PIX_FMT_NONE terminated).
++// The same two formats have hw config entries in hevc_rpi_hw_configs.
++static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
++ AV_PIX_FMT_SAND128,
++ AV_PIX_FMT_SAND64_10,
++ AV_PIX_FMT_NONE
++};
++
++
++// hwaccel descriptor shared by both hw config entries. It only provides
++// frame allocation plus init/uninit of the zero-copy environment; no
++// per-frame or per-slice decode callbacks are set.
++static const AVHWAccel hwaccel_rpi3_qpu = {
++ .name = "Pi3 QPU Hwaccel",
++ .alloc_frame = hwaccel_alloc_frame,
++ .init = hwaccel_rpi3_qpu_init,
++ .uninit = hwaccel_rpi3_qpu_free,
++ .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t),
++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
++
++// Hardware configuration for AV_PIX_FMT_SAND128 output: ad-hoc method, no
++// hardware device type, served by hwaccel_rpi3_qpu
++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 =
++{
++ .public = {
++ .pix_fmt = AV_PIX_FMT_SAND128,
++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
++ .device_type = AV_HWDEVICE_TYPE_NONE,
++ },
++ .hwaccel = &hwaccel_rpi3_qpu
++};
++// Same as above for AV_PIX_FMT_SAND64_10 output
++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 =
++{
++ .public = {
++ .pix_fmt = AV_PIX_FMT_SAND64_10,
++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
++ .device_type = AV_HWDEVICE_TYPE_NONE,
++ },
++ .hwaccel = &hwaccel_rpi3_qpu
++};
++
++
++// NULL-terminated list referenced by ff_hevc_rpi_decoder.hw_configs
++static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = {
++ &hevc_rpi_hw_config_sand128,
++ &hevc_rpi_hw_config_sand64_10,
++ NULL
++};
++
++
++// Codec registration entry for the "hevc_rpi" decoder
++AVCodec ff_hevc_rpi_decoder = {
++ .name = "hevc_rpi",
++ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .priv_data_size = sizeof(HEVCRpiContext),
++ .priv_class = &hevc_rpi_decoder_class,
++ .init = hevc_decode_init,
++ .close = hevc_decode_free,
++ .decode = hevc_rpi_decode_frame,
++ .flush = hevc_decode_flush,
++ .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context),
++ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
++ AV_CODEC_CAP_HARDWARE |
++ AV_CODEC_CAP_AVOID_PROBING |
++#if 0
++ // Debugging is often easier without threads getting in the way
++ 0,
++#warning H265 threading turned off
++#else
++ // We only have decent optimisation for frame - so only admit to that
++ AV_CODEC_CAP_FRAME_THREADS,
++#endif
++ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE |
++ FF_CODEC_CAP_EXPORTS_CROPPING |
++ FF_CODEC_CAP_ALLOCATE_PROGRESS,
++ .pix_fmts = hevc_rpi_pix_fmts,
++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
++ .hw_configs = hevc_rpi_hw_configs,
++// .wrapper_name = "hevc_rpi",
++};
++
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.h
+@@ -0,0 +1,1091 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDEC_H
++#define AVCODEC_RPI_HEVCDEC_H
++
++#include "config.h"
++
++#include <stdatomic.h>
++
++#include "libavutil/buffer.h"
++
++#include "avcodec.h"
++#include "bswapdsp.h"
++#include "cabac.h"
++#include "get_bits.h"
++#include "rpi_hevcpred.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_mv.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++#include "rpi_hevcdsp.h"
++#include "internal.h"
++#include "thread.h"
++#include "videodsp.h"
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_misc_neon.h"
++#endif
++
++#define MAX_NB_THREADS 16
++#define SHIFT_CTB_WPP 2
++
++//TODO: check if this is really the maximum
++#define MAX_TRANSFORM_DEPTH 5
++
++#define MAX_TB_SIZE 32
++#define MAX_QP 51
++#define DEFAULT_INTRA_TC_OFFSET 2
++
++#define HEVC_CONTEXTS 199
++
++#define MRG_MAX_NUM_CANDS 5
++
++#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64
++
++// Size of DPB array
++#define HEVC_DPB_ELS 32
++
++#define L0 0
++#define L1 1
++
++#define EPEL_EXTRA_BEFORE 1
++#define EPEL_EXTRA_AFTER 2
++#define EPEL_EXTRA 3
++#define QPEL_EXTRA_BEFORE 3
++#define QPEL_EXTRA_AFTER 4
++#define QPEL_EXTRA 7
++
++#define EDGE_EMU_BUFFER_STRIDE 80
++
++#include <semaphore.h>
++#include "rpi_qpu.h"
++
++// Max jobs per frame thread. Actual usage will be limited by the size
++// of the global job pool
++// ?? Limits
++#define RPI_MAX_JOBS 8
++
++// This is the number of _extra_ bit threads - we will have
++// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
++//
++// 0 is legitimate and will disable our WPP processing
++//#define RPI_EXTRA_BIT_THREADS 0
++#define RPI_EXTRA_BIT_THREADS 2
++
++// Number of separate threads/passes in worker
++// 2 and 3 are the currently valid numbers
++// At the moment 3 seems fractionally faster
++//#define RPI_PASSES 2
++#define RPI_PASSES 3
++
++// Print out various usage stats
++#define RPI_TSTATS 0
++
++// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form
++#define RPI_COMPRESS_COEFFS 1
++
++// Wait for VPU/QPU to finish in worker pass 0
++// If 0 then the wait is in pass 1
++//
++// One might expect the better place to wait would be in pass 1 however
++// testing shows that pass 0 produces overall faster decode.
++// Interestingly it is QPU/VPU limited streams that seem to suffer
++// from pass 1 waits, CPU limited ones tend to show a very mild gain.
++// This define exists so it is easy to test this.
++#define RPI_WORKER_WAIT_PASS_0 1
++
++// Use ARM emulation of QPU pred
++// These are for debug only as the emulation makes only limited
++// effort to be fast
++#define RPI_QPU_EMU_Y 0
++#define RPI_QPU_EMU_C 0
++
++// Max width & height we are prepared to consider
++// Sand frame shape calc becomes confused with large frames
++// Some buffer alloc also depends on this
++#define HEVC_RPI_MAX_WIDTH 2048
++#define HEVC_RPI_MAX_HEIGHT 1088
++
++
++// Min CTB size is 16, so this is the largest CTB count a frame can have. Fully parenthesised so it expands safely inside any expression.
++#define HEVC_RPI_MAX_CTBS (((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16))
++
++/**
++ * Value of the luma sample at position (x, y) in the 2D array tab. Both macros capture names from the expansion site: SAMPLE needs a HEVCRpiContext pointer called s (the SPS is reached via s->ps.sps, as elsewhere in this header), SAMPLE_CTB needs a variable min_cb_width.
++ */
++#define SAMPLE(tab, x, y) ((tab)[(y) * s->ps.sps->width + (x)])
++#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
++
++#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
++#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
++ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
++#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
++
++enum RPSType { // Reference picture set kinds (presumably the index into HEVCRpiContext.rps[] - confirm against users)
++ ST_CURR_BEF = 0,
++ ST_CURR_AFT,
++ ST_FOLL,
++ LT_CURR,
++ LT_FOLL,
++ NB_RPS_TYPE, // Not a set kind: evaluates to 5, the number of entries above
++};
++
++enum SyntaxElement {
++ SAO_MERGE_FLAG = 0,
++ SAO_TYPE_IDX,
++ SAO_EO_CLASS,
++ SAO_BAND_POSITION,
++ SAO_OFFSET_ABS,
++ SAO_OFFSET_SIGN,
++ END_OF_SLICE_FLAG,
++ SPLIT_CODING_UNIT_FLAG,
++ CU_TRANSQUANT_BYPASS_FLAG,
++ SKIP_FLAG,
++ CU_QP_DELTA,
++ PRED_MODE_FLAG,
++ PART_MODE,
++ PCM_FLAG,
++ PREV_INTRA_LUMA_PRED_FLAG,
++ MPM_IDX,
++ REM_INTRA_LUMA_PRED_MODE,
++ INTRA_CHROMA_PRED_MODE,
++ MERGE_FLAG,
++ MERGE_IDX,
++ INTER_PRED_IDC,
++ REF_IDX_L0,
++ REF_IDX_L1,
++ ABS_MVD_GREATER0_FLAG,
++ ABS_MVD_GREATER1_FLAG,
++ ABS_MVD_MINUS2,
++ MVD_SIGN_FLAG,
++ MVP_LX_FLAG,
++ NO_RESIDUAL_DATA_FLAG,
++ SPLIT_TRANSFORM_FLAG,
++ CBF_LUMA,
++ CBF_CB_CR,
++ TRANSFORM_SKIP_FLAG,
++ EXPLICIT_RDPCM_FLAG,
++ EXPLICIT_RDPCM_DIR_FLAG,
++ LAST_SIGNIFICANT_COEFF_X_PREFIX,
++ LAST_SIGNIFICANT_COEFF_Y_PREFIX,
++ LAST_SIGNIFICANT_COEFF_X_SUFFIX,
++ LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
++ SIGNIFICANT_COEFF_GROUP_FLAG,
++ SIGNIFICANT_COEFF_FLAG,
++ COEFF_ABS_LEVEL_GREATER1_FLAG,
++ COEFF_ABS_LEVEL_GREATER2_FLAG,
++ COEFF_ABS_LEVEL_REMAINING,
++ COEFF_SIGN_FLAG,
++ LOG2_RES_SCALE_ABS,
++ RES_SCALE_SIGN_FLAG,
++ CU_CHROMA_QP_OFFSET_FLAG,
++ CU_CHROMA_QP_OFFSET_IDX,
++};
++
++enum PartMode {
++ PART_2Nx2N = 0,
++ PART_2NxN = 1,
++ PART_Nx2N = 2,
++ PART_NxN = 3,
++ PART_2NxnU = 4,
++ PART_2NxnD = 5,
++ PART_nLx2N = 6,
++ PART_nRx2N = 7,
++};
++
++enum PredMode {
++ MODE_INTER = 0,
++ MODE_INTRA,
++ MODE_SKIP,
++};
++
++enum InterPredIdc {
++ PRED_L0 = 0,
++ PRED_L1,
++ PRED_BI,
++};
++
++enum PredFlag { // Numbering makes PF_BI == (PF_L0 | PF_L1), so the value can presumably be used as a 2-bit list mask - confirm
++ PF_INTRA = 0, // no list bit set
++ PF_L0, // 1
++ PF_L1, // 2
++ PF_BI, // 3
++};
++
++enum SAOType {
++ SAO_NOT_APPLIED = 0,
++ SAO_BAND,
++ SAO_EDGE,
++ SAO_APPLIED
++};
++
++enum SAOEOClass {
++ SAO_EO_HORIZ = 0,
++ SAO_EO_VERT,
++ SAO_EO_135D,
++ SAO_EO_45D,
++};
++
++enum ScanType {
++ SCAN_DIAG = 0,
++ SCAN_HORIZ,
++ SCAN_VERT,
++};
++
++typedef struct RefPicList {
++ struct HEVCRpiFrame *ref[HEVC_MAX_REFS];
++ int list[HEVC_MAX_REFS];
++ uint8_t isLongTerm[HEVC_MAX_REFS];
++ int nb_refs;
++} RefPicList;
++
++typedef struct RefPicListTab {
++ RefPicList refPicList[2];
++} RefPicListTab;
++
++typedef struct RpiCodingUnit {
++ unsigned int x; // Passed to deblock
++ unsigned int y;
++ unsigned int x_split;
++ unsigned int y_split;
++
++ enum PredMode pred_mode; ///< PredMode
++ enum PartMode part_mode; ///< PartMode
++
++ // Inferred parameters
++ uint8_t intra_split_flag; ///< IntraSplitFlag
++ uint8_t max_trafo_depth; ///< MaxTrafoDepth
++ uint8_t cu_transquant_bypass_flag;
++} RpiCodingUnit;
++
++typedef struct RpiPredictionUnit {
++ uint8_t intra_pred_mode[4];
++ uint8_t intra_pred_mode_c[4];
++ uint8_t chroma_mode_c[4];
++ uint8_t merge_flag;
++} RpiPredictionUnit;
++
++typedef struct HEVCRpiTransformUnit {
++ int8_t cu_qp_delta;
++
++ // Inferred parameters;
++ uint8_t intra_pred_mode;
++ uint8_t intra_pred_mode_c;
++ uint8_t chroma_mode_c;
++ uint8_t is_cu_qp_delta_wanted;
++ uint8_t cu_chroma_qp_offset_wanted;
++ const int8_t * qp_divmod6[3];
++} HEVCRpiTransformUnit;
++
++typedef struct DBParams {
++ int8_t beta_offset; // -12 to +12
++ int8_t tc_offset; // -12 to +12
++} DBParams;
++
++#define HEVC_FRAME_FLAG_OUTPUT (1 << 0)
++#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
++#define HEVC_FRAME_FLAG_LONG_REF (1 << 2)
++#define HEVC_FRAME_FLAG_BUMPING (1 << 3)
++
++struct HEVCRpiJob;
++
++typedef struct HEVCRpiFrame {
++ AVFrame *frame;
++ ThreadFrame tf;
++ ColMvField *col_mvf;
++ int poc;
++ struct HEVCRpiFrame *collocated_ref;
++
++ AVBufferRef *col_mvf_buf;
++
++ /**
++ * A sequence counter, so that old frames are output first
++ * after a POC reset
++ */
++ uint16_t sequence;
++
++ /**
++ * A combination of HEVC_FRAME_FLAG_*
++ */
++ uint8_t flags;
++
++ // Entry no in DPB - can be used as a small unique
++ // frame identifier (within the current thread)
++ uint8_t dpb_no;
++} HEVCRpiFrame;
++
++typedef struct HEVCRpiLocalContext {
++ HEVCRpiTransformUnit tu;
++
++ CABACContext cc;
++
++ // Vars that allow us to locate everything from just an lc
++ struct HEVCRpiContext * context; // ??? make const ???
++ unsigned int lc_n; // lc list el no
++
++ // Job wait links
++ struct HEVCRpiLocalContext * jw_next;
++ struct HEVCRpiLocalContext * jw_prev;
++ struct HEVCRpiLocalContext * ljw_next;
++ struct HEVCRpiLocalContext * ljw_prev;
++ struct HEVCRpiJob * volatile jw_job;
++ sem_t jw_sem;
++
++ // ?? Wrap in structure ??
++ sem_t bt_sem_in;
++ sem_t * bt_psem_out;
++ volatile int bt_terminate;
++ unsigned int ts;
++ unsigned int bt_last_line; // Last line in this bit_thread chunk
++ unsigned int bt_line_no;
++ unsigned int bt_line_width;
++ unsigned int bt_line_inc;
++
++ struct HEVCRpiJob * jb0;
++ char unit_done; // Set once we have dealt with this slice
++ char bt_is_tile;
++ char last_progress_good;
++ char cabac_init_req;
++
++ uint8_t cabac_state[HEVC_CONTEXTS];
++ uint8_t stat_coeff[4];
++ GetBitContext gb;
++
++ uint8_t ct_depth;
++ int8_t qp_y;
++ int8_t curr_qp_y;
++ int8_t qPy_pred;
++
++// N.B. Used by asm (neon) - do not change
++#define AVAIL_S_UR 0
++#define AVAIL_S_U 1
++#define AVAIL_S_UL 2
++#define AVAIL_S_L 3
++#define AVAIL_S_DL 4
++
++#define AVAIL_U (1 << AVAIL_S_U)
++#define AVAIL_L (1 << AVAIL_S_L)
++#define AVAIL_UL (1 << AVAIL_S_UL)
++#define AVAIL_UR (1 << AVAIL_S_UR)
++#define AVAIL_DL (1 << AVAIL_S_DL)
++
++// Intra filters - same number space as avail
++#define FILTER_LIGHT 0x40
++#define FILTER_STRONG 0x80
++#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG)
++
++ uint8_t ctb_avail;
++ int end_of_ctb_x;
++ int end_of_ctb_y;
++
++ RpiCodingUnit cu;
++ RpiPredictionUnit pu;
++
++#define BOUNDARY_LEFT_SLICE (1 << 0)
++#define BOUNDARY_LEFT_TILE (1 << 1)
++#define BOUNDARY_UPPER_SLICE (1 << 2)
++#define BOUNDARY_UPPER_TILE (1 << 3)
++ /* properties of the boundary of the current CTB for the purposes
++ * of the deblocking filter */
++ unsigned int boundary_flags;
++
++#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE)
++ uint8_t ipm_left[IPM_TAB_SIZE];
++ uint8_t ipm_up[IPM_TAB_SIZE];
++
++//#define MVF_STASH_WIDTH 128
++#define MVF_STASH_WIDTH 64
++#define MVF_STASH_HEIGHT 64
++#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE)
++#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE)
++ HEVCRpiMvField mvf_ul[1];
++ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU];
++
++ /* +7 is for subpixel interpolation, *2 for high bit depths */
++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++ /* The extended size between the new edge emu buffer is abused by SAO */
++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
++
++} HEVCRpiLocalContext;
++
++// Each block can have an intra prediction and an add_residual command
++// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH
++
++// Sand only has 2 planes (Y/C)
++#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4))
++
++// Command codes for intra prediction and for adding residuals (or a DC value) to predictions
++enum rpi_pred_cmd_e
++{
++ RPI_PRED_ADD_RESIDUAL, // First enumerator (value 0)
++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx (presumably c_idx == 1)
++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx (presumably c_idx == 2)
++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
++ RPI_PRED_ADD_DC,
++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C
++ RPI_PRED_ADD_DC_V,
++ RPI_PRED_INTRA,
++ RPI_PRED_INTRA_C,
++ RPI_PRED_I_PCM,
++ RPI_PRED_CMD_MAX // Not a command: one past the last valid code
++};
++
++typedef struct HEVCPredCmd { // One queued prediction command: four common bytes followed by a per-command payload
++ uint8_t type; // presumably an enum rpi_pred_cmd_e value selecting the union member below - confirm against users
++ uint8_t size; // log2 "size" used by all variants
++ uint8_t avail; // i_pred - but left here as they pack well
++ uint8_t dummy; // no use visible in this header; looks like padding ahead of the union - confirm
++ union {
++ struct { // TRANSFORM_ADD (presumably the RPI_PRED_ADD_RESIDUAL* commands)
++ uint8_t * dst;
++ const int16_t * buf;
++ uint16_t stride; // Should be good enough for all pic fmts we use
++ int16_t dc;
++ } ta;
++ struct { // presumably the RPI_PRED_ADD_DC* commands - note wider stride and dc than in ta
++ uint8_t * dst;
++ uint32_t stride;
++ int dc;
++ } dc;
++ struct { // INTRA
++ uint16_t x;
++ uint16_t y;
++ enum IntraPredMode mode;
++ } i_pred;
++ struct { // I_PCM
++ uint16_t x;
++ uint16_t y;
++ const void * src;
++ uint32_t src_len;
++ } i_pcm;
++ };
++} HEVCPredCmd;
++
++union qpu_mc_pred_cmd_u; // tag now matches the `union qpu_mc_pred_cmd_u *` members below (was declared as ..._s)
++struct qpu_mc_pred_y_p_s;
++struct qpu_mc_src_s;
++// State of one inter-prediction command queue; HEVCRpiInterPredEnv.q points at an array of these
++typedef struct HEVCRpiInterPredQ
++{
++ union qpu_mc_pred_cmd_u *qpu_mc_base; // presumably the start of this queue's command buffer - confirm
++ union qpu_mc_pred_cmd_u *qpu_mc_curr; // presumably the next free command slot - confirm
++ struct qpu_mc_src_s *last_l0;
++ struct qpu_mc_src_s *last_l1;
++ unsigned int load;
++ uint32_t code_setup;
++ uint32_t code_sync;
++ uint32_t code_exit;
++} HEVCRpiInterPredQ;
++
++typedef struct HEVCRpiInterPredEnv
++{
++ HEVCRpiInterPredQ * q;
++ uint8_t n; // Number of Qs
++ uint8_t n_grp; // Number of Q in a group
++ uint8_t curr; // Current Q number (0..n-1)
++ uint8_t used; // 0 if nothing in any Q, 1 otherwise
++ uint8_t used_grp; // 0 if nothing in any Q in the current group
++ unsigned int max_fill;
++ unsigned int min_gap;
++ GPU_MEM_PTR_T gptr;
++} HEVCRpiInterPredEnv;
++
++typedef struct HEVCRpiIntraPredEnv {
++ unsigned int n; // Number of commands
++ HEVCPredCmd * cmds;
++} HEVCRpiIntraPredEnv;
++
++typedef struct HEVCRpiCoeffEnv { // Bookkeeping for one coefficient buffer (HEVCRpiCoeffsEnv holds four of these)
++ unsigned int n; // presumably the number of int16_t entries used in buf - confirm
++#if RPI_COMPRESS_COEFFS
++ unsigned int packed; // 1 while coefficients are still to be packed, 0 once packing has stopped
++ unsigned int packed_n; // Value of n at the moment packed was set to 0 (i.e. the amount that is sent compressed). Only valid if packed==0
++#endif
++ int16_t * buf;
++} HEVCRpiCoeffEnv;
++
++typedef struct HEVCRpiCoeffsEnv {
++ HEVCRpiCoeffEnv s[4];
++ GPU_MEM_PTR_T gptr;
++ void * mptr;
++} HEVCRpiCoeffsEnv;
++
++typedef struct HEVCRpiFrameProgressWait {
++ int req;
++ struct HEVCRpiFrameProgressWait * next;
++ sem_t sem;
++} HEVCRpiFrameProgressWait;
++
++typedef struct HEVCRpiFrameProgressState {
++ struct HEVCRpiFrameProgressWait * first;
++ struct HEVCRpiFrameProgressWait * last;
++ pthread_mutex_t lock;
++} HEVCRpiFrameProgressState;
++
++typedef struct RpiBlk
++{
++ unsigned int x;
++ unsigned int y;
++ unsigned int w;
++ unsigned int h;
++} RpiBlk;
++
++typedef struct HEVCRpiJob {
++ struct HEVCRpiJob * next; // Free chain
++ struct HEVCRpiJobCtl * jbc_local;
++ const HEVCRpiSPS * sps; // sps used to set up this job
++
++ int waited;
++ int ctu_ts_first;
++ int ctu_ts_last;
++ RpiBlk bounds; // Bounding box of job
++
++ struct qpu_mc_pred_y_p_s * last_y8_p;
++ struct qpu_mc_src_s * last_y8_l1;
++ rpi_cache_flush_env_t * rfe;
++
++ HEVCRpiInterPredEnv chroma_ip;
++ HEVCRpiInterPredEnv luma_ip;
++ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no
++ HEVCRpiIntraPredEnv intra;
++ HEVCRpiCoeffsEnv coeffs;
++ HEVCRpiFrameProgressWait progress_wait;
++ sem_t sem;
++ rpi_cache_buf_t flush_buf;
++} HEVCRpiJob;
++
++struct HEVCRpiContext;
++
++typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb);
++
++typedef struct HEVCRpiPassQueue
++{
++// int pending;
++ volatile int terminate;
++ sem_t sem_in;
++ sem_t * psem_out;
++ unsigned int job_n;
++ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread
++ HEVCRpiWorkerFn * worker;
++ pthread_t thread;
++ uint8_t pass_n; // Pass number - debug
++ uint8_t started;
++} HEVCRpiPassQueue;
++
++
++struct HEVCRpiJobGlobal;
++
++typedef struct HEVCRpiJobCtl
++{
++ sem_t sem_out;
++
++ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated
++ struct HEVCRpiJobGlobal * jbg;
++
++ HEVCRpiLocalContext * lcw_head;
++ HEVCRpiLocalContext * lcw_tail;
++
++ pthread_mutex_t in_lock;
++ int offload_in;
++
++ HEVCRpiJob *offloadq[RPI_MAX_JOBS];
++} HEVCRpiJobCtl;
++
++
++typedef struct HEVCRpiJobGlobal // Job pool state reached through HEVCRpiJobCtl.jbg
++{
++ intptr_t ref_count;
++ pthread_mutex_t lock; // presumably guards the lists below - confirm
++ HEVCRpiJob * free1; // Singly linked list of free jobs (HEVCRpiJob.next is the free chain link)
++ HEVCRpiLocalContext * wait_head; // Doubly linked list of lcs waiting for a job
++ HEVCRpiLocalContext * wait_good; // Last good tail
++ HEVCRpiLocalContext * wait_tail;
++
++} HEVCRpiJobGlobal;
++
++#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1)
++
++#if RPI_TSTATS
++typedef struct HEVCRpiStats {
++ int y_pred1_y8_merge;
++ int y_pred1_xy;
++ int y_pred1_x0;
++ int y_pred1_y0;
++ int y_pred1_x0y0;
++ int y_pred1_wle8;
++ int y_pred1_wgt8;
++ int y_pred1_hle16;
++ int y_pred1_hgt16;
++ int y_pred2_xy;
++ int y_pred2_x0;
++ int y_pred2_y0;
++ int y_pred2_x0y0;
++ int y_pred2_hle16;
++ int y_pred2_hgt16;
++} HEVCRpiStats;
++#endif
++
++typedef struct HEVCRpiCabacState
++{
++ uint8_t rice[4];
++ uint8_t state[HEVC_CONTEXTS];
++} HEVCRpiCabacState;
++
++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels
++#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1)
++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte
++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el
++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row
++#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++
++typedef struct HEVCRpiContext { // Top-level decoder state; nearly every function in this header takes a pointer to it
++ const AVClass *c; // needed by private avoptions
++ AVCodecContext *avctx;
++
++ uint8_t threads_type; // 0 disables the progress wait/signal helpers declared later in this header
++ char qpu_init_ok;
++
++ /** 1 if the independent slice segment header was successfully parsed */
++ uint8_t slice_initialized;
++ char used_for_ref; // rpi
++ char is_irap;
++ char offload_recon;
++ uint8_t eos; ///< current packet contains an EOS/EOB NAL
++ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL
++ uint8_t no_backward_pred_flag;
++ uint8_t is_decoded;
++ uint8_t no_rasl_output_flag;
++
++
++ /**
++ * Sequence counters for decoded and output frames, so that old
++ * frames are output first after a POC reset
++ */
++ uint16_t seq_decode;
++ uint16_t seq_output;
++
++ int width;
++ int height;
++
++ HEVCRpiJobCtl * jbc;
++ // cabac stash
++ // b0 skip flag
++ // b1+ ct_depth
++ uint8_t * cabac_stash_left;
++ uint8_t * cabac_stash_up;
++
++ // Function pointers
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ const uint8_t * qpu_dummy_frame_emu;
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory
++#endif
++ HEVCRpiQpu qpu;
++
++ HEVCRpiFrameProgressState progress_states[2];
++
++ HEVCRpiCabacState *cabac_save;
++
++ AVFrame *frame;
++ AVFrame *output_frame;
++ uint8_t *sao_pixel_buffer_h[3];
++ uint8_t *sao_pixel_buffer_v[3];
++
++ unsigned int col_mvf_stride;
++ AVBufferPool *col_mvf_pool;
++
++ RpiSAOParams *sao;
++ DBParams *deblock;
++ enum HEVCNALUnitType nal_unit_type;
++ int temporal_id; ///< temporal_id_plus1 - 1
++ HEVCRpiFrame *ref;
++ int poc;
++ int pocTid0;
++ int slice_idx; ///< number of the slice being currently decoded
++ int max_ra;
++
++ int8_t *qp_y_tab;
++
++ // Deblocking block strength bitmaps
++ unsigned int bs_stride2;
++ unsigned int bs_size;
++ uint8_t *bs_horizontal;
++ uint8_t *bs_vertical;
++ uint8_t *bsf_stash_up;
++ uint8_t *bsf_stash_left;
++
++#if HEVC_RPI_MAX_CTBS >= 0xffff
++#define TAB_SLICE_ADDR_BROKEN (~(uint32_t)0)
++ uint32_t *tab_slice_address;
++#else
++#define TAB_SLICE_ADDR_BROKEN ((uint16_t)~0U) // not ~(uint16_t)0: integer promotion turns that into the int -1, which never equals a stored uint16_t
++ uint16_t *tab_slice_address;
++#endif
++
++ // Bitfield 1 bit per 8 pels (min pcm size)
++ uint8_t *is_pcm;
++ // Bitfield 1 bit per 8 pels (min cb size)
++ // Only needed for CIP as CIP processing is async to the main thread
++ uint8_t *is_intra;
++
++ // PU
++ HEVCRpiMvField *mvf_up;
++ HEVCRpiMvField *mvf_left;
++
++ const RefPicList **rpl_up;
++ const RefPicList **rpl_left;
++ RefPicList * refPicList;
++
++ // CTB-level flags affecting loop filter operation
++ uint8_t *filter_slice_edges;
++
++ /** used on BE to byteswap the lines for checksumming */
++ uint8_t *checksum_buf;
++ int checksum_buf_size;
++
++ const uint8_t *data;
++
++ H2645Packet pkt;
++ // type of the first VCL NAL of the current frame
++ enum HEVCNALUnitType first_nal_type;
++
++ uint8_t context_initialized;
++ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated
++ ///< as a format defined in 14496-15
++ int apply_defdispwin;
++
++ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
++ int nuh_layer_id;
++
++ struct AVMD5 *md5_ctx;
++
++ RefPicListTab * rpl_tab;
++ unsigned int rpl_tab_size;
++
++ uint8_t *is_intra_store;
++
++ RpiSliceHeader sh;
++
++ HEVCRpiParamSets ps;
++
++ HEVCRpiLocalContext *HEVClc;
++ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS];
++
++ HEVCRpiFrame DPB[HEVC_DPB_ELS];
++
++ ///< candidate references for the current frame
++ RefPicList rps[NB_RPS_TYPE]; // same size as before (NB_RPS_TYPE == 5); presumably one list per enum RPSType entry
++
++ HEVCRpiPredContext hpc;
++ HEVCDSPContext hevcdsp;
++
++ HEVCSEIContext sei;
++
++ // Put structures that allocate non-trivial storage at the end
++ // These are mostly used indirectly so position in the structure doesn't matter
++ HEVCRpiPassQueue passq[RPI_PASSES];
++#if RPI_EXTRA_BIT_THREADS > 0
++ int bt_started;
++ // This simply contains thread descriptors - task setup is held elsewhere
++ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS];
++#endif
++#if RPI_TSTATS
++ HEVCRpiStats tstats;
++#endif
++} HEVCRpiContext;
++
++/**
++ * Mark all frames in DPB as unused for reference.
++ */
++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s);
++
++/**
++ * Drop all frames currently in DPB.
++ */
++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
++
++/**
++ * Construct the reference picture sets for the current frame.
++ */
++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s);
++
++/**
++ * Construct the reference picture list(s) for the current slice.
++ */
++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
++
++
++/**
++ * Get the number of candidate references for the current frame.
++ */
++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s);
++
++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc);
++
++/**
++ * Find next frame in output order and put a reference to it in frame.
++ * @return 1 if a frame was output, 0 otherwise
++ */
++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush);
++
++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s);
++
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags);
++
++unsigned int ff_hevc_rpi_tb_avail_flags(
++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h);
++
++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
++ int nPbH, int log2_cb_size, int part_idx,
++ int merge_idx, HEVCRpiMvField * const mv);
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int nPbW, const unsigned int nPbH,
++ const unsigned int avail,
++ HEVCRpiMvField * const mv,
++ const unsigned int mvp_lx_flag, const unsigned int LX);
++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase);
++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int log2_trafo_size, const int is_coded_block);
++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot);
++
++extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra[4];
++
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n);
++
++// arm/hevc_misc_neon.S
++// Neon coeff zap fn
++#if HAVE_NEON
++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
++#endif
++
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCRpiFrame * const ref, const int val, const int field);
++
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field);
++
++// All of these expect that s->threads_type == FF_THREAD_FRAME
++
++// Forward to ff_hevc_rpi_progress_wait_field() on field 1 with value y for ref; a no-op when threads_type is 0
++static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const HEVCRpiFrame * const ref, const int y) {
++    if (s->threads_type == 0)
++        return;
++    ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
++}
++
++// Report y on progress field 1, but only when threads_type is non-zero and used_for_ref is set
++static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) {
++    if (s->threads_type != 0 && s->used_for_ref)
++        ff_hevc_rpi_progress_signal_field(s, y, 1);
++}
++
++// Forward to ff_hevc_rpi_progress_wait_field() on field 0; unlike the _mv variant there is no threads_type test here
++static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const HEVCRpiFrame * const ref, const int y) {
++    const int recon_field = 0;
++    ff_hevc_rpi_progress_wait_field(s, jb, ref, y, recon_field);
++}
++
++// Report y on progress field 0 (the field the _recon helpers use).
++// Does nothing unless used_for_ref is set and threads_type is non-zero.
++static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) {
++    if (!s->used_for_ref || s->threads_type == 0)
++        return;
++    ff_hevc_rpi_progress_signal_field(s, y, 0);
++}
++
++// Signal INT_MAX on both progress fields (0 first, then 1); no used_for_ref / threads_type test is made here
++static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s) {
++    for (int field = 0; field < 2; field++)
++        ff_hevc_rpi_progress_signal_field(s, INT_MAX, field);
++}
++
++
++// Set all done - signal nothing (used in missing refs): INT_MAX is stored directly
++// into the first two ints of the frame's progress buffer.
++// Works for both rpi & non-rpi. A frame without a progress buffer is left untouched.
++static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref) {
++    int *fields;
++    if (ref->tf.progress == NULL)
++        return;
++    fields = (int *)ref->tf.progress->data;
++    fields[0] = INT_MAX;
++    fields[1] = INT_MAX;
++}
++
++#define HEVC_RPI_420_ONLY 1
++#define HEVC_RPI_SAND128_ONLY 1
++
++// Horizontal shift for component cidx: with HEVC_RPI_420_ONLY it is 0 for cidx 0 and 1 otherwise, else it is read from the SPS
++static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) {
++#if !HEVC_RPI_420_ONLY
++    return s->ps.sps->hshift[cidx];
++#else
++    return cidx != 0;
++#endif
++}
++
++// Vertical shift for component cidx: with HEVC_RPI_420_ONLY it is 0 for cidx 0 and 1 otherwise, else it is read from the SPS
++static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) {
++#if !HEVC_RPI_420_ONLY
++    return s->ps.sps->vshift[cidx];
++#else
++    return cidx != 0;
++#endif
++}
++
++// Chroma format idc in use: the constant 1 when HEVC_RPI_420_ONLY is set, otherwise the SPS chroma_format_idc
++static inline int ctx_cfmt(const HEVCRpiContext * const s) {
++#if !HEVC_RPI_420_ONLY
++    return s->ps.sps->chroma_format_idc;
++#else
++    return 1;
++#endif
++}
++
++// First-level stride of plane c_idx: the constant 128 when HEVC_RPI_SAND128_ONLY is set, otherwise frame->linesize[c_idx]
++static inline int frame_stride1(const AVFrame * const frame, const int c_idx) {
++#if !HEVC_RPI_SAND128_ONLY
++    return frame->linesize[c_idx];
++#else
++    return 128;
++#endif
++}
++
++#if HEVC_RPI_SAND128_ONLY
++// Propagate this decision to later zc includes
++#define RPI_ZC_SAND128_ONLY 1
++#endif
++
++#ifndef ff_hevc_rpi_copy_vert
++// Copy a vertical strip of `height` rows from src to dst, one element per row.
++// pixel_shift 2 moves 4 bytes per row, 1 moves 2 bytes, anything else 1 byte.
++// After each row dst and src advance by stride_dst / stride_src bytes.
++// A height of zero or less copies nothing.
++static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src,
++                                         int pixel_shift, int height,
++                                         ptrdiff_t stride_dst, ptrdiff_t stride_src) {
++    if (pixel_shift == 2) {
++        // 4-byte elements
++        for (; height > 0; height--) {
++            *(uint32_t *)dst = *(const uint32_t *)src;
++            dst += stride_dst;
++            src += stride_src;
++        }
++    } else if (pixel_shift == 1) {
++        // 2-byte elements
++        for (; height > 0; height--) {
++            *(uint16_t *)dst = *(const uint16_t *)src;
++            dst += stride_dst;
++            src += stride_src;
++        }
++    } else {
++        // single bytes (pixel_shift 0 and every other value)
++        for (; height > 0; height--) {
++            *dst = *src;
++            dst += stride_dst;
++            src += stride_src;
++        }
++    }
++}
++#endif
++
++
++#if MVF_STASH_WIDTH == 64
++// Stash entry for (x, y): each coordinate is reduced to its offset inside the CTB, shifted by LOG2_MIN_PU_SIZE, then used as column / row
++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                                            const unsigned int x, const unsigned int y) {
++    const unsigned int ctb_lo_mask = ~(~0U << s->ps.sps->log2_ctb_size);
++    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ctb_lo_mask) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ctb_lo_mask) >> LOG2_MIN_PU_SIZE));
++}
++
++// Pick the MV field store holding (x, y) relative to the CTB that contains (x0, y0):
++// above and left -> lc->mvf_ul, above -> s->mvf_up, left -> s->mvf_left, otherwise lc->mvf_stash
++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                                      const unsigned int x0, const unsigned int y0,
++                                      const unsigned int x, const unsigned int y) {
++    const unsigned int ctb_hi_mask = (~0U << s->ps.sps->log2_ctb_size);
++    const int is_left = x < (x0 & ctb_hi_mask);
++    const int is_above = y < (y0 & ctb_hi_mask);
++
++    if (is_above)
++        return (HEVCRpiMvField *)(is_left ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
++    if (is_left)
++        return (HEVCRpiMvField *)(s->mvf_left + (y >> LOG2_MIN_PU_SIZE));
++    return (HEVCRpiMvField *)(lc->mvf_stash + ((y & ~ctb_hi_mask) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~ctb_hi_mask) >> LOG2_MIN_PU_SIZE));
++}
++
++// Element step between vertically adjacent entries at column x: 1 when x lies left of the CTB
++// containing x0 (entries then come from s->mvf_left), otherwise the stash row pitch MVF_STASH_WIDTH_PU
++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
++                                           const unsigned int x0,
++                                           const unsigned int x) {
++    const unsigned int ctb_hi_mask = (~0U << s->ps.sps->log2_ctb_size);
++    return x < (x0 & ctb_hi_mask) ? 1 : MVF_STASH_WIDTH_PU;
++}
++
++#else
++// Stash entry for (x, y): the row is y's offset inside the CTB, the column is x (both >> LOG2_MIN_PU_SIZE) wrapped to the stash width
++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                                            const unsigned int x, const unsigned int y) {
++    const unsigned int ctb_lo_mask = ~(~0U << s->ps.sps->log2_ctb_size);
++    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ctb_lo_mask) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)));
++}
++
++// Pick the MV field store holding (x, y) relative to the CTB that contains (x0, y0).
++// Rows above that CTB come from s->mvf_up (or lc->mvf_ul when also to its left);
++// everything else, including positions to the left, is looked up in the stash.
++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                                      const unsigned int x0, const unsigned int y0,
++                                      const unsigned int x, const unsigned int y) {
++    const unsigned int ctb_hi_mask = (~0U << s->ps.sps->log2_ctb_size);
++
++    // Same CTB row (or below): always the stash
++    if (y >= (y0 & ctb_hi_mask))
++        return mvf_stash_ptr(s, lc, x, y);
++    // Above: if not in the same CTB for X too assume up-left
++    if (x < (x0 & ctb_hi_mask))
++        return (HEVCRpiMvField *)lc->mvf_ul;
++    return (HEVCRpiMvField *)(s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
++}
++
++// In this build variant the step is always the stash row pitch; s, x0 and x are accepted only to keep the signature shared
++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
++                                           const unsigned int x0,
++                                           const unsigned int x) {
++    return MVF_STASH_WIDTH_PU;
++}
++#endif
++
++#endif /* AVCODEC_RPI_HEVCDEC_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.c
+@@ -0,0 +1,450 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcdsp.h"
++#include "rpi_hevc_mv.h"
++
++static const int8_t transform[32][32] = {
++ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
++ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
++ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
++ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90,
++ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 },
++ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
++ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 },
++ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89,
++ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
++ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
++ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 },
++ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87,
++ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 },
++ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
++ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 },
++ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83,
++ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
++ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
++ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 },
++ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80,
++ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 },
++ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
++ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 },
++ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75,
++ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
++ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
++ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 },
++ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70,
++ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 },
++ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
++ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 },
++ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64,
++ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
++ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
++ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 },
++ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57,
++ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 },
++ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
++ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 },
++ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50,
++ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
++ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
++ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 },
++ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43,
++ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 },
++ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
++ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 },
++ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36,
++ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
++ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
++ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 },
++ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25,
++ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 },
++ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
++ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 },
++ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18,
++ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
++ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
++ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 },
++ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9,
++ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 },
++ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90,
++ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 },
++};
++// 4-tap EPEL filter coefficients: 7 rows of 4 taps, every row summing to 64.
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = {
++ { -2, 58, 10, -2},
++ { -4, 54, 16, -2},
++ { -6, 46, 28, -4},
++ { -4, 36, 36, -4},
++ { -4, 28, 46, -6},
++ { -2, 16, 54, -4},
++ { -2, 10, 58, -2},
++};
++// 8-tap QPEL filter coefficients: each of the 3 rows holds the same 8 taps twice; the 8 taps sum to 64.
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = {
++ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0},
++ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1},
++ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1}
++};
++// Instantiate rpi_hevcdsp_template.c once per supported bit depth (8, 9, 10, 12).
++#define BIT_DEPTH 8
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++// Build a word of 2-bit deblocking boundary strengths: one per unit in 'pus', each repeated 'dup' times.
++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
++                                                   const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++                                                   int in_inc0, int in_inc1)
++{
++    int shift = 32; // bits of bs that have not been filled yet
++    uint32_t bs = 0;
++    for (; pus > 0; pus--) {
++        int strength, out;
++        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
++        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
++        int nr_idx0 = neigh->ref_idx[0];
++        int nr_idx1 = neigh->ref_idx[1];
++        int neigh_refL0, neigh_refL1;
++        // Range-check the neighbour indices before they are used as subscripts
++        av_assert0(nr_idx0 >= 0 && nr_idx0 <=31);
++        av_assert0(nr_idx1 >= 0 && nr_idx1 <=31);
++        neigh_refL0 = neigh_rpl0[nr_idx0];
++        neigh_refL1 = neigh_rpl1[nr_idx1];
++#if 1 // This more directly matches the original implementation
++        if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
++            // same L0 and L1
++            if (curr_refL0 == neigh_refL0 &&
++                curr_refL0 == curr_refL1 &&
++                neigh_refL0 == neigh_refL1) {
++                if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
++                     FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) &&
++                    (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
++                     FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4))
++                    strength = 1;
++                else
++                    strength = 0;
++            } else if (neigh_refL0 == curr_refL0 &&
++                       neigh_refL1 == curr_refL1) {
++                if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
++                    FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else if (neigh_refL1 == curr_refL0 &&
++                       neigh_refL0 == curr_refL1) {
++                if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
++                    FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else {
++                strength = 1;
++            }
++        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
++            MvXY curr_mv0, neigh_mv0;
++
++            if (curr->pred_flag & 1) {
++                curr_mv0 = curr->xy[0];
++            } else {
++                curr_mv0 = curr->xy[1];
++                curr_refL0 = curr_refL1;
++            }
++
++            if (neigh->pred_flag & 1) {
++                neigh_mv0 = neigh->xy[0];
++            } else {
++                neigh_mv0 = neigh->xy[1];
++                neigh_refL0 = neigh_refL1;
++            }
++
++            if (curr_refL0 == neigh_refL0) {
++                if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else
++                strength = 1;
++        } else
++            strength = 1;
++#else // This has exactly the same effect, but is more suitable for vectorisation
++        MvXY curr_mv[2];
++        MvXY neigh_mv[2];
++        memcpy(curr_mv, curr->xy, sizeof curr_mv);
++        memcpy(neigh_mv, neigh->xy, sizeof neigh_mv);
++
++        if (!(curr->pred_flag & 2)) {
++            curr_mv[1] = curr_mv[0];
++            curr_refL1 = curr_refL0;
++        }
++        if (!(neigh->pred_flag & 2)) {
++            neigh_mv[1] = neigh_mv[0];
++            neigh_refL1 = neigh_refL0;
++        }
++        if (!(curr->pred_flag & 1)) {
++            curr_mv[0] = curr_mv[1];
++            curr_refL0 = curr_refL1;
++        }
++        if (!(neigh->pred_flag & 1)) {
++            neigh_mv[0] = neigh_mv[1];
++            neigh_refL0 = neigh_refL1;
++        }
++
++        strength = 1;
++
++        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
++                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) |
++                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4);
++
++        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
++                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) |
++                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4);
++
++        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
++#endif
++
++        curr += in_inc0 / (int)sizeof (HEVCRpiMvField);   // signed divide: in_inc0 is a byte step
++        neigh += in_inc1 / (int)sizeof (HEVCRpiMvField);  // signed divide: in_inc1 is a byte step
++
++        for (out = dup; out > 0; out--)
++        {
++            bs = (bs >> 2) | (strength << 30);
++            shift -= 2;
++        }
++    }
++    return shift >= 32 ? 0 : bs >> shift; // no outputs leaves shift == 32; more than 16 outputs do not fit in bs
++}
++
++// Copy a width x height block of bytes between two strided buffers.
++static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height)
++{
++    // 16-byte copies are used only when both pointers and both strides are multiples of 16
++    const int all_aligned16 = !(((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15);
++    unsigned int row, col;
++
++    // NOTE(review): rows are copied in whole 8/16-byte chunks, so a width that is
++    // not a multiple of the chunk size is rounded up - confirm callers allow it.
++    for (row = 0; row < height; row++) {
++        if (all_aligned16) {
++            for (col = 0; col < width; col += 16)
++                AV_COPY128(dst + col, src + col);
++        } else {
++            for (col = 0; col < width; col += 8)
++                AV_COPY64U(dst + col, src + col);
++        }
++        dst += stride_dst;
++        src += stride_src;
++    }
++}
++
++
++// Fill *hevcdsp with the C implementations for bit_depth (9, 10 or 12; anything else selects 8), then call the per-arch init hooks.
++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++{
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth
++ // PEL_FUNC stores one function in all 10 first-index slots of a [10][2][2] table
++#undef PEL_FUNC
++#define PEL_FUNC(dst1, idx1, idx2, a, depth) \
++ for(i = 0 ; i < 10 ; i++) \
++{ \
++ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \
++}
++
++#undef EPEL_FUNCS
++#define EPEL_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \
++ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \
++ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \
++ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
++
++#undef EPEL_UNI_FUNCS
++#define EPEL_UNI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
++
++#undef EPEL_BI_FUNCS
++#define EPEL_BI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
++
++#undef QPEL_FUNCS
++#define QPEL_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \
++ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \
++ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
++
++#undef QPEL_UNI_FUNCS
++#define QPEL_UNI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
++
++#undef QPEL_BI_FUNCS
++#define QPEL_BI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
++
++#define SLICED_ADD_RESIDUAL(depth)\
++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \
++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \
++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \
++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \
++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \
++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \
++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \
++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \
++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \
++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \
++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \
++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \
++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \
++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth)
++#define SLICED_LOOP_FILTERS(depth)\
++ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \
++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \
++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth)
++#define SLICED_SAO(depth)\
++ for (i = 0; i != SAO_FILTER_N; ++i) { \
++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
++ } \
++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
++
++#define HEVC_DSP(depth) \
++ hevcdsp->put_pcm = FUNC(put_pcm, depth); \
++ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \
++ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \
++ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \
++ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \
++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \
++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \
++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \
++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \
++ SLICED_ADD_RESIDUAL(depth); \
++ hevcdsp->dequant = FUNC(dequant, depth); \
++ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
++ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \
++ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
++ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
++ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
++ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \
++ \
++ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \
++ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \
++ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \
++ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \
++ \
++ for (i = 0; i != SAO_FILTER_N; ++i) { \
++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \
++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \
++ } \
++ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
++ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
++ SLICED_SAO(depth); \
++ \
++ QPEL_FUNCS(depth); \
++ QPEL_UNI_FUNCS(depth); \
++ QPEL_BI_FUNCS(depth); \
++ EPEL_FUNCS(depth); \
++ EPEL_UNI_FUNCS(depth); \
++ EPEL_BI_FUNCS(depth); \
++ \
++ SLICED_LOOP_FILTERS(depth); \
++ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \
++ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \
++ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
++ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \
++ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \
++ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \
++ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
++ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
++int i = 0; // loop index used by the PEL_FUNC, SLICED_SAO and HEVC_DSP macro bodies
++
++ switch (bit_depth) {
++ case 9:
++ HEVC_DSP(9);
++ break;
++ case 10:
++ HEVC_DSP(10);
++ break;
++ case 12:
++ HEVC_DSP(12);
++ break;
++ default:
++ HEVC_DSP(8);
++ break;
++ }
++
++ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
++ hevcdsp->cpy_blk = cpy_blk;
++ // Per-architecture hooks; presumably each ARCH_* is a compile-time 0/1 constant - TODO confirm
++ if (ARCH_PPC)
++ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
++ if (ARCH_X86)
++ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
++ if (ARCH_ARM)
++ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
++ if (ARCH_MIPS)
++ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.h
+@@ -0,0 +1,177 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ *
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDSP_H
++#define AVCODEC_RPI_HEVCDSP_H
++
++#include "hevc.h"
++#include "get_bits.h"
++
++struct HEVCRpiMvField;
++
++#define MAX_PB_SIZE 64
++
++#define RPI_HEVC_SAO_BUF_STRIDE 160
++
++// SAO parameter set passed to the sao_edge_restore / sao_edge_restore_c callbacks of HEVCDSPContext.
++typedef struct RpiSAOParams {
++ uint8_t band_position[3]; ///< sao_band_position (Y,U,V)
++ uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V)
++ uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V)
++
++ int16_t offset_val[3][5]; ///<SaoOffsetVal (Y,U,V)
++
++} RpiSAOParams;
++
++
++// This controls how many sao dsp functions there are
++// N=5 has width = 8, 16, 32, 48, 64
++// N=6 adds a function for width=24 (in fn array el 5 so existing code should
++// still work)
++#define SAO_FILTER_N 6
++
++// Table of DSP entry points; ff_hevc_rpi_dsp_init() fills it for one bit depth.
++typedef struct HEVCDSPContext {
++    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                    struct GetBitContext *gb, int pcm_bit_depth);
++
++    void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++    void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
++    void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
++    void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
++
++    void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
++    void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
++    void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                      struct GetBitContext *gb, int pcm_bit_depth);
++
++    void (*dequant)(int16_t *coeffs, int16_t log2_size);
++
++    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
++
++    void (*transform_4x4_luma)(int16_t *coeffs);
++
++    void (*idct[4])(int16_t *coeffs, int col_limit);
++
++    void (*idct_dc[4])(int16_t *coeffs);
++
++    void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                          int16_t *sao_offset_val, int sao_left_class, int width, int height);
++    void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                            const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                            const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                            int width, int height);
++
++    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
++    void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                                          int16_t *sao_offset_val, int sao_eo_class, int width, int height);
++    void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                                            const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
++
++    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
++    void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                  struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                  uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
++
++    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
++                                    int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
++                                        int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
++
++    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int16_t *src2,
++                                         int height, int denom, int wx0, int wx1,
++                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
++                                    int height, intptr_t mx, intptr_t my, int width);
++
++    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int16_t *src2,
++                                         int height, int denom, int wx0, int ox0, int wx1, // NOTE(review): names are ordered differently from put_hevc_qpel_bi_w (wx0, wx1, ox0, ox1) - confirm
++                                         int ox1, intptr_t mx, intptr_t my, int width);
++
++    void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++                                    int beta, int32_t *tc,
++                                    uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++                                    int beta, int32_t *tc,
++                                    uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
++                                      int beta, int32_t *tc,
++                                      uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
++                                      int beta, int32_t *tc,
++                                      uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
++                                        int32_t *tc, uint8_t *no_p,
++                                        uint8_t *no_q);
++    void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
++                                        int32_t *tc, uint8_t *no_p,
++                                        uint8_t *no_q);
++    void (*hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
++                                     unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
++    void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++                                     unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++                                     uint8_t * _pix_l);
++    void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
++                                  unsigned int no_f);
++    void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++                                   uint8_t * src_l,
++                                   unsigned int no_f);
++
++    uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
++                                                   const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++                                                   int in_inc0, int in_inc1);
++
++    void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height);
++} HEVCDSPContext;
++
++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth); // fills every pointer in *hpc; depths other than 9, 10, 12 select the 8-bit set
++// Filter coefficient tables, defined alongside ff_hevc_rpi_dsp_init()
++extern const int8_t ff_hevc_rpi_epel_filters[7][4];
++extern const int8_t ff_hevc_rpi_qpel_filters[3][16];
++// Per-architecture hooks, called by ff_hevc_rpi_dsp_init() when the matching ARCH_* macro is non-zero
++void ff_hevc_rpi_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
++void ff_hevc_rpi_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
++void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth);
++void ff_hevc_rpi_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
++#endif /* AVCODEC_RPI_HEVCDSP_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp_template.c
+@@ -0,0 +1,2279 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "get_bits.h"
++#include "rpi_hevcdec.h"
++
++#include "bit_depth_template.c"
++#include "rpi_hevcdsp.h"
++
++#include "rpi_hevc_shader_template.h"
++
++static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                          GetBitContext *gb, int pcm_bit_depth)
++{
++    // Read width * height samples of pcm_bit_depth bits from gb, scaled up to BIT_DEPTH
++    const int up_shift = BIT_DEPTH - pcm_bit_depth;
++    pixel *row = (pixel *)_dst;
++    int col, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < height; line++, row += stride)
++        for (col = 0; col < width; col++)
++            row[col] = get_bits(gb, pcm_bit_depth) << up_shift;
++}
++
++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                            GetBitContext *gb, int pcm_bit_depth)
++{
++    // Interleaved destination: a full width * height plane is read for
++    // component 0 and then another for component 1; component c is written
++    // to every second pixel starting at offset c.
++    const int up_shift = BIT_DEPTH - pcm_bit_depth;
++    int c;
++    int x, y;
++
++    stride /= sizeof(pixel);
++
++    for (c = 0; c < 2; c++) {
++        pixel *row = (pixel *)_dst + c;
++
++        for (y = 0; y < height; y++, row += stride) {
++            for (x = 0; x < width; x++)
++                row[x * 2] = get_bits(gb, pcm_bit_depth) << up_shift;
++        }
++    }
++}
++
++static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
++                                                ptrdiff_t stride, int size)
++{
++    // Add a size x size block of residuals to the picture, clipping each result
++    pixel *row = (pixel *)_dst;
++    int col, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++, row += stride) {
++        // res is read linearly: size values for every picture row
++        for (col = 0; col < size; col++)
++            row[col] = av_clip_pixel(row[col] + res[col]);
++        res += size;
++    }
++}
++
++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
++{
++    // Add the same value dc to every pixel of a size x size block, with clipping
++    pixel *row = (pixel *)_dst;
++    int col, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++) {
++        for (col = 0; col < size; col++)
++            row[col] = av_clip_pixel(row[col] + dc);
++        row += stride;
++    }
++}
++
++
++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
++                                                  ptrdiff_t stride, const int dc_v, int size)
++{
++    // Interleaved pairs: the even pixel of each pair takes a residual, the odd pixel takes dc_v
++    pixel *row = (pixel *)_dst;
++    int i, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++, row += stride, res += size) {
++        for (i = 0; i < size; i++) {
++            pixel *const pair = row + i * 2;
++            pair[0] = av_clip_pixel(pair[0] + res[i]);
++            pair[1] = av_clip_pixel(pair[1] + dc_v);
++        }
++    }
++}
++
++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
++                                                  ptrdiff_t stride, const int dc_u, int size)
++{
++    // Interleaved pairs: the even pixel of each pair takes dc_u, the odd pixel takes a residual
++    pixel *row = (pixel *)_dst;
++    int i, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++, row += stride, res += size) {
++        for (i = 0; i < size; i++) {
++            pixel *const pair = row + i * 2;
++            pair[0] = av_clip_pixel(pair[0] + dc_u);
++            pair[1] = av_clip_pixel(pair[1] + res[i]);
++        }
++    }
++}
++
++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
++                                                  ptrdiff_t stride, unsigned int size)
++{
++    // res holds size * size values for the even pixels, then size * size for the odd pixels
++    const int16_t *res_u = res;
++    const int16_t *res_v = res + size * size;
++    pixel *row = (pixel *)_dst;
++    unsigned int i, line;
++
++//    rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
++//    rpi_sand_dump16("ARC In RU", res_u, size * 2, 0, 0, 0, size, size, 0);
++//    rpi_sand_dump16("ARC In RV", res_v, size * 2, 0, 0, 0, size, size, 0);
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++, row += stride) {
++        for (i = 0; i < size; i++, res_u++, res_v++) {
++            row[i * 2 + 0] = av_clip_pixel(row[i * 2 + 0] + *res_u);
++            row[i * 2 + 1] = av_clip_pixel(row[i * 2 + 1] + *res_v);
++        }
++    }
++
++//    rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
++}
++
++// Add packed DC offsets to an interleaved block: low 16 bits of dc go to even pixels, high 16 bits to odd pixels.
++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
++{
++    int x, y;
++    pixel *dst = (pixel *)_dst;
++    const int dc_v = dc >> 16;     // upper half, sign carried by the shift
++    const int dc_u = (int16_t)dc;  // lower half, sign-extended by the cast; avoids left-shifting a signed value
++
++    stride /= sizeof(pixel);
++
++    for (y = 0; y < size; y++) {
++        for (x = 0; x < size * 2; x += 2) {
++            dst[x] = av_clip_pixel(dst[x] + dc_u);
++            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
++        }
++        dst += stride;
++    }
++}
++
++// Fixed-size wrappers (4, 8, 16, 32) around add_residual / add_residual_dc; HEVC_DSP() installs them in add_residual[] / add_residual_dc[].
++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 32);
++}
++
++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 32);
++}
++
++// -- U -- (plaited): residuals go to U, the scalar DC goes to V (dc_v, as in add_residual_u)
++
++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_v, 4);
++}
++
++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_v, 8);
++}
++
++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_v, 16);
++}
++
++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_v)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++// -- V -- (plaited): residuals go to V, the scalar DC goes to U (dc_u, as in add_residual_v)
++
++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_u, 4);
++}
++
++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_u, 8);
++}
++
++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_u, 16);
++}
++
++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_u)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++// -- C -- (plaited - both U & V)
++
++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual_c)(_dst, res, stride, 4);
++}
++
++// 8x8 entry point for the combined U & V residual add: forwards to add_residual_c with size 8.
++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual_c)(_dst, res, stride, 8);
++}
++
++// 16x16 entry point for the combined U & V residual add: forwards to add_residual_c with size 16.
++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual_c)(_dst, res, stride, 16);
++}
++
++// 32x32 combined U & V residual add placeholder: no work is done, the
++// assertion below fails unconditionally if this is ever called.
++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++// 4x4 DC-only chroma residual add: forwards to add_residual_dc_c with size 4.
++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
++}
++
++// 8x8 DC-only chroma residual add: forwards to add_residual_dc_c with size 8.
++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
++}
++
++// 16x16 DC-only chroma residual add: forwards to add_residual_dc_c with size 16.
++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
++}
++
++// 32x32 DC-only chroma residual add placeholder: no work is done, the
++// assertion below fails unconditionally if this is ever called.
++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++
++// In-place running sum over a (1 << log2_size) square block of coefficients.
++//   mode != 0: every row from the second onwards has the row above added to it.
++//   mode == 0: within each row, every element from the second onwards has its
++//              left neighbour added to it.
++static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
++{
++    const int size = 1 << log2_size;
++    int16_t *row = _coeffs;
++    int r, c;
++
++    if (!mode) {
++        // Accumulate left-to-right inside each row.
++        for (r = 0; r < size; r++, row += size)
++            for (c = 1; c < size; c++)
++                row[c] += row[c - 1];
++        return;
++    }
++
++    // Accumulate top-to-bottom; the first row is left untouched.
++    for (r = 1; r < size; r++) {
++        row += size;
++        for (c = 0; c < size; c++)
++            row[c] += row[c - size];
++    }
++}
++
++// Rescale a (1 << log2_size) square block of coefficients in place.
++//   coeffs    - size * size contiguous int16_t values, overwritten
++//   log2_size - log2 of the block edge length
++// The shift amount is 15 - BIT_DEPTH - log2_size: when positive the values
++// are rounded and shifted right, otherwise they are shifted left.
++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
++{
++    const int shift = 15 - BIT_DEPTH - log2_size;
++    const int size  = 1 << log2_size;
++    const int count = size * size;
++    int i;
++
++    if (shift > 0) {
++        const int offset = 1 << (shift - 1);
++        for (i = 0; i < count; i++)
++            coeffs[i] = (coeffs[i] + offset) >> shift;
++    } else {
++        // Shift the bit pattern as unsigned: left-shifting a negative int is
++        // undefined behaviour in C. The low 16 bits stored back are the same.
++        for (i = 0; i < count; i++)
++            coeffs[i] = (uint16_t)coeffs[i] << -shift;
++    }
++}
++
++// Assignment helpers passed as the `assign` argument of the transform macros.
++// SET stores x unchanged.
++#define SET(dst, x) (dst) = (x)
++// SCALE adds `add`, shifts right by `shift` and clips with av_clip_int16.
++// `add` and `shift` are NOT parameters: they are taken from the scope in
++// which the macro is expanded.
++#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
++
++// One 4-point pass of the 4x4 luma transform. Reads src[0], src[step],
++// src[2 * step], src[3 * step] and writes the four results to the same
++// offsets of dst through `assign`. All inputs are read into c0..c3 or used in
++// the first assign before any later dst element is written, except that
++// dst[2 * step] is written first (relevant when dst and src alias).
++#define TR_4x4_LUMA(dst, src, step, assign) \
++ do { \
++ int c0 = src[0 * step] + src[2 * step]; \
++ int c1 = src[2 * step] + src[3 * step]; \
++ int c2 = src[0 * step] - src[3 * step]; \
++ int c3 = 74 * src[1 * step]; \
++ \
++ assign(dst[2 * step], 74 * (src[0 * step] - \
++ src[2 * step] + \
++ src[3 * step])); \
++ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \
++ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \
++ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \
++ } while (0)
++
++// In-place two-pass 4x4 luma transform over 16 contiguous coefficients.
++// `shift` and `add` must keep these names: the SCALE macro reads them.
++static void FUNC(transform_4x4_luma)(int16_t *coeffs)
++{
++ int i;
++ int shift = 7;
++ int add = 1 << (shift - 1);
++ int16_t *src = coeffs;
++
++ // First pass: four passes over elements 4 apart, one per starting offset 0..3.
++ for (i = 0; i < 4; i++) {
++ TR_4x4_LUMA(src, src, 4, SCALE);
++ src++;
++ }
++
++ // Second pass: four passes over adjacent elements, with a bit-depth dependent shift.
++ shift = 20 - BIT_DEPTH;
++ add = 1 << (shift - 1);
++ for (i = 0; i < 4; i++) {
++ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
++ coeffs += 4;
++ }
++}
++
++#undef TR_4x4_LUMA
++
++// 4-point transform butterfly. Reads four inputs spaced sstep apart, forms
++// even (e0, e1) and odd (o0, o1) parts, and writes four outputs spaced dstep
++// apart through `assign`. The `end` argument is accepted for signature parity
++// with the larger TR_* macros but is not used in the expansion.
++#define TR_4(dst, src, dstep, sstep, assign, end) \
++ do { \
++ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
++ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
++ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
++ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
++ \
++ assign(dst[0 * dstep], e0 + o0); \
++ assign(dst[1 * dstep], e1 + o1); \
++ assign(dst[2 * dstep], e1 - o1); \
++ assign(dst[3 * dstep], e0 - o0); \
++ } while (0)
++
++// 8-point transform. The odd part o_8 accumulates the odd-indexed inputs
++// (j = 1, 3, ... below `end`) weighted by transform[4 * j][i]; the even part
++// e_8 is a TR_4 over the even-indexed inputs (source step doubled). Outputs
++// i and 7 - i are e + o and e - o. Declares its own i and j, which shadow any
++// variables of the same name at the expansion site.
++#define TR_8(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_8[4]; \
++ int o_8[4] = { 0 }; \
++ for (i = 0; i < 4; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_8[i] += transform[4 * j][i] * src[j * sstep]; \
++ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \
++ \
++ for (i = 0; i < 4; i++) { \
++ assign(dst[i * dstep], e_8[i] + o_8[i]); \
++ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \
++ } \
++ } while (0)
++
++// 16-point transform, built like TR_8: the odd part o_16 uses
++// transform[2 * j][i] over odd j below `end`, the even part e_16 is a TR_8
++// over the even-indexed inputs (invoked with a fixed end of 8). Outputs i and
++// 15 - i are e + o and e - o.
++#define TR_16(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_16[8]; \
++ int o_16[8] = { 0 }; \
++ for (i = 0; i < 8; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_16[i] += transform[2 * j][i] * src[j * sstep]; \
++ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \
++ \
++ for (i = 0; i < 8; i++) { \
++ assign(dst[i * dstep], e_16[i] + o_16[i]); \
++ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \
++ } \
++ } while (0)
++
++// 32-point transform: the odd part o_32 uses transform[j][i] over odd j below
++// `end`, the even part e_32 is a TR_16 over the even-indexed inputs with the
++// limit halved (end / 2). Outputs i and 31 - i are e + o and e - o.
++#define TR_32(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_32[16]; \
++ int o_32[16] = { 0 }; \
++ for (i = 0; i < 16; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_32[i] += transform[j][i] * src[j * sstep]; \
++ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \
++ \
++ for (i = 0; i < 16; i++) { \
++ assign(dst[i * dstep], e_32[i] + o_32[i]); \
++ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \
++ } \
++ } while (0)
++
++// Per-size local declarations spliced into IDCT(H). They read `col_limit`
++// from the enclosing function. The 4-point variant declares only limit2
++// (TR_4 never expands its `end` argument, so `limit` is not needed there);
++// the larger sizes declare both limit and limit2, each capped at H.
++#define IDCT_VAR4(H) \
++ int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR8(H) \
++ int limit = FFMIN(col_limit, H); \
++ int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR16(H) IDCT_VAR8(H)
++#define IDCT_VAR32(H) IDCT_VAR8(H)
++
++// Generates idct_HxH(coeffs, col_limit): an in-place two-pass transform of an
++// H x H block. The first pass runs TR_H once per starting offset 0..H-1 over
++// elements H apart, with shift 7 and limit2 as the input bound; limit2 is
++// reduced by 4 at i = 4, 8, ... while it is below H. The second pass runs
++// TR_H over each run of H adjacent elements with shift 20 - BIT_DEPTH and
++// `limit` as the bound. `shift` and `add` are read by the SCALE macro.
++#define IDCT(H) \
++static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \
++ int col_limit) \
++{ \
++ int i; \
++ int shift = 7; \
++ int add = 1 << (shift - 1); \
++ int16_t *src = coeffs; \
++ IDCT_VAR ## H(H); \
++ \
++ for (i = 0; i < H; i++) { \
++ TR_ ## H(src, src, H, H, SCALE, limit2); \
++ if (limit2 < H && i%4 == 0 && !!i) \
++ limit2 -= 4; \
++ src++; \
++ } \
++ \
++ shift = 20 - BIT_DEPTH; \
++ add = 1 << (shift - 1); \
++ for (i = 0; i < H; i++) { \
++ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \
++ coeffs += H; \
++ } \
++}
++
++// Generates idct_HxH_dc(coeffs): computes a single value from coeffs[0]
++// (halved with rounding, then rounded and shifted by 14 - BIT_DEPTH) and
++// stores it into all H * H entries of the block.
++#define IDCT_DC(H) \
++static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \
++{ \
++ int i, j; \
++ int shift = 14 - BIT_DEPTH; \
++ int add = 1 << (shift - 1); \
++ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \
++ \
++ for (j = 0; j < H; j++) { \
++ for (i = 0; i < H; i++) { \
++ coeffs[i + j * H] = coeff; \
++ } \
++ } \
++}
++
++// Instantiate the full and DC-only inverse transforms for block sizes
++// 4, 8, 16 and 32.
++IDCT( 4)
++IDCT( 8)
++IDCT(16)
++IDCT(32)
++
++IDCT_DC( 4)
++IDCT_DC( 8)
++IDCT_DC(16)
++IDCT_DC(32)
++
++// The transform helper macros are not needed past this point.
++#undef TR_4
++#undef TR_8
++#undef TR_16
++#undef TR_32
++
++#undef SET
++#undef SCALE
++
++// SAO band offset filter.
++// Each source sample selects one of 32 bands from its top five bits
++// (sample >> (BIT_DEPTH - 5)); the band's offset is added and the result is
++// clipped to the pixel range. Only the four bands starting at sao_left_class
++// (wrapping modulo 32) get a non-zero offset, taken from sao_offset_val[1..4].
++// Both strides are given in bytes.
++static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  int16_t *sao_offset_val, int sao_left_class,
++                                  int width, int height)
++{
++    const int band_shift = BIT_DEPTH - 5;
++    const ptrdiff_t dst_step = stride_dst / sizeof(pixel);
++    const ptrdiff_t src_step = stride_src / sizeof(pixel);
++    pixel *out = (pixel *)_dst;
++    pixel *in  = (pixel *)_src;
++    int band_offset[32] = { 0 };
++    int band, row, col;
++
++    for (band = 1; band <= 4; band++)
++        band_offset[(band - 1 + sao_left_class) & 31] = sao_offset_val[band];
++
++    for (row = 0; row < height; row++, out += dst_step, in += src_step) {
++        for (col = 0; col < width; col++)
++            out[col] = av_clip_pixel(in[col] + band_offset[in[col] >> band_shift]);
++    }
++}
++
++#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
++
++// SAO edge offset filter.
++// Every sample is compared (three-way, via CMP) with its two neighbours along
++// the direction selected by `eo`; the two signs pick an entry of
++// sao_offset_val through edge_idx, which is added to the sample and clipped.
++// The source row pitch is fixed at RPI_HEVC_SAO_BUF_STRIDE bytes; stride_dst
++// is in bytes.
++static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
++                                  int eo, int width, int height) {
++
++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++    // {dx, dy} of the two neighbours for each edge class.
++    static const int8_t pos[4][2][2] = {
++        { { -1,  0 }, {  1, 0 } }, // horizontal
++        { {  0, -1 }, {  0, 1 } }, // vertical
++        { { -1, -1 }, {  1, 1 } }, // 45 degree
++        { {  1, -1 }, { -1, 1 } }, // 135 degree
++    };
++    const ptrdiff_t src_step = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
++    const ptrdiff_t dst_step = stride_dst / sizeof(pixel);
++    // Neighbour positions expressed as element offsets from the current sample.
++    const int nb_a = pos[eo][0][0] + pos[eo][0][1] * src_step;
++    const int nb_b = pos[eo][1][0] + pos[eo][1][1] * src_step;
++    pixel *out = (pixel *)_dst;
++    pixel *in  = (pixel *)_src;
++    int row, col;
++
++    for (row = 0; row < height; row++, in += src_step, out += dst_step) {
++        for (col = 0; col < width; col++) {
++            const int sign_a = CMP(in[col], in[col + nb_a]);
++            const int sign_b = CMP(in[col], in[col + nb_b]);
++            const int sel    = edge_idx[2 + sign_a + sign_b];
++            out[col] = av_clip_pixel(in[col] + sao_offset_val[sel]);
++        }
++    }
++}
++
++
++// When this template is expanded for bit depth 10, temporarily switch `pixel`
++// to uint32_t and BIT_DEPTH to 32 so that the restore functions that follow
++// are also generated for 32-bit elements.
++// NOTE(review): presumably FUNC() appends BIT_DEPTH to the name, giving
++// sao_edge_restore_{0,1}_32 - confirm against the FUNC definition.
++#if BIT_DEPTH == 10
++// We need a 32 bit variation for the _c restores so hijack bit depth 10
++#undef pixel
++#undef BIT_DEPTH
++#define pixel uint32_t
++#define BIT_DEPTH 32
++// All 16 bit variations are the same
++// (names for depths 10..16 are aliased to the depth-9 functions)
++#define sao_edge_restore_0_10 sao_edge_restore_0_9
++#define sao_edge_restore_1_10 sao_edge_restore_1_9
++#define sao_edge_restore_0_11 sao_edge_restore_0_9
++#define sao_edge_restore_1_11 sao_edge_restore_1_9
++#define sao_edge_restore_0_12 sao_edge_restore_0_9
++#define sao_edge_restore_1_12 sao_edge_restore_1_9
++#define sao_edge_restore_0_13 sao_edge_restore_0_9
++#define sao_edge_restore_1_13 sao_edge_restore_1_9
++#define sao_edge_restore_0_14 sao_edge_restore_0_9
++#define sao_edge_restore_1_14 sao_edge_restore_1_9
++#define sao_edge_restore_0_15 sao_edge_restore_0_9
++#define sao_edge_restore_1_15 sao_edge_restore_1_9
++#define sao_edge_restore_0_16 sao_edge_restore_0_9
++#define sao_edge_restore_1_16 sao_edge_restore_1_9
++#endif
++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
++// Copies unfiltered source samples back over the destination along the block
++// borders flagged in `borders` (index 0: first column, 1: first row,
++// 2: last column, 3: last row). Column borders are skipped for the vertical
++// edge class and row borders for the horizontal edge class.
++// Strides are in bytes. vert_edge, horiz_edge and diag_edge are not read by
++// this variant.
++static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
++ int *borders, int _width, int _height,
++ int c_idx, uint8_t *vert_edge,
++ uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++ int x, y;
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int sao_eo_class = sao->eo_class[c_idx];
++ int init_x = 0, width = _width, height = _height;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ if (sao_eo_class != SAO_EO_VERT) {
++ if (borders[0]) {
++ for (y = 0; y < height; y++) {
++ dst[y * stride_dst] = src[y * stride_src];
++ }
++ // First column already restored: row copies below start at x = 1.
++ init_x = 1;
++ }
++ if (borders[2]) {
++ int offset = width - 1;
++ // x is used as the row index in this loop.
++ for (x = 0; x < height; x++) {
++ dst[x * stride_dst + offset] = src[x * stride_src + offset];
++ }
++ // Last column already restored: row copies below stop one short.
++ width--;
++ }
++ }
++ if (sao_eo_class != SAO_EO_HORIZ) {
++ if (borders[1]) {
++ for (x = init_x; x < width; x++)
++ dst[x] = src[x];
++ }
++ if (borders[3]) {
++ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++ ptrdiff_t y_stride_src = stride_src * (height - 1);
++ for (x = init_x; x < width; x++)
++ dst[x + y_stride_dst] = src[x + y_stride_src];
++ // Nothing reads height after this point in this variant.
++ height--;
++ }
++ }
++}
++
++// Like sao_edge_restore_0, then additionally restores samples along the
++// edges flagged in vert_edge[0..1] (first/last column), horiz_edge[0..1]
++// (first/last row) and diag_edge[0..3] (the four corners: top-left,
++// top-right, bottom-right, bottom-left) from the unfiltered source.
++// Strides are in bytes.
++static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
++ int *borders, int _width, int _height,
++ int c_idx, uint8_t *vert_edge,
++ uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++ int x, y;
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int sao_eo_class = sao->eo_class[c_idx];
++ int init_x = 0, init_y = 0, width = _width, height = _height;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ // Border restore; each restored border shrinks the region handled below
++ // (init_x / init_y move in, width / height move down).
++ if (sao_eo_class != SAO_EO_VERT) {
++ if (borders[0]) {
++ for (y = 0; y < height; y++) {
++ dst[y * stride_dst] = src[y * stride_src];
++ }
++ init_x = 1;
++ }
++ if (borders[2]) {
++ int offset = width - 1;
++ // x is used as the row index in this loop.
++ for (x = 0; x < height; x++) {
++ dst[x * stride_dst + offset] = src[x * stride_src + offset];
++ }
++ width--;
++ }
++ }
++ if (sao_eo_class != SAO_EO_HORIZ) {
++ if (borders[1]) {
++ for (x = init_x; x < width; x++)
++ dst[x] = src[x];
++ init_y = 1;
++ }
++ if (borders[3]) {
++ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++ ptrdiff_t y_stride_src = stride_src * (height - 1);
++ for (x = init_x; x < width; x++)
++ dst[x + y_stride_dst] = src[x + y_stride_src];
++ height--;
++ }
++ }
++
++ {
++ // A save_* flag is 1 when that corner must be left out of the edge
++ // restores below: its diag_edge flag is clear, the edge class is the
++ // diagonal through that corner, and neither adjacent border is set.
++ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
++ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2];
++ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
++ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3];
++
++ // Restore pixels that can't be modified
++ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
++ for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
++ dst[y*stride_dst] = src[y*stride_src];
++ }
++ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
++ for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
++ dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
++ }
++
++ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
++ for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
++ dst[x] = src[x];
++ }
++ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
++ for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
++ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
++ }
++ // Single corner samples, restored only for the matching diagonal class.
++ if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
++ dst[0] = src[0];
++ if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
++ dst[width-1] = src[width-1];
++ if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
++ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
++ if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
++ dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
++
++ }
++}
++#endif
++// Undo the temporary 32-bit override: put BIT_DEPTH back to 10 and `pixel`
++// back to uint16_t for the rest of the template.
++#if BIT_DEPTH == 32
++#undef BIT_DEPTH
++#undef pixel
++#define BIT_DEPTH 10
++#define pixel uint16_t
++#endif
++
++// --- Plaited chroma versions
++
++// SAO band offset filter for interleaved chroma: even elements of a row use
++// the U offsets, odd elements use the V offsets. `width` counts U/V pairs, so
++// 2 * width elements are processed per row. Strides are in bytes.
++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                    const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                    const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                    int width, int height)
++{
++    const int band_shift = BIT_DEPTH - 5;
++    const ptrdiff_t dst_step = stride_dst / sizeof(pixel);
++    const ptrdiff_t src_step = stride_src / sizeof(pixel);
++    const int n_elems = width * 2;
++    pixel *out = (pixel *)_dst;
++    pixel *in  = (pixel *)_src;
++    int band_u[32] = { 0 };
++    int band_v[32] = { 0 };
++    int band, row, col;
++
++    // Four consecutive bands (modulo 32) per component get an offset.
++    for (band = 1; band <= 4; band++) {
++        band_u[(band - 1 + sao_left_class_u) & 31] = sao_offset_val_u[band];
++        band_v[(band - 1 + sao_left_class_v) & 31] = sao_offset_val_v[band];
++    }
++
++    for (row = 0; row < height; row++, out += dst_step, in += src_step) {
++        for (col = 0; col < n_elems; col += 2) {
++            // *** & 31 shouldn't be wanted but just now we generate broken input that
++            // crashes us in 10-bit world
++            out[col]     = av_clip_pixel(in[col]     + band_u[(in[col]     >> band_shift) & 31]);
++            out[col + 1] = av_clip_pixel(in[col + 1] + band_v[(in[col + 1] >> band_shift) & 31]);
++        }
++    }
++}
++
++// SAO edge offset filter for interleaved chroma: even elements are U, odd
++// elements are V, each with its own offset table. `width` counts U/V pairs;
++// horizontal neighbours are therefore two elements away. The source row
++// pitch is fixed at RPI_HEVC_SAO_BUF_STRIDE bytes; stride_dst is in bytes.
++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                    const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
++                                    int eo, int width, int height) {
++
++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++    // {dx, dy} of the two neighbours for each edge class.
++    static const int8_t pos[4][2][2] = {
++        { { -1,  0 }, {  1, 0 } }, // horizontal
++        { {  0, -1 }, {  0, 1 } }, // vertical
++        { { -1, -1 }, {  1, 1 } }, // 45 degree
++        { {  1, -1 }, { -1, 1 } }, // 135 degree
++    };
++    const ptrdiff_t src_step = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
++    const ptrdiff_t dst_step = stride_dst / sizeof(pixel);
++    const int n_elems = width * 2;
++    pixel *out = (pixel *)_dst;
++    pixel *in  = (pixel *)_src;
++    int nb_a, nb_b;
++    int row, col;
++
++    av_assert0(n_elems <= 64);
++
++    // Neighbour offsets in elements; dx is doubled because of the interleave.
++    nb_a = pos[eo][0][0] * 2 + pos[eo][0][1] * src_step;
++    nb_b = pos[eo][1][0] * 2 + pos[eo][1][1] * src_step;
++
++    for (row = 0; row < height; row++, in += src_step, out += dst_step) {
++        for (col = 0; col < n_elems; col += 2) {
++            const int u_a = CMP(in[col], in[col + nb_a]);
++            const int u_b = CMP(in[col], in[col + nb_b]);
++            const int sel_u = edge_idx[2 + u_a + u_b];
++            const int v_a = CMP(in[col + 1], in[col + 1 + nb_a]);
++            const int v_b = CMP(in[col + 1], in[col + 1 + nb_b]);
++            const int sel_v = edge_idx[2 + v_a + v_b];
++
++            out[col]     = av_clip_pixel(in[col]     + sao_offset_val_u[sel_u]);
++            out[col + 1] = av_clip_pixel(in[col + 1] + sao_offset_val_v[sel_v]);
++        }
++    }
++}
++
++// Do once
++// Name aliases for the chroma (_c) edge-restore functions. They are defined
++// only while the template is expanded for bit depth 8, so they appear once.
++// The 8-bit chroma names map to the 16-bit restore, the 9..16-bit chroma
++// names map to the 32-bit restore.
++// NOTE(review): presumably because an interleaved U/V pair is twice the
++// width of one sample - confirm against the callers.
++#if BIT_DEPTH == 8
++// Any old 2 byte 'normal' restore will work for these
++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16
++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16
++// We need 32 bit for 9 bit+
++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
++#endif
++
++// CMP is only used by the SAO edge filters above.
++#undef CMP
++
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++// Full-sample copy into the 16-bit intermediate buffer: every source sample
++// is scaled up by (14 - BIT_DEPTH) bits. Destination rows are MAX_PB_SIZE
++// elements apart; _srcstride is in bytes. mx and my are unused.
++static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
++                                      uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step = _srcstride / sizeof(pixel);
++    pixel *in = (pixel *)_src;
++    int16_t *out = dst;
++    int row, col;
++
++    for (row = 0; row < height; row++, in += in_step, out += MAX_PB_SIZE) {
++        for (col = 0; col < width; col++)
++            out[col] = in[col] << (14 - BIT_DEPTH);
++    }
++}
++
++// Full-sample uni-prediction: plain row-by-row copy of `width` pixels for
++// `height` rows. Both strides are in bytes. mx and my are unused.
++static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                          int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int rows_left;
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        memcpy(out, in, width * sizeof(pixel));
++        in  += in_step;
++        out += out_step;
++    }
++}
++
++// Full-sample bi-prediction: the source sample, scaled up by
++// (14 - BIT_DEPTH) bits, is added to the matching value of the 16-bit
++// intermediate buffer src2 (rows MAX_PB_SIZE apart); the sum is rounded,
++// shifted down by (15 - BIT_DEPTH) and clipped. Strides are in bytes.
++static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int16_t *src2,
++                                         int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int down = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int rnd = 1 << (down - 1);
++#else
++    const int rnd = 0;
++#endif
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int row, col;
++
++    for (row = 0; row < height; row++) {
++        for (col = 0; col < width; col++)
++            out[col] = av_clip_pixel(((in[col] << (14 - BIT_DEPTH)) + src2[col] + rnd) >> down);
++        in   += in_step;
++        out  += out_step;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Full-sample weighted uni-prediction: the source sample, scaled up by
++// (14 - BIT_DEPTH) bits, is multiplied by wx, rounded and shifted down by
++// (denom + 14 - BIT_DEPTH); the offset ox, scaled to the bit depth, is then
++// added and the result clipped. Strides are in bytes.
++static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int down = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int rnd = 1 << (down - 1);
++#else
++    const int rnd = 0;
++#endif
++    const int ox_scaled = ox * (1 << (BIT_DEPTH - 8));
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int row, col;
++
++    for (row = 0; row < height; row++, in += in_step, out += out_step) {
++        for (col = 0; col < width; col++)
++            out[col] = av_clip_pixel((((in[col] << (14 - BIT_DEPTH)) * wx + rnd) >> down) + ox_scaled);
++    }
++}
++
++// Full-sample weighted bi-prediction: combines the scaled-up source sample
++// (weight wx1) with the 16-bit intermediate value from src2 (weight wx0),
++// adds the rounding/offset term built from ox0 and ox1, shifts down by
++// (log2Wd + 1) and clips. src2 rows are MAX_PB_SIZE apart; strides in bytes.
++static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                           int16_t *src2,
++                                           int height, int denom, int wx0, int wx1,
++                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int shift  = 14 + 1 - BIT_DEPTH;
++    const int log2Wd = denom + shift - 1;
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int row, col;
++
++    // Bring both offsets to the working bit depth.
++    ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++
++    for (row = 0; row < height; row++) {
++        for (col = 0; col < width; col++)
++            out[col] = av_clip_pixel(( (in[col] << (14 - BIT_DEPTH)) * wx1 + src2[col] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
++        in   += in_step;
++        out  += out_step;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++// 8-tap filter sum centred on src[x]: taps 0..7 are applied to the samples
++// from 3 * stride before to 4 * stride after the current position.
++// `filter` and `x` are NOT parameters: they are taken from the scope in which
++// the macro is expanded.
++#define QPEL_FILTER(src, stride) \
++ (filter[0] * src[x - 3 * stride] + \
++ filter[1] * src[x - 2 * stride] + \
++ filter[2] * src[x - stride] + \
++ filter[3] * src[x ] + \
++ filter[4] * src[x + stride] + \
++ filter[5] * src[x + 2 * stride] + \
++ filter[6] * src[x + 3 * stride] + \
++ filter[7] * src[x + 4 * stride])
++
++// Horizontal 8-tap luma interpolation into the 16-bit intermediate buffer.
++// The filter is selected by mx - 1; results are shifted down by
++// (BIT_DEPTH - 8). Destination rows are MAX_PB_SIZE apart; _srcstride is in
++// bytes. my is unused.
++static void FUNC(put_hevc_qpel_h)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    // `filter` and `x` must keep these names: QPEL_FILTER refers to them.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    const ptrdiff_t in_step = _srcstride / sizeof(pixel);
++    pixel *in = (pixel *)_src;
++    int16_t *out = dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, in += in_step, out += MAX_PB_SIZE) {
++        for (x = 0; x < width; x++)
++            out[x] = QPEL_FILTER(in, 1) >> (BIT_DEPTH - 8);
++    }
++}
++
++// Vertical 8-tap luma interpolation into the 16-bit intermediate buffer.
++// The filter is selected by my - 1; taps are one source row apart and
++// results are shifted down by (BIT_DEPTH - 8). Destination rows are
++// MAX_PB_SIZE apart; _srcstride is in bytes. mx is unused.
++static void FUNC(put_hevc_qpel_v)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    // `filter` and `x` must keep these names: QPEL_FILTER refers to them.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++    const ptrdiff_t in_step = _srcstride / sizeof(pixel);
++    pixel *in = (pixel *)_src;
++    int16_t *out = dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, in += in_step, out += MAX_PB_SIZE) {
++        for (x = 0; x < width; x++)
++            out[x] = QPEL_FILTER(in, in_step) >> (BIT_DEPTH - 8);
++    }
++}
++
++// Separable horizontal + vertical 8-tap luma interpolation into the 16-bit
++// intermediate buffer dst (rows MAX_PB_SIZE apart). _srcstride is in bytes.
++// `filter` and `x` are read by the QPEL_FILTER macro.
++static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
++ uint8_t *_src,
++ ptrdiff_t _srcstride,
++ int height, intptr_t mx,
++ intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ // Scratch rows for the horizontal pass, MAX_PB_SIZE elements per row.
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++
++ // Horizontal pass over height + QPEL_EXTRA rows, starting
++ // QPEL_EXTRA_BEFORE rows above the block so the vertical taps have data.
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ // Vertical pass over the scratch rows, starting at the block's first row.
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
++ tmp += MAX_PB_SIZE;
++ dst += MAX_PB_SIZE;
++ }
++}
++
++// Horizontal 8-tap luma interpolation written straight to pixels
++// (uni-prediction): the filtered value is rounded, shifted down by
++// (14 - BIT_DEPTH) and clipped. The filter is selected by mx - 1; strides
++// are in bytes. my is unused.
++static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
++                                      uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    // `filter` and `x` must keep these names: QPEL_FILTER refers to them.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int down = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int rnd = 1 << (down - 1);
++#else
++    const int rnd = 0;
++#endif
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, in += in_step, out += out_step) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((QPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) + rnd) >> down);
++    }
++}
++
++// Horizontal 8-tap luma interpolation for bi-prediction: the filtered value
++// is added to the matching 16-bit intermediate value from src2 (rows
++// MAX_PB_SIZE apart), rounded, shifted down by (15 - BIT_DEPTH) and clipped.
++// The filter is selected by mx - 1; strides are in bytes. my is unused.
++static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                     int16_t *src2,
++                                     int height, intptr_t mx, intptr_t my, int width)
++{
++    // `filter` and `x` must keep these names: QPEL_FILTER refers to them.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int down = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int rnd = 1 << (down - 1);
++#else
++    const int rnd = 0;
++#endif
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((QPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) + src2[x] + rnd) >> down);
++        in   += in_step;
++        out  += out_step;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Vertical 8-tap luma interpolation written straight to pixels
++// (uni-prediction): the filtered value is rounded, shifted down by
++// (14 - BIT_DEPTH) and clipped. The filter is selected by my - 1; taps are
++// one source row apart. Strides are in bytes. mx is unused.
++static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride,
++                                      uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    // `filter` and `x` must keep these names: QPEL_FILTER refers to them.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int down = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int rnd = 1 << (down - 1);
++#else
++    const int rnd = 0;
++#endif
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, in += in_step, out += out_step) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((QPEL_FILTER(in, in_step) >> (BIT_DEPTH - 8)) + rnd) >> down);
++    }
++}
++
++
++// Vertical 8-tap luma interpolation for bi-prediction: the filtered value is
++// added to the matching 16-bit intermediate value from src2 (rows
++// MAX_PB_SIZE apart), rounded, shifted down by (15 - BIT_DEPTH) and clipped.
++// The filter is selected by my - 1; strides are in bytes. mx is unused.
++static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                     int16_t *src2,
++                                     int height, intptr_t mx, intptr_t my, int width)
++{
++    // `filter` and `x` must keep these names: QPEL_FILTER refers to them.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int down = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int rnd = 1 << (down - 1);
++#else
++    const int rnd = 0;
++#endif
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((QPEL_FILTER(in, in_step) >> (BIT_DEPTH - 8)) + src2[x] + rnd) >> down);
++        in   += in_step;
++        out  += out_step;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Separable horizontal + vertical 8-tap luma interpolation written straight
++// to pixels (uni-prediction). Strides are in bytes. `filter` and `x` are
++// read by the QPEL_FILTER macro.
++static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ // Scratch rows for the horizontal pass, MAX_PB_SIZE elements per row.
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
++
++ // Horizontal pass over height + QPEL_EXTRA rows, starting
++ // QPEL_EXTRA_BEFORE rows above the block so the vertical taps have data.
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++ // Vertical pass over the scratch rows; round, shift and clip to pixels.
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
++
++// Separable horizontal + vertical 8-tap luma interpolation for
++// bi-prediction: the two-pass result is added to the matching 16-bit
++// intermediate value from src2 (rows MAX_PB_SIZE apart), rounded, shifted and
++// clipped. Strides are in bytes. `filter` and `x` are read by QPEL_FILTER.
++static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ // Scratch rows for the horizontal pass, MAX_PB_SIZE elements per row.
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
++
++ // Horizontal pass over height + QPEL_EXTRA rows, starting
++ // QPEL_EXTRA_BEFORE rows above the block so the vertical taps have data.
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++ // Vertical pass over the scratch rows, combined with src2.
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
++
++// Horizontal 8-tap luma interpolation with weighted uni-prediction: the
++// filtered value is multiplied by wx, rounded and shifted down by
++// (denom + 14 - BIT_DEPTH); the offset ox, scaled to the bit depth, is then
++// added and the result clipped. The filter is selected by mx - 1; strides
++// are in bytes. my is unused.
++static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
++                                        uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, int denom, int wx, int ox,
++                                        intptr_t mx, intptr_t my, int width)
++{
++    // `filter` and `x` must keep these names: QPEL_FILTER refers to them.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int down = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int rnd = 1 << (down - 1);
++#else
++    const int rnd = 0;
++#endif
++    const int ox_scaled = ox * (1 << (BIT_DEPTH - 8));
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, in += in_step, out += out_step) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((QPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) * wx + rnd) >> down) + ox_scaled);
++    }
++}
++
++// Horizontal 8-tap luma interpolation with weighted bi-prediction.
++//   _dst/_dststride - destination pixels, stride in bytes
++//   _src/_srcstride - source pixels, stride in bytes
++//   src2            - 16-bit intermediate prediction, rows MAX_PB_SIZE apart
++//   denom, wx0, wx1 - weight denominator (log2) and the weights applied to
++//                     src2 and to the filtered source respectively
++//   ox0, ox1        - offsets, scaled here to the working bit depth
++//   mx              - selects the filter (index mx - 1); my is unused
++// The weighted sum plus the rounding/offset term is shifted down by
++// (log2Wd + 1) and clipped to the pixel range.
++static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, int denom, int wx0, int wx1,
++                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel *src = (pixel*)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++    // `filter` and `x` are read by the QPEL_FILTER macro.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++
++    int shift = 14 + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            // The offset sum may be negative, so scale it by multiplication:
++            // left-shifting a negative int is undefined behaviour in C.
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        src += srcstride;
++        dst += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Vertical 8-tap luma interpolation with weighted uni-prediction: the
++// filtered value is multiplied by wx, rounded and shifted down by
++// (denom + 14 - BIT_DEPTH); the offset ox, scaled to the bit depth, is then
++// added and the result clipped. The filter is selected by my - 1; strides
++// are in bytes. mx is unused.
++static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
++                                        uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, int denom, int wx, int ox,
++                                        intptr_t mx, intptr_t my, int width)
++{
++    // `filter` and `x` must keep these names: QPEL_FILTER refers to them.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int down = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int rnd = 1 << (down - 1);
++#else
++    const int rnd = 0;
++#endif
++    const int ox_scaled = ox * (1 << (BIT_DEPTH - 8));
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, in += in_step, out += out_step) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((QPEL_FILTER(in, in_step) >> (BIT_DEPTH - 8)) * wx + rnd) >> down) + ox_scaled);
++    }
++}
++
++// Vertical 8-tap luma interpolation with weighted bi-prediction.
++//   _dst/_dststride - destination pixels, stride in bytes
++//   _src/_srcstride - source pixels, stride in bytes
++//   src2            - 16-bit intermediate prediction, rows MAX_PB_SIZE apart
++//   denom, wx0, wx1 - weight denominator (log2) and the weights applied to
++//                     src2 and to the filtered source respectively
++//   ox0, ox1        - offsets, scaled here to the working bit depth
++//   my              - selects the filter (index my - 1); mx is unused
++// The weighted sum plus the rounding/offset term is shifted down by
++// (log2Wd + 1) and clipped to the pixel range.
++static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, int denom, int wx0, int wx1,
++                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel *src = (pixel*)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++    // `filter` and `x` are read by the QPEL_FILTER macro.
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++    int shift = 14 + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            // The offset sum may be negative, so scale it by multiplication:
++            // left-shifting a negative int is undefined behaviour in C.
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        src += srcstride;
++        dst += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Separable horizontal + vertical 8-tap luma interpolation with weighted
++// uni-prediction: the two-pass result is multiplied by wx, rounded, shifted
++// down by (denom + 14 - BIT_DEPTH), offset by the scaled ox and clipped.
++// Strides are in bytes. `filter` and `x` are read by the QPEL_FILTER macro.
++static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox,
++ intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ // Scratch rows for the horizontal pass, MAX_PB_SIZE elements per row.
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
++
++ // Horizontal pass over height + QPEL_EXTRA rows, starting
++ // QPEL_EXTRA_BEFORE rows above the block so the vertical taps have data.
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++ // Vertical pass over the scratch rows, then weight, offset and clip.
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
++
++static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{ // Weighted bi-prediction, two passes: horizontal QPEL into tmp, vertical QPEL over tmp, then combine with src2.
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE entries each
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ src -= QPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical pass has rows before row 0
++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; // horizontal pass uses row mx - 1
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; // rewind to the row matching output row 0
++ filter = ff_hevc_rpi_qpel_filters[my - 1]; // vertical pass uses row my - 1
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); // scale both offsets by 2^(BIT_DEPTH - 8)
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); // multiply, not <<: ox0 + ox1 + 1 may be negative
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE entries apart
++ }
++}
++
++////////////////////////////////////////////////////////////////////////////////
++// EPEL_FILTER: 4-tap sum over src[x - stride] .. src[x + 2 * stride]; reads `filter` and `x` from the caller's scope
++////////////////////////////////////////////////////////////////////////////////
++#define EPEL_FILTER(src, stride) \
++ (filter[0] * src[x - stride] + \
++ filter[1] * src[x] + \
++ filter[2] * src[x + stride] + \
++ filter[3] * src[x + 2 * stride])
++
++static void FUNC(put_hevc_epel_h)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Horizontal EPEL filter into an int16_t buffer whose rows are MAX_PB_SIZE apart; my is not referenced.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // coefficient row mx - 1, read by EPEL_FILTER
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); // stride 1: taps run along the row
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
++
++static void FUNC(put_hevc_epel_v)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Vertical EPEL filter into an int16_t buffer whose rows are MAX_PB_SIZE apart; mx is not referenced.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; // coefficient row my - 1, read by EPEL_FILTER
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); // stride srcstride: taps run down the column
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
++
++static void FUNC(put_hevc_epel_hv)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Two-pass EPEL: horizontal filter into tmp, then vertical filter over tmp into dst (rows MAX_PB_SIZE apart).
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal pass uses row mx - 1
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE entries each
++ int16_t *tmp = tmp_array;
++
++ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical pass has rows before row 0
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // rewind to the row matching output row 0
++ filter = ff_hevc_rpi_epel_filters[my - 1]; // vertical pass uses row my - 1
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
++ tmp += MAX_PB_SIZE;
++ dst += MAX_PB_SIZE;
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Single-source prediction with a horizontal EPEL filter, rounded and clipped to pixels; my is not referenced.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // coefficient row mx - 1, read by EPEL_FILTER
++ int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // rounding term: half of 1 << shift
++#else
++ int offset = 0;
++#endif
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Bi-prediction: horizontally EPEL-filtered _src plus src2, rounded and clipped; my is not referenced.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // coefficient row mx - 1, read by EPEL_FILTER
++ int shift = 14 + 1 - BIT_DEPTH; // one bit more than the uni case because two terms are summed
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // rounding term: half of 1 << shift
++#else
++ int offset = 0;
++#endif
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ }
++ dst += dststride;
++ src += srcstride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE entries apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Single-source prediction with a vertical EPEL filter, rounded and clipped to pixels; mx is not referenced.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; // coefficient row my - 1, read by EPEL_FILTER
++ int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // rounding term: half of 1 << shift
++#else
++ int offset = 0;
++#endif
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Bi-prediction: vertically EPEL-filtered _src plus src2, rounded and clipped; mx is not referenced.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; // coefficient row my - 1, read by EPEL_FILTER
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ int shift = 14 + 1 - BIT_DEPTH; // one bit more than the uni case because two terms are summed
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // rounding term: half of 1 << shift
++#else
++ int offset = 0;
++#endif
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ dst += dststride;
++ src += srcstride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE entries apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Single-source prediction, two passes: horizontal EPEL into tmp, then vertical EPEL over tmp, rounded and clipped.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal pass uses row mx - 1
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE entries each
++ int16_t *tmp = tmp_array;
++ int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // rounding term: half of 1 << shift
++#else
++ int offset = 0;
++#endif
++
++ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical pass has rows before row 0
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // rewind to the row matching output row 0
++ filter = ff_hevc_rpi_epel_filters[my - 1]; // vertical pass uses row my - 1
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Bi-prediction, two passes: horizontal EPEL into tmp, vertical EPEL over tmp, then add src2, round and clip.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal pass uses row mx - 1
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE entries each
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH; // one bit more than the uni case because two terms are summed
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // rounding term: half of 1 << shift
++#else
++ int offset = 0;
++#endif
++
++ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical pass has rows before row 0
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // rewind to the row matching output row 0
++ filter = ff_hevc_rpi_epel_filters[my - 1]; // vertical pass uses row my - 1
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE entries apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{ // Weighted single-source prediction with a horizontal EPEL filter; my is not referenced.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // coefficient row mx - 1, read by EPEL_FILTER
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // rounding term: half of 1 << shift
++#else
++ int offset = 0;
++#endif
++
++ ox = ox * (1 << (BIT_DEPTH - 8)); // scale the offset by 2^(BIT_DEPTH - 8)
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); // filter, weight, round, add offset, clip
++ }
++ dst += dststride;
++ src += srcstride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{ // Weighted bi-prediction: horizontally EPEL-filtered _src (weight wx1) combined with src2 (weight wx0).
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // coefficient row mx - 1, read by EPEL_FILTER
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); // scale both offsets by 2^(BIT_DEPTH - 8)
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); // multiply, not <<: ox0 + ox1 + 1 may be negative
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE entries apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{ // Weighted single-source prediction with a vertical EPEL filter; mx is not referenced.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; // coefficient row my - 1, read by EPEL_FILTER
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // rounding term: half of 1 << shift
++#else
++ int offset = 0;
++#endif
++
++ ox = ox * (1 << (BIT_DEPTH - 8)); // scale the offset by 2^(BIT_DEPTH - 8)
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); // filter, weight, round, add offset, clip
++ }
++ dst += dststride;
++ src += srcstride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{ // Weighted bi-prediction: vertically EPEL-filtered _src (weight wx1) combined with src2 (weight wx0).
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; // coefficient row my - 1, read by EPEL_FILTER
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); // scale both offsets by 2^(BIT_DEPTH - 8)
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); // multiply, not <<: ox0 + ox1 + 1 may be negative
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE entries apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{ // Weighted single-source prediction, two passes: horizontal EPEL into tmp, then vertical EPEL over tmp.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal pass uses row mx - 1
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE entries each
++ int16_t *tmp = tmp_array;
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // rounding term: half of 1 << shift
++#else
++ int offset = 0;
++#endif
++
++ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical pass has rows before row 0
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // rewind to the row matching output row 0
++ filter = ff_hevc_rpi_epel_filters[my - 1]; // vertical pass uses row my - 1
++
++ ox = ox * (1 << (BIT_DEPTH - 8)); // scale the offset by 2^(BIT_DEPTH - 8)
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); // filter, weight, round, add offset, clip
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{ // Weighted bi-prediction, two passes: horizontal EPEL into tmp, vertical EPEL over tmp, then combine with src2.
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixels
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixels
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal pass uses row mx - 1
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE entries each
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical pass has rows before row 0
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // rewind to the row matching output row 0
++ filter = ff_hevc_rpi_epel_filters[my - 1]; // vertical pass uses row my - 1
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); // scale both offsets by 2^(BIT_DEPTH - 8)
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); // multiplication form avoids shifting a negative sum
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE entries apart
++ }
++}
++
++// line zero: samples across the edge relative to pix; P3..P0 at offsets -4..-1, Q0..Q3 at 0..3 (in units of xstride)
++#define P3 pix[-4 * xstride]
++#define P2 pix[-3 * xstride]
++#define P1 pix[-2 * xstride]
++#define P0 pix[-1 * xstride]
++#define Q0 pix[0 * xstride]
++#define Q1 pix[1 * xstride]
++#define Q2 pix[2 * xstride]
++#define Q3 pix[3 * xstride]
++
++// line three (same samples, 3 * ystride further along the edge). used only for deblocking decision
++#define TP3 pix[-4 * xstride + 3 * ystride]
++#define TP2 pix[-3 * xstride + 3 * ystride]
++#define TP1 pix[-2 * xstride + 3 * ystride]
++#define TP0 pix[-1 * xstride + 3 * ystride]
++#define TQ0 pix[0 * xstride + 3 * ystride]
++#define TQ1 pix[1 * xstride + 3 * ystride]
++#define TQ2 pix[2 * xstride + 3 * ystride]
++#define TQ3 pix[3 * xstride + 3 * ystride]
++
++static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
++ ptrdiff_t _xstride, ptrdiff_t _ystride,
++ int beta, int *_tc,
++ uint8_t *_no_p, uint8_t *_no_q)
++{ // Filters one luma edge as two 4-line segments; xstride steps across the edge, ystride along it.
++ int d, j;
++ pixel *pix = (pixel *)_pix;
++ ptrdiff_t xstride = _xstride / sizeof(pixel); // byte strides -> strides in pixels
++ ptrdiff_t ystride = _ystride / sizeof(pixel);
++
++ beta <<= BIT_DEPTH - 8; // scale beta by 2^(BIT_DEPTH - 8)
++
++ for (j = 0; j < 2; j++) { // one iteration per 4-line segment, each with its own tc / no_p / no_q
++ const int dp0 = abs(P2 - 2 * P1 + P0); // second differences on line 0 (dp0, dq0) and line 3 (dp3, dq3)
++ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
++ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
++ const int d0 = dp0 + dq0;
++ const int d3 = dp3 + dq3;
++ const int tc = _tc[j] << (BIT_DEPTH - 8); // scale tc by 2^(BIT_DEPTH - 8)
++ const int no_p = _no_p[j]; // non-zero: leave the P side unmodified
++ const int no_q = _no_q[j]; // non-zero: leave the Q side unmodified
++
++ if (d0 + d3 >= beta) { // activity at or above beta: segment is left unfiltered
++ pix += 4 * ystride; // skip this segment's 4 lines
++ continue;
++ } else {
++ const int beta_3 = beta >> 3;
++ const int beta_2 = beta >> 2;
++ const int tc25 = ((tc * 5 + 1) >> 1); // 2.5 * tc, rounded
++
++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
++ // strong filtering
++ const int tc2 = tc << 1; // each sample may change by at most 2 * tc
++ for (d = 0; d < 4; d++) {
++ const int p3 = P3; // snapshot all inputs before any sample is overwritten
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ const int q3 = Q3;
++ if (!no_p) {
++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++ }
++ if (!no_q) {
++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++ }
++ pix += ystride; // next line of the segment
++ }
++ } else { // normal filtering
++ int nd_p = 1; // 2 means P1 is modified as well as P0
++ int nd_q = 1; // 2 means Q1 is modified as well as Q0
++ const int tc_2 = tc >> 1;
++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++ nd_p = 2;
++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++ nd_q = 2;
++
++ for (d = 0; d < 4; d++) {
++ const int p2 = P2; // snapshot all inputs before any sample is overwritten
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++ if (abs(delta0) < 10 * tc) { // larger steps are left untouched on this line
++ delta0 = av_clip(delta0, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ if (!no_p && nd_p > 1) {
++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++ P1 = av_clip_pixel(p1 + deltap1);
++ }
++ if (!no_q && nd_q > 1) {
++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++ Q1 = av_clip_pixel(q1 + deltaq1);
++ }
++ }
++ pix += ystride; // next line of the segment
++ }
++ }
++ }
++ }
++}
++
++static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
++ ptrdiff_t _ystride, int *_tc,
++ uint8_t *_no_p, uint8_t *_no_q)
++{ // Filters one chroma edge as two 4-line segments; only P0 and Q0 are modified.
++ int d, j, no_p, no_q;
++ pixel *pix = (pixel *)_pix;
++ ptrdiff_t xstride = _xstride / sizeof(pixel); // byte strides -> strides in pixels
++ ptrdiff_t ystride = _ystride / sizeof(pixel);
++
++ for (j = 0; j < 2; j++) { // one iteration per 4-line segment
++ const int tc = _tc[j] << (BIT_DEPTH - 8); // scale tc by 2^(BIT_DEPTH - 8)
++ if (tc <= 0) { // non-positive tc: segment is left unfiltered
++ pix += 4 * ystride;
++ continue;
++ }
++ no_p = _no_p[j]; // non-zero: leave the P side unmodified
++ no_q = _no_q[j]; // non-zero: leave the Q side unmodified
++
++ for (d = 0; d < 4; d++) {
++ int delta0;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ pix += ystride; // next line of the segment
++ }
++ }
++}
++
++static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++ int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{ // Chroma filter with P/Q samples `stride` bytes apart and successive lines one pixel apart.
++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
++}
++
++static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++ int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{ // Chroma filter with P/Q samples one pixel apart and successive lines `stride` bytes apart.
++ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
++}
++
++static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ int beta, int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{ // Luma filter with P/Q samples `stride` bytes apart and successive lines one pixel apart.
++ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
++ beta, tc, no_p, no_q);
++}
++
++static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ int beta, int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{ // Luma filter with P/Q samples one pixel apart and successive lines `stride` bytes apart.
++ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
++ beta, tc, no_p, no_q);
++}
++
++#undef P3 // drop the single-pointer accessors; they are redefined below for separate P and Q pointers
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
++
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
++
++// line zero: P3..P0 are pix_l[0..3], Q0..Q3 are pix_r[0..3] (in units of xstride)
++#define P3 pix_l[0 * xstride]
++#define P2 pix_l[1 * xstride]
++#define P1 pix_l[2 * xstride]
++#define P0 pix_l[3 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++#define Q2 pix_r[2 * xstride]
++#define Q3 pix_r[3 * xstride]
++
++// line three (same samples, 3 * ystride further along the edge). used only for deblocking decision
++#define TP3 pix_l[0 * xstride + 3 * ystride]
++#define TP2 pix_l[1 * xstride + 3 * ystride]
++#define TP1 pix_l[2 * xstride + 3 * ystride]
++#define TP0 pix_l[3 * xstride + 3 * ystride]
++#define TQ0 pix_r[0 * xstride + 3 * ystride]
++#define TQ1 pix_r[1 * xstride + 3 * ystride]
++#define TQ2 pix_r[2 * xstride + 3 * ystride]
++#define TQ3 pix_r[3 * xstride + 3 * ystride]
++
++// This is identical to hevc_loop_filter_luma except that the P/Q
++// components are on separate pointers, tc is packed into tc2 and the no_p/no_q flags into no_f
++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++ uint8_t * _pix_l)
++{ // _pix_r points at Q0, _pix_l at P3 (see the P/Q macros); both advance by _stride per line.
++ int d, j;
++ pixel *pix_l = (pixel *)_pix_l;
++ pixel *pix_r = (pixel *)_pix_r;
++ const ptrdiff_t xstride = 1; // P/Q samples are adjacent pixels
++ const ptrdiff_t ystride = _stride / sizeof(pixel); // byte stride -> stride in pixels
++
++ beta <<= BIT_DEPTH - 8; // scale beta by 2^(BIT_DEPTH - 8)
++
++ for (j = 0; j < 2; j++) { // one iteration per 4-line segment
++ const int dp0 = abs(P2 - 2 * P1 + P0); // second differences on line 0 (dp0, dq0) and line 3 (dp3, dq3)
++ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
++ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
++ const int d0 = dp0 + dq0;
++ const int d3 = dp3 + dq3;
++ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8); // segment j's tc is the j-th 16-bit field of tc2
++ const int no_p = no_f & 1; // bit 0: leave the P side unmodified (same for both segments)
++ const int no_q = no_f & 2; // bit 1: leave the Q side unmodified (same for both segments)
++
++ if (d0 + d3 >= beta) { // activity at or above beta: segment is left unfiltered
++ pix_l += 4 * ystride;
++ pix_r += 4 * ystride;
++ continue;
++ } else {
++ const int beta_3 = beta >> 3;
++ const int beta_2 = beta >> 2;
++ const int tc25 = ((tc * 5 + 1) >> 1); // 2.5 * tc, rounded
++
++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
++ // strong filtering
++ const int tc2 = tc << 1; // NOTE: shadows the tc2 parameter inside this block; here it is the 2 * tc clip bound
++ for (d = 0; d < 4; d++) {
++ const int p3 = P3; // snapshot all inputs before any sample is overwritten
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ const int q3 = Q3;
++ if (!no_p) {
++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++ }
++ if (!no_q) {
++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++ }
++ pix_l += ystride; // next line of the segment
++ pix_r += ystride;
++ }
++ } else { // normal filtering
++ int nd_p = 1; // 2 means P1 is modified as well as P0
++ int nd_q = 1; // 2 means Q1 is modified as well as Q0
++ const int tc_2 = tc >> 1;
++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++ nd_p = 2;
++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++ nd_q = 2;
++
++ for (d = 0; d < 4; d++) {
++ const int p2 = P2; // snapshot all inputs before any sample is overwritten
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++ if (abs(delta0) < 10 * tc) { // larger steps are left untouched on this line
++ delta0 = av_clip(delta0, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ if (!no_p && nd_p > 1) {
++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++ P1 = av_clip_pixel(p1 + deltap1);
++ }
++ if (!no_q && nd_q > 1) {
++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++ Q1 = av_clip_pixel(q1 + deltaq1);
++ }
++ }
++ pix_l += ystride; // next line of the segment
++ pix_r += ystride;
++ }
++ }
++ }
++ }
++}
++
++static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f)
++{
++ // Just call the non-2 function having massaged the parameters
++ int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16}; // low 16 bits: segment 0, high 16 bits: segment 1
++ uint8_t no_p[2] = {no_f & 1, no_f & 1}; // bit 0 of no_f, applied to both segments
++ uint8_t no_q[2] = {no_f & 2, no_f & 2}; // bit 1 of no_f (stored as 0 or 2; callee only tests for non-zero)
++ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q);
++}
++
++#undef TP3 // drop the luma split-pointer accessors before defining the chroma ones
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
++
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
++
++#define P1 pix_l[0 * xstride] // chroma accessors: P1, P0 read from pix_l; Q0, Q1 read from pix_r
++#define P0 pix_l[1 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++
++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
++ ptrdiff_t _ystride, const int32_t *_tc,
++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
++{ // Same arithmetic as hevc_loop_filter_chroma, with the P side read via _pix_l and the Q side via _pix_r.
++ int d, j, no_p, no_q;
++ pixel *pix_l = (pixel *)_pix_l;
++ pixel *pix_r = (pixel *)_pix_r;
++ ptrdiff_t xstride = _xstride / sizeof(pixel); // byte strides -> strides in pixels
++ ptrdiff_t ystride = _ystride / sizeof(pixel);
++
++ for (j = 0; j < 2; j++) { // one iteration per 4-line segment
++ const int tc = _tc[j] << (BIT_DEPTH - 8); // scale tc by 2^(BIT_DEPTH - 8)
++ if (tc <= 0) { // non-positive tc: segment is left unfiltered
++ pix_l += 4 * ystride;
++ pix_r += 4 * ystride;
++ continue;
++ }
++ no_p = _no_p[j]; // non-zero: leave the P side unmodified
++ no_q = _no_q[j]; // non-zero: leave the Q side unmodified
++
++ for (d = 0; d < 4; d++) {
++ int delta0;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ pix_l += ystride; // next line of the segment
++ pix_r += ystride;
++ }
++ }
++}
++
++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
++ unsigned int no_f)
++{ // Runs the chroma filter twice over samples 2 pixels apart: once from pix, once from pix + one pixel (presumably interleaved U/V).
++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0 and 1 of no_f: P-side flags for segments 0 and 1
++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2 and 3 of no_f: Q-side flags for segments 0 and 1
++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // four 8-bit tc values, lowest byte first
++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); // first component: tc[0], tc[1]
++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); // second component: tc[2], tc[3], same flags
++}
++
++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++ uint8_t * src_l,
++ unsigned int no_f)
++{ // Split-pointer chroma filter run twice: P/Q samples are 2 pixels apart (presumably interleaved U/V), lines `stride` bytes apart.
++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0 and 1 of no_f: P-side flags for segments 0 and 1
++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2 and 3 of no_f: Q-side flags for segments 0 and 1
++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // four 8-bit tc values, lowest byte first
++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); // first component: tc[0], tc[1]
++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); // second component: tc[2], tc[3], same flags
++}
++
++#undef P1 // remove the chroma split-pointer accessors defined for the uv2 filters
++#undef P0
++#undef Q0
++#undef Q1
++
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.c
+@@ -0,0 +1,161 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcdec.h"
++
++#include "rpi_hevcpred.h"
++#if (ARCH_ARM)
++#include "arm/rpi_hevcpred_arm.h"
++#endif
++
++#define PRED_C 0 // first set: template included once per bit depth (8, 9, 10, 12) with PRED_C 0
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
++
++#define PRED_C 1 // second set: same bit depths with PRED_C 1 (presumably the _c-suffixed variants used by ff_hevc_rpi_pred_init)
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
++// Fill *hpc with the C intra-prediction functions for bit_depth (9, 10 or 12; any other value selects the 8-bit set), then run any arch-specific init.
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth)
++{
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth
++// FUNC(a, depth) pastes the luma name a_<depth>; FUNCC(a, depth) pastes the chroma name a_<depth>_c
++#undef FUNCC
++#define FUNCC(a, depth) a ## _ ## depth ## _c
++// Luma table. pred_vertical and pred_horizontal are given the same generic functions as pred_angular here.
++#define HEVC_PRED_Y(depth) \
++ hpc->intra_pred = FUNC(intra_pred, depth); \
++ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \
++ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \
++ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \
++ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \
++ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \
++ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
++ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
++ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
++ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \
++ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \
++ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \
++ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \
++ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \
++ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
++ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \
++ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \
++ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \
++ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \
++ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth);
++// Chroma table: same layout as HEVC_PRED_Y, but it fills the *_c members with the _c functions.
++#define HEVC_PRED_C(depth) \
++ hpc->intra_pred_c = FUNCC(intra_pred, depth); \
++ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \
++ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \
++ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \
++ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \
++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
++ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \
++ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \
++ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \
++ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \
++ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \
++ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \
++ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \
++ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \
++ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \
++ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth);
++
++#define HEVC_PRED(depth) \
++ HEVC_PRED_Y(depth); \
++ HEVC_PRED_C(depth);
++// Only 9, 10 and 12 are special-cased; every other bit_depth falls through to the 8-bit functions.
++ switch (bit_depth) {
++ case 9:
++ HEVC_PRED(9);
++ break;
++ case 10:
++ HEVC_PRED(10);
++ break;
++ case 12:
++ HEVC_PRED(12);
++ break;
++ default:
++ HEVC_PRED(8);
++ break;
++ }
++// Arch hooks run after the C table is filled. NOTE(review): presumably they override entries with optimised versions; their declarations are not in this chunk - confirm.
++#if (ARCH_ARM)
++ ff_hevc_rpi_pred_init_arm(hpc, bit_depth);
++#elif (ARCH_MIPS)
++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
++#endif
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.h
+@@ -0,0 +1,123 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCPRED_H
++#define AVCODEC_RPI_HEVCPRED_H
++
++#include <stddef.h>
++#include <stdint.h>
++#include "config.h"
++
++struct HEVCRpiContext;
++struct HEVCRpiLocalContext;
++/* Intra prediction modes: planar = 0, DC = 1, then INTRA_ANGULAR_n has the value n (2..34). */
++enum IntraPredMode {
++ INTRA_PLANAR = 0,
++ INTRA_DC,
++ INTRA_ANGULAR_2,
++ INTRA_ANGULAR_3,
++ INTRA_ANGULAR_4,
++ INTRA_ANGULAR_5,
++ INTRA_ANGULAR_6,
++ INTRA_ANGULAR_7,
++ INTRA_ANGULAR_8,
++ INTRA_ANGULAR_9,
++ INTRA_ANGULAR_10,
++ INTRA_ANGULAR_11,
++ INTRA_ANGULAR_12,
++ INTRA_ANGULAR_13,
++ INTRA_ANGULAR_14,
++ INTRA_ANGULAR_15,
++ INTRA_ANGULAR_16,
++ INTRA_ANGULAR_17,
++ INTRA_ANGULAR_18,
++ INTRA_ANGULAR_19,
++ INTRA_ANGULAR_20,
++ INTRA_ANGULAR_21,
++ INTRA_ANGULAR_22,
++ INTRA_ANGULAR_23,
++ INTRA_ANGULAR_24,
++ INTRA_ANGULAR_25,
++ INTRA_ANGULAR_26,
++ INTRA_ANGULAR_27,
++ INTRA_ANGULAR_28,
++ INTRA_ANGULAR_29,
++ INTRA_ANGULAR_30,
++ INTRA_ANGULAR_31,
++ INTRA_ANGULAR_32,
++ INTRA_ANGULAR_33,
++ INTRA_ANGULAR_34,
++};
++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 /* alias for mode 10 */
++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 /* alias for mode 26 */
++/* Type of the intra_filter[] / intra_filter_c[] entries: fills the left and top reference arrays from src_l / src_u / src_ur. */
++typedef void intra_filter_fn_t(
++ uint8_t * const left, uint8_t * const top, /* outputs */
++ const unsigned int req, const unsigned int avail, /* AVAIL_* (plus FILTER_*) bits wanted / AVAIL_* bits present */
++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, /* left, upper and upper-right source samples */
++ const unsigned int stride, /* in bytes: the C implementations divide it by sizeof(pixel) */
++ const unsigned int top_right_size, const unsigned int down_left_size); /* pels taken from the UR / DL source before padding */
++/* Table of intra-prediction entry points, filled by ff_hevc_rpi_pred_init(). The [4] arrays are indexed by log2_size - 2 at the call sites. */
++typedef struct HEVCRpiPredContext {
++ void (*intra_pred)(const struct HEVCRpiContext * const s,
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
++ const unsigned int avail, const unsigned int log2_size);
++
++ intra_filter_fn_t *intra_filter[4];
++ void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride);
++ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride);
++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); /* used when no neighbour is available at all */
++ /* Chroma (_c) counterparts of the members above, with identical signatures */
++ void (*intra_pred_c)(const struct HEVCRpiContext * const s,
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
++ const unsigned int avail, const unsigned int log2_size);
++ intra_filter_fn_t *intra_filter_c[4];
++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride);
++ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride);
++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride);
++} HEVCRpiPredContext;
++
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth);
++
++#endif /* AVCODEC_RPI_HEVCPRED_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred_template.c
+@@ -0,0 +1,1407 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "config.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "bit_depth_template.c"
++
++#include "rpi_hevcdec.h"
++#include "rpi_hevcpred.h"
++
++#define DUMP_PRED 0
++
++#define POS(x, y) src[(x) + stride * (y)]
++
++// INCLUDED_ONCE defined at EOF, so this section is only compiled while that macro is still undefined
++#ifndef INCLUDED_ONCE
++typedef uint8_t (* c8_dst_ptr_t)[2]; // writable pairs of 8-bit samples
++typedef const uint8_t (* c8_src_ptr_t)[2]; // read-only pairs of 8-bit samples
++typedef uint16_t (* c16_dst_ptr_t)[2]; // writable pairs of 16-bit samples
++typedef const uint16_t (* c16_src_ptr_t)[2]; // read-only pairs of 16-bit samples
++
++// *** On ARM make these NEON registers
++typedef struct pixel4_16 {
++ uint16_t x[4];
++} pixel4_16;
++typedef struct pixel4_32 {
++ uint32_t x[4];
++} pixel4_32;
++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) // returns a pixel4_16 with all four elements set to x
++{
++ pixel4_16 t = {{x, x, x, x}};
++ return t;
++}
++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) // returns a pixel4_32 with all four elements set to x
++{
++ pixel4_32 t = {{x, x, x, x}};
++ return t;
++}
++#endif
++
++#if PRED_C
++// For chroma we double pixel size so we copy pairs: "pixel" becomes a type twice as wide as one "cpel" sample
++#undef pixel
++#undef pixel2
++#undef pixel4
++#undef dctcoef
++#undef INIT_CLIP
++#undef no_rnd_avg_pixel4
++#undef rnd_avg_pixel4
++#undef AV_RN2P
++#undef AV_RN4P
++#undef AV_RN4PA
++#undef AV_WN2P
++#undef AV_WN4P
++#undef AV_WN4PA
++#undef CLIP
++#undef FUNC
++#undef FUNCC
++#undef av_clip_pixel
++#undef PIXEL_SPLAT_X4
++
++#if BIT_DEPTH == 8
++#define pixel uint16_t
++#define pixel4 pixel4_16
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
++#define cpel uint8_t
++#define c_src_ptr_t c8_src_ptr_t
++#define c_dst_ptr_t c8_dst_ptr_t
++#else
++#define pixel uint32_t
++#define pixel4 pixel4_32
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
++#define cpel uint16_t
++#define c_src_ptr_t c16_src_ptr_t // const view, matching the 8-bit branch
++#define c_dst_ptr_t c16_dst_ptr_t
++#endif
++#define AV_RN4P(p) (*(pixel4*)(p)) // read four (pair-sized) pixels through a pixel4 pointer
++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) // write four (pair-sized) pixels through a pixel4 pointer
++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) // chroma names get a _c suffix (FUNC2 is defined outside this chunk)
++#endif
++
++// NOTE(review): PW looks like the width in bytes of one sample (1 at 8 bits, 2 otherwise) - confirm against its users
++// Get PW prior to horrid PRED_C trickery
++#if BIT_DEPTH == 8
++#define PW 1
++#else
++#define PW 2
++#endif
++
++// Debug helper, built only when DUMP_PRED is non-zero: prints a size x size grid taking every second byte of each row, rows being stride * 2 bytes apart
++#if DUMP_PRED && !defined(INCLUDED_ONCE)
++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
++{
++ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
++ for (unsigned int x = 0; x != size; x++) {
++ printf("%4d", data[x * 2]); // even bytes only
++ }
++ printf("\n");
++ }
++ printf("\n");
++}
++#endif
++
++#ifndef INCLUDED_ONCE
++static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) // fill n bytes at ptr with v; n is rounded down to a multiple of 4
++{
++ if ((n >>= 2) != 0) { // n is now the number of 32-bit words to store
++ uint32_t v4 = v | (v << 8); // assumes v fits in 8 bits, otherwise the copies overlap - TODO confirm callers
++ uint32_t * p = (uint32_t *)ptr; // NOTE(review): word stores, so ptr presumably has to be 4-byte aligned - confirm
++ v4 = v4 | (v4 << 16); // v replicated into all four bytes
++ do {
++ *p++ = v4;
++ } while (--n != 0);
++ }
++}
++// Fill n 16-bit elements at ptr with v; n is rounded down to a multiple of 4 (two 32-bit stores per group of four)
++static inline void extend_16(void * ptr, const unsigned int v, unsigned int n)
++{
++ if ((n >>= 2) != 0) {
++ uint32_t v2 = v | (v << 16); // assumes v fits in 16 bits - TODO confirm callers
++ uint32_t * p = (uint32_t *)ptr;
++ do {
++ *p++ = v2;
++ *p++ = v2;
++ } while (--n != 0);
++ }
++}
++// Fill n 32-bit elements at ptr with v; n is rounded down to a multiple of 4 (four stores per loop iteration)
++static inline void extend_32(void * ptr, const unsigned int v, unsigned int n)
++{
++ if ((n >>= 2) != 0) {
++ uint32_t * p = (uint32_t *)ptr;
++ do {
++ *p++ = v;
++ *p++ = v;
++ *p++ = v;
++ *p++ = v;
++ } while (--n != 0);
++ }
++}
++// Left-edge availability mask for CIP: one bit per 4-pel unit, DL in the low bits, then L, with UL at bit (size/4)*2; groups whose is_intra flag is clear are removed
++// Beware that this inverts the avail ordering
++// For CIP it seems easier this way round
++static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask,
++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
++ unsigned int s0, unsigned int odd_s)
++{
++ const unsigned int n = 1 << log2_intra_bits; // mask bits (4-pel units) governed by each is_intra entry
++ unsigned int fa = 0; // result mask being built
++ unsigned int i;
++
++ size >>= 2; // Now in 4-pel units
++ s0 >>= 2;
++
++ if ((avail & AVAIL_DL) != 0)
++ fa |= ((1 << s0) - 1) << (size - s0); // s0 DL units, placed just below the L bits
++ if ((avail & AVAIL_L) != 0)
++ fa |= ((1 << size) - 1) << size; // L occupies bits size .. 2*size-1
++ if ((avail & AVAIL_UL) != 0)
++ fa |= 1 << (size << 1); // UL is the single bit above L
++
++ if (odd_s) { // first unit is tested on its own against the first is_intra entry
++ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0)
++ fa &= ~1;
++ is_intra += i_stride;
++ }
++
++ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { // stops once no set bits remain at or above i
++ const unsigned int m = ((1 << n) - 1) << i; // the n bits covered by this is_intra entry
++ if ((fa & m) != 0 && (*is_intra & i_mask) == 0)
++ fa &= ~m;
++ }
++
++ return fa;
++}
++// Upper-edge counterpart of cip_avail_l: U units in bits 0 .. size/4-1, UR units above them; returns 0 at once if neither U nor UR is available
++static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift,
++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
++ unsigned int s1, unsigned int odd_s)
++{
++ if ((avail & (AVAIL_U | AVAIL_UR)) == 0)
++ {
++ return 0;
++ }
++ else
++ {
++ const unsigned int n = 1 << log2_intra_bits; // mask bits (4-pel units) governed by each intra flag
++ unsigned int fa = 0;
++ unsigned int i;
++ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; // 16 flag bits from two bytes, aligned so bit 0 is the first flag used
++
++ size >>= 2; // Now in 4-pel units
++ s1 >>= 2;
++
++ if ((avail & AVAIL_U) != 0)
++ fa |= ((1 << size) - 1);
++ if ((avail & AVAIL_UR) != 0)
++ fa |= ((1 << s1) - 1) << size;
++
++ if (odd_s) { // first unit is governed by one flag on its own
++ fa &= im | ~1; // clears bit 0 only when flag bit 0 is clear
++ im >>= 1;
++ }
++
++ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { // one flag bit consumed per group of n units
++ const unsigned int m = ((1 << n) - 1) << i;
++ if ((im & 1) == 0)
++ fa &= ~m;
++ }
++ return fa;
++ }
++}
++
++
++// Index of the lowest set bit of x. The active branch is __builtin_ctz(), so x must be non-zero; the #else branch is a disabled portable fallback
++static inline unsigned int rmbd(unsigned int x)
++{
++#if 1
++ return __builtin_ctz(x);
++#else
++ unsigned int n = 0;
++ if ((x & 0xffff) == 0) {
++ x >>= 16;
++ n += 16;
++ }
++ if ((x & 0xff) == 0) {
++ x >>= 8;
++ n += 8;
++ }
++ if ((x & 0xf) == 0) {
++ x >>= 4;
++ n += 4;
++ }
++ if ((x & 0x3) == 0) {
++ x >>= 2;
++ n += 2;
++ }
++
++ return (x & 1) == 0 ? n + 1 : n;
++#endif
++}
++#endif
++#endif
++
++// Fill left[-1 .. 2*size-1] and top[0 .. 2*size-1] from the source edges using 4-pel availability masks; unavailable units repeat the last available pel seen, working DL -> L -> UL -> U -> UR
++static void FUNC(cip_fill)(pixel * const left, pixel * const top,
++ const unsigned int avail_l, const unsigned int avail_u,
++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
++ const unsigned int stride,
++ const unsigned int size)
++{
++ pixel a; // most recent available pel; set below before first use whenever bit 0 of avail_l is clear
++ unsigned int i;
++
++ // 1st find DL value
++ if ((avail_l & 1) == 0) {
++ if (avail_l != 0)
++ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; // bottom pel of the lowest available left unit
++ else
++ {
++ // (avail_l | avail_u) != 0 so this must be good
++ const unsigned int n = rmbd(avail_u)*4; // pel offset of the first available upper unit
++ a = (n >= size) ? src_ur[n - size] : src_u[n];
++ }
++ }
++
++ // L
++ {
++ pixel * d = left + size * 2 - 1; // written bottom-up, ending at left[-1]
++ const pixel * s = src_l + (size * 2 - 1) * stride;
++ unsigned int x = avail_l; // bit 0 describes the bottom-most 4 pels
++ for (i = 0; i < size * 2; i += 4, x >>= 1)
++ {
++ if ((x & 1) != 0) {
++ // Avail
++ *d-- = *s;
++ s -= stride;
++ *d-- = *s;
++ s -= stride;
++ *d-- = *s;
++ s -= stride;
++ *d-- = a = *s; // remember the top pel of this unit for later padding
++ s -= stride;
++ }
++ else
++ {
++ *d-- = a;
++ *d-- = a;
++ *d-- = a;
++ *d-- = a;
++ s -= stride * 4;
++ }
++ }
++ // UL
++ *d = a = (x & 1) != 0 ? *s : a;
++ }
++
++ // U
++ {
++ pixel * d = top;
++ const pixel * s = src_u;
++ unsigned int x = avail_u; // bit 0 describes the leftmost 4 pels
++
++ for (i = 0; i < size; i += 4, x >>= 1)
++ {
++ if ((x & 1) != 0) {
++ // Avail
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = a = *s++;
++ }
++ else
++ {
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ s += 4;
++ }
++ }
++
++ // UR
++ s = src_ur; // UR has its own source pointer; d and x simply carry on from U
++ for (i = 0; i < size; i += 4, x >>= 1)
++ {
++ if ((x & 1) != 0) {
++ // Avail
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = a = *s++;
++ }
++ else
++ {
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ s += 4;
++ }
++ }
++ }
++}
++
++// EXTEND picks the fill helper from PRED_C and PW: luma PW 1 -> extend_8; luma PW 2 or chroma PW 1 -> extend_16; chroma PW 2 -> extend_32
++#if !PRED_C && PW == 1
++#define EXTEND(ptr, val, len) extend_8(ptr, val, len)
++#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1)
++#define EXTEND(ptr, val, len) extend_16(ptr, val, len)
++#else
++#define EXTEND(ptr, val, len) extend_32(ptr, val, len)
++#endif
++
++// Reqs (upper case = neighbour is required, lower case = not needed):
++//
++// Planar: DL[0], L, ul, U, UR[0]
++// DC: dl, L, ul, U, ur
++// A2-9: DL, L, ul, u, ur
++// A10: dl, L, ul, u, ur
++// A11-17: dl, L, UL, U, ur
++// A18-25: dl, L, UL, U, ur
++// A26: dl, l, ul, U, ur
++// A27-34: dl, l, ul, U, UR
++
++#ifndef INCLUDED_ONCE
++// Prototypes, declared through the function typedef, for filters defined outside this chunk (NOTE(review): presumably the ARM NEON assembly versions - confirm)
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
++// Chroma requirements, indexed by mode: the AVAIL_* neighbours each mode needs (no FILTER_* flags are set in this table)
++static const uint8_t req_avail_c[35] =
++{
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L, // 2
++ AVAIL_DL | AVAIL_L, // 3
++ AVAIL_DL | AVAIL_L, // 4
++ AVAIL_DL | AVAIL_L, // 5
++ AVAIL_DL | AVAIL_L, // 6
++ AVAIL_DL | AVAIL_L, // 7
++ AVAIL_DL | AVAIL_L, // 8
++ AVAIL_DL | AVAIL_L, // 9
++ AVAIL_L, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
++ AVAIL_U, // 26 (V)
++ AVAIL_U | AVAIL_UR, // 27
++ AVAIL_U | AVAIL_UR, // 28
++ AVAIL_U | AVAIL_UR, // 29
++ AVAIL_U | AVAIL_UR, // 30
++ AVAIL_U | AVAIL_UR, // 31
++ AVAIL_U | AVAIL_UR, // 32
++ AVAIL_U | AVAIL_UR, // 33
++ AVAIL_U | AVAIL_UR // 34
++};
++// Luma requirements, indexed [log2_size - 2][mode]: AVAIL_* neighbours needed plus FILTER_* smoothing flags (both flag sets are defined outside this chunk)
++static const uint8_t req_avail[4][35] = {
++{ // 2
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L, // 2
++ AVAIL_DL | AVAIL_L, // 3
++ AVAIL_DL | AVAIL_L, // 4
++ AVAIL_DL | AVAIL_L, // 5
++ AVAIL_DL | AVAIL_L, // 6
++ AVAIL_DL | AVAIL_L, // 7
++ AVAIL_DL | AVAIL_L, // 8
++ AVAIL_DL | AVAIL_L, // 9
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V)
++ AVAIL_U | AVAIL_UR, // 27
++ AVAIL_U | AVAIL_UR, // 28
++ AVAIL_U | AVAIL_UR, // 29
++ AVAIL_U | AVAIL_UR, // 30
++ AVAIL_U | AVAIL_UR, // 31
++ AVAIL_U | AVAIL_UR, // 32
++ AVAIL_U | AVAIL_UR, // 33
++ AVAIL_U | AVAIL_UR // 34
++},
++{ // 3
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
++ AVAIL_DL | AVAIL_L | 0, // 3
++ AVAIL_DL | AVAIL_L | 0, // 4
++ AVAIL_DL | AVAIL_L | 0, // 5
++ AVAIL_DL | AVAIL_L | 0, // 6
++ AVAIL_DL | AVAIL_L | 0, // 7
++ AVAIL_DL | AVAIL_L | 0, // 8
++ AVAIL_DL | AVAIL_L | 0, // 9
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
++ AVAIL_U | AVAIL_UR | 0, // 27
++ AVAIL_U | AVAIL_UR | 0, // 28
++ AVAIL_U | AVAIL_UR | 0, // 29
++ AVAIL_U | AVAIL_UR | 0, // 30
++ AVAIL_U | AVAIL_UR | 0, // 31
++ AVAIL_U | AVAIL_UR | 0, // 32
++ AVAIL_U | AVAIL_UR | 0, // 33
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
++},
++{ // 4
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8
++ AVAIL_DL | AVAIL_L | 0, // 9
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
++ AVAIL_U | AVAIL_UR | 0, // 27
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
++},
++{ // 5
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9
++ AVAIL_L | 0, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25
++ AVAIL_U | 0, // 26 (V)
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33
++ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34
++}
++};
++
++
++#endif
++// One output of the light smoothing filter: rounded (a + 2*b + c) / 4, with b the pel being filtered and a, c its two neighbours
++#define filter_light1 FUNC(filter_light1)
++static inline pixel filter_light1(pixel a, pixel b, pixel c)
++{
++ return (a + b*2 + c + 2) >> 2;
++}
++// Light-filter n pels read from src (step sstride) into dst; p1 is the pel before src[0], pn the pel after the last one. n must be at least 2 or the counter below wraps
++#define filter_light FUNC(filter_light)
++static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n)
++{
++ pixel p0;
++ pixel p2 = *src;
++ // Allow for final pel - it is just clearer to have the call take the actual number of output pels
++ unsigned int n_minus_1 = n - 1;
++
++ do
++ {
++ src += sstride;
++ p0 = p1; // slide the three-pel window along by one
++ p1 = p2;
++ p2 = *src;
++ *dst++ = filter_light1(p0, p1, p2);
++ } while (--n_minus_1 != 0);
++ *dst = filter_light1(p1, p2, pn); // last output takes its far neighbour from pn rather than from src
++}
++// Strong filter: writes dst[i] = (64*p0 + (i+1)*(p1 - p0) + 32) >> 6 for i = 0 .. n-1, a straight line from p0 towards p1. n must be non-zero (do/while)
++#define filter_strong FUNC(filter_strong)
++static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n)
++{
++ unsigned int a = 64 * p0 + 32; // accumulator in 1/64 units with the rounding offset already added
++ const int v = p1 - p0; // per-pel increment; may be negative
++
++ do
++ {
++ *dst++ = (a += v) >> 6;
++ } while (--n != 0);
++}
++// Build the left[] / top[] reference arrays for one block: copy neighbours that are available, pad those required but missing, and apply light or strong smoothing when req asks for it
++#define intra_filter FUNC(intra_filter)
++static av_always_inline void intra_filter(
++ pixel * const left, pixel * const top,
++ const unsigned int req, const unsigned int avail,
++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
++ const unsigned int stride,
++ const unsigned int top_right_size, const unsigned int down_left_size,
++ const unsigned int log2_size)
++{
++ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5);
++ const unsigned int size = 1 << log2_size;
++
++ // a_ is the first pel in a section working round dl -> ur
++ // b_ is the last
++ // Beware that top & left work out from UL so usage of a_ & b_ may
++ // swap between them. It is a bad naming scheme but I have found no
++ // better
++ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride;
++ const pixel * b_dl = src_l + size * stride;
++ const pixel * a_l = src_l + (size - 1) * stride;
++ const pixel * b_l = src_l;
++ const pixel * ab_ul = src_l - stride;
++ const pixel * a_u = src_u;
++ const pixel * b_u = src_u + size - 1;
++ const pixel * a_ur = src_ur;
++ const pixel * b_ur = src_ur + top_right_size - 1;
++
++ const unsigned int want = req & ~avail; // required but not available: must be padded
++ const unsigned int have = req & avail; // required and available: copied or filtered
++ unsigned int i;
++ // Redirect the pointers of missing neighbours at existing pels; missing DL takes the first available of L, UL, U, UR
++ if ((avail & AVAIL_DL) == 0)
++ {
++ a_dl = a_ur;
++ if ((avail & AVAIL_U) != 0)
++ a_dl = a_u;
++ if ((avail & AVAIL_UL) != 0)
++ a_dl = ab_ul;
++ if ((avail & AVAIL_L) != 0)
++ a_dl = a_l;
++ b_dl = a_dl;
++ }
++ // Each later section, if missing, inherits the end pel of the section before it (DL -> L -> UL -> U -> UR)
++ if ((avail & AVAIL_L) == 0)
++ {
++ a_l = b_dl;
++ b_l = b_dl;
++ }
++ if ((avail & AVAIL_UL) == 0)
++ {
++ ab_ul = b_l;
++ }
++ if ((avail & AVAIL_U) == 0)
++ {
++ a_u = ab_ul;
++ b_u = ab_ul;
++ }
++ if ((avail & AVAIL_UR) == 0)
++ {
++ a_ur = b_u;
++ b_ur = b_u;
++ }
++ // Unfiltered path: taken when no filter is requested, and always for chroma and for log2_size 2
++ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints
++ {
++ if ((req & AVAIL_UL) != 0)
++ left[-1] = *ab_ul;
++
++ if ((want & AVAIL_L) != 0)
++ EXTEND(left, *a_l, size);
++ if ((want & AVAIL_DL) != 0)
++ EXTEND(left + size, *a_dl, size);
++ if ((want & AVAIL_U) != 0)
++ EXTEND(top, *a_u, size);
++ if ((want & AVAIL_UR) != 0)
++ EXTEND(top + size, *a_ur, size);
++
++ if ((have & AVAIL_U) != 0)
++ // Always good - even with sand
++ memcpy(top, a_u, size * sizeof(pixel));
++ if ((have & AVAIL_UR) != 0)
++ {
++ memcpy(top + size, a_ur, top_right_size * sizeof(pixel));
++ EXTEND(top + size + top_right_size, *b_ur,
++ size - top_right_size); // pad the rest of UR with its last copied pel
++ }
++ if ((have & AVAIL_L) != 0)
++ {
++ for (i = 0; i < size; i++)
++ left[i] = b_l[stride * i]; // gather the left column into a contiguous array
++ }
++ if ((have & AVAIL_DL) != 0)
++ {
++ for (i = 0; i < down_left_size; i++)
++ left[i + size] = b_dl[stride * i];
++ EXTEND(left + size + down_left_size, *a_dl,
++ size - down_left_size); // pad the rest of DL with its last copied pel
++ }
++ }
++ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint
++ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && // left edge: |end - 2*middle + corner| below threshold
++ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) // top edge: same test
++ {
++ if ((req & (AVAIL_U | AVAIL_UR)) != 0)
++ filter_strong(top, *ab_ul, *b_ur, size * 2);
++ left[-1] = *ab_ul;
++ if ((req & (AVAIL_L | AVAIL_DL)) != 0)
++ filter_strong(left, *ab_ul, *a_dl, size*2);
++ }
++ else
++ {
++ // Same code for both have & want for UL
++ if ((req & AVAIL_UL) != 0)
++ {
++ left[-1] = filter_light1(*b_l, *ab_ul, *a_u);
++ }
++ // Padded sections are constant, so only the pel touching a neighbouring section needs the filter applied
++ if ((want & AVAIL_L) != 0)
++ {
++ EXTEND(left, *a_l, size);
++ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2;
++ }
++ if ((want & AVAIL_DL) != 0)
++ {
++ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding
++ EXTEND(left + size, *a_l, size);
++ }
++ if ((want & AVAIL_U) != 0)
++ {
++ EXTEND(top, *a_u, size);
++ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2;
++ }
++ if ((want & AVAIL_UR) != 0)
++ {
++ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding
++ EXTEND(top + size, *a_ur, size);
++ }
++
++ if ((have & AVAIL_U) != 0)
++ {
++ filter_light(top, *ab_ul, a_u, *a_ur, 1, size);
++ }
++ if ((have & AVAIL_UR) != 0) {
++ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size);
++ top[size*2 - 1] = *b_ur; // final pel of the top array is stored unfiltered
++ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size);
++ }
++ if ((have & AVAIL_L) != 0)
++ {
++ filter_light(left, *ab_ul, b_l, *b_dl, stride, size);
++ }
++ if ((have & AVAIL_DL) != 0)
++ {
++ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size);
++ left[size*2 - 1] = *a_dl; // final pel of the left array is stored unfiltered
++ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size);
++ }
++ }
++}
++// Emit one intra_filter_fn_t-compatible wrapper per log2_size: casts the byte pointers to pixel, converts the byte stride to pixels, and passes log2_size as a constant
++#define INTRA_FILTER(log2_size) \
++static void FUNC(intra_filter_ ## log2_size)( \
++ uint8_t * const left, uint8_t * const top, \
++ const unsigned int req, const unsigned int avail, \
++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \
++ const unsigned int stride, \
++ const unsigned int top_right_size, const unsigned int down_left_size) \
++{ \
++ intra_filter((pixel *)left, (pixel *)top, req, avail, \
++ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \
++}
++// Instantiations for log2_size 2 to 5; these are the functions placed in intra_filter[0..3]
++INTRA_FILTER(2)
++INTRA_FILTER(3)
++INTRA_FILTER(4)
++INTRA_FILTER(5)
++
++#undef intra_filter
++#undef INTRA_FILTER
++
++static void FUNC(intra_pred)(const HEVCRpiContext * const s,
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail,
++ const unsigned int log2_size)
++{
++ // c_idx will always be 1 for _c versions and 0 for y
++ const unsigned int c_idx = PRED_C;
++ const unsigned int hshift = ctx_hshift(s, c_idx);
++ const unsigned int vshift = ctx_vshift(s, c_idx);
++ const unsigned int size = (1 << log2_size);
++ const unsigned int x = x0 >> hshift;
++ const unsigned int y = y0 >> vshift;
++
++ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
++ pixel *const src = c_idx == 0 ?
++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
++
++ // Align so we can do multiple loads in the asm
++ // Padded to 16 byte boundary so as not to confuse anything
++ DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]);
++ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
++
++ pixel * const left = left_array + 16 / sizeof(pixel);
++ const pixel * top_pred = top;
++
++ const pixel * src_l = src - 1;
++ const pixel * src_u = src - stride;
++ const pixel * src_ur = src_u + size;
++#if !PRED_C
++ const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable;
++#else
++ const unsigned int req = req_avail_c[mode];
++#endif
++
++ // If we have nothing to pred from then fill with grey
++ // This isn't a common case but dealing with it here means we don't have to
++ // test for it later
++ if (avail == 0)
++ {
++dc_only:
++#if !PRED_C
++ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride);
++#else
++ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride);
++#endif
++ return;
++ }
++
++ {
++ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs
++ const AVFrame * const frame = s->frame;
++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
++ if ((x & mask) == 0)
++ src_l -= stripe_adj;
++ if (((x + size) & mask) == 0)
++ src_ur += stripe_adj;
++ }
++
++ // Can deal with I-slices in 'normal' code even if CIP
++ // This also means that we don't need to generate (elsewhere) is_intra
++ // for IRAP frames
++ if (s->ps.pps->constrained_intra_pred_flag == 1 &&
++ s->sh.slice_type != HEVC_SLICE_I)
++ {
++ // * If we ever actually care about CIP performance then we should
++ // special case out size 4 stuff (can be done by 'normal') and
++ // have 8-pel avail masks
++ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)),
++ -(int)(s->ps.sps->pcm_width),
++ 1 << (((x - 1) >> (3 - hshift)) & 7),
++ 1 - hshift,
++ avail,
++ size,
++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size),
++ vshift != 0 ? 0 : (y >> 2) & 1);
++
++ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)),
++ (x >> (3 - hshift)) & 7,
++ 1 - hshift,
++ avail,
++ size,
++ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size),
++ hshift != 0 ? 0 : (x >> 2) & 1);
++
++ // Anything left?
++ if ((avail_l | avail_u) == 0)
++ goto dc_only;
++
++ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size);
++
++#if !PRED_C
++ if ((req & FILTER_LIGHT) != 0)
++ {
++ const unsigned threshold = 1 << (BIT_DEPTH - 5);
++ if ((req & FILTER_STRONG) != 0 &&
++ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold &&
++ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold)
++ {
++ filter_strong(top, left[-1], top[63], 64);
++ filter_strong(left, left[-1], left[63], 64);
++ } else
++ {
++ // LHS writes UL too so copy for top
++ const pixel p_ul = left[-1];
++ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size);
++ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1);
++ }
++ }
++#endif
++ }
++ else
++ {
++ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size);
++ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 &&
++ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size))
++ {
++ top_pred = src_u;
++ }
++ else
++ {
++#if !PRED_C
++ s->hpc.intra_filter[log2_size - 2]
++#else
++ s->hpc.intra_filter_c[log2_size - 2]
++#endif
++ ((uint8_t *)left, (uint8_t *)top, req, avail,
++ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel),
++ ur_size,
++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size));
++ }
++ }
++
++
++#if !PRED_C
++ switch (mode) {
++ case INTRA_PLANAR:
++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_DC:
++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_ANGULAR_HORIZONTAL:
++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ case INTRA_ANGULAR_VERTICAL:
++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ default:
++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ }
++#else
++ switch (mode) {
++ case INTRA_PLANAR:
++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_DC:
++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_ANGULAR_HORIZONTAL:
++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ case INTRA_ANGULAR_VERTICAL:
++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ default:
++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ }
++
++#if DUMP_PRED
++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
++#endif
++#endif
++}
++
++#if !PRED_C
++static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
++ const uint8_t *_left, ptrdiff_t stride,
++ int trafo_size)
++{ // Planar prediction, one value per position; trafo_size is log2 of the block edge length
++ int x, y;
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
++ int size = 1 << trafo_size;
++ for (y = 0; y < size; y++)
++ for (x = 0; x < size; x++)
++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + // horizontal blend of left[y] and top[size]
++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); // vertical blend of top[x] and left[size]; "+ size" rounds the combined sum
++}
++#else
++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
++ const uint8_t * _left, ptrdiff_t stride,
++ int trafo_size)
++{ // Planar prediction for two-component elements: the same formula is applied to [0] and [1] independently
++ int x, y;
++ int size = 1 << trafo_size;
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ const c_src_ptr_t top = (c_src_ptr_t)_top;
++ const c_src_ptr_t left = (c_src_ptr_t)_left;
++
++ for (y = 0; y < size; y++, src += stride) // src moves on by stride elements for each y
++ {
++ for (x = 0; x < size; x++)
++ {
++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + // component 0
++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + // component 1
++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
++ }
++ }
++}
++#endif
++
++#define PRED_PLANAR(size)\
++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \
++ const uint8_t *left, ptrdiff_t stride) \
++{ \
++ FUNC(pred_planar)(src, top, left, stride, size + 2); \
++}
++
++PRED_PLANAR(0) // pred_planar_0: trafo_size 2, i.e. 4x4
++PRED_PLANAR(1) // pred_planar_1: trafo_size 3, i.e. 8x8
++PRED_PLANAR(2) // pred_planar_2: trafo_size 4, i.e. 16x16
++PRED_PLANAR(3) // pred_planar_3: trafo_size 5, i.e. 32x32
++
++#undef PRED_PLANAR
++
++#if !PRED_C
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int log2_size)
++{ // DC prediction: fill the block with the rounded mean of the first size entries of left[] and top[]
++ int i, j, x, y;
++ int size = (1 << log2_size);
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
++ int dc = size; // seeded with size so the shift below rounds to nearest
++ pixel4 a;
++ for (i = 0; i < size; i++)
++ dc += left[i] + top[i];
++
++ dc >>= log2_size + 1; // divide by the 2 * size values that were summed
++
++ a = PIXEL_SPLAT_X4(dc);
++
++ for (i = 0; i < size; i++)
++ for (j = 0; j < size; j+=4)
++ AV_WN4P(&POS(j, i), a); // j steps by 4: one store per group of four positions
++
++// if (c_idx == 0 && size < 32)
++// As we now have separate fns for y & c - no need to test that
++ if (size < 32)
++ { // for blocks smaller than 32, blend row 0 and column 0 with the adjacent reference values
++ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
++ for (x = 1; x < size; x++)
++ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2;
++ for (y = 1; y < size; y++)
++ POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
++ }
++}
++#else
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int log2_size)
++{ // DC prediction for two-component elements: a separate mean is computed for [0] and [1]; no edge blending here
++ unsigned int i, j;
++ const unsigned int size = (1 << log2_size);
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ const c_src_ptr_t top = (c_src_ptr_t)_top;
++ const c_src_ptr_t left = (c_src_ptr_t)_left;
++ unsigned int dc0 = size; // both sums are seeded with size for rounding
++ unsigned int dc1 = size;
++
++ for (i = 0; i < size; i++)
++ {
++ dc0 += left[i][0] + top[i][0];
++ dc1 += left[i][1] + top[i][1];
++ }
++
++ dc0 >>= log2_size + 1; // divide each sum by 2 * size
++ dc1 >>= log2_size + 1;
++
++ for (i = 0; i < size; i++, src += stride) // src moves on by stride elements for each i
++ {
++ for (j = 0; j < size; ++j)
++ {
++ src[j][0] = dc0;
++ src[j][1] = dc1;
++
++ }
++ }
++}
++#endif
++
++#define PRED_DC(size)\
++static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \
++ const uint8_t *left, ptrdiff_t stride) \
++{ \
++ FUNC(pred_dc)(src, top, left, stride, size + 2); \
++}
++
++PRED_DC(0) // pred_dc_0: log2_size 2, i.e. 4x4
++PRED_DC(1) // pred_dc_1: log2_size 3, i.e. 8x8
++PRED_DC(2) // pred_dc_2: log2_size 4, i.e. 16x16
++PRED_DC(3) // pred_dc_3: log2_size 5, i.e. 32x32
++
++#undef PRED_DC
++
++
++
++
++#if !PRED_C
++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
++{ // Fill the block with the constant 1 << (BIT_DEPTH - 1); takes no reference samples at all
++ int i, j;
++ int size = (1 << log2_size);
++ pixel *src = (pixel *)_src;
++ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1));
++
++ for (i = 0; i < size; i++)
++ for (j = 0; j < size; j+=4)
++ AV_WN4P(&POS(j, i), a); // j steps by 4: one store per group of four positions
++}
++#else
++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
++{ // Two-component variant: both [0] and [1] of every element get 1 << (BIT_DEPTH - 1)
++ unsigned int i, j;
++ const unsigned int size = (1 << log2_size);
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ const pixel a = (1 << (BIT_DEPTH - 1));
++
++ for (i = 0; i < size; i++, src += stride) // src moves on by stride elements for each i
++ {
++ for (j = 0; j < size; ++j)
++ {
++ src[j][0] = a;
++ src[j][1] = a;
++ }
++ }
++}
++#endif
++
++#define PRED_DC0(size)\
++static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \
++{ \
++ FUNC(pred_dc0)(src, stride, size + 2); \
++}
++
++PRED_DC0(0) // pred_dc0_0: log2_size 2, i.e. 4x4
++PRED_DC0(1) // pred_dc0_1: log2_size 3, i.e. 8x8
++PRED_DC0(2) // pred_dc0_2: log2_size 4, i.e. 16x16
++PRED_DC0(3) // pred_dc0_3: log2_size 5, i.e. 32x32
++
++#undef PRED_DC0
++
++
++
++
++#ifndef ANGLE_CONSTS
++#define ANGLE_CONSTS // guard: the tables are emitted once even if this template is included repeatedly
++static const int intra_pred_angle[] = { // 33 entries, looked up as intra_pred_angle[mode - 2]; used in 1/32 steps (>> 5, & 31)
++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
++};
++static const int inv_angle[] = { // 15 entries, looked up as inv_angle[mode - 11]; used with (x * inv + 128) >> 8
++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
++ -630, -910, -1638, -4096
++};
++#endif
++
++#if !PRED_C
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++ const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride,
++ int mode, int size)
++{ // Angular prediction, one value per position; size is the block edge length (not its log2)
++ int x, y;
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
++
++ int angle = intra_pred_angle[mode - 2];
++ pixel ref_array[3 * MAX_TB_SIZE + 4];
++ pixel *ref_tmp = ref_array + size; // leaves size entries below ref_tmp[0] for negative indices
++ const pixel *ref;
++ int last = (size * angle) >> 5; // lowest reference index reached when angle is negative
++
++ if (mode >= 18) { // modes 18 and up take their reference from top[] and are produced row by row
++ ref = top - 1;
++
++ if (angle < 0)
++ { // negative angle: build an extended reference in ref_tmp, projecting left[] into the negative indices
++ memcpy(ref_tmp + 1, top, size * PW);
++ ref_tmp[0] = left[-1];
++
++ for (x = last; x <= -1; x++)
++ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++ ref = ref_tmp;
++ }
++
++ for (y = 0; y < size; y++) {
++ int idx = ((y + 1) * angle) >> 5; // whole-sample offset for this row
++ int fact = ((y + 1) * angle) & 31; // fractional part in 1/32 units
++ if (fact) {
++ for (x = 0; x < size; x += 4) { // unrolled by four: weighted mix of two adjacent reference values
++ POS(x , y) = ((32 - fact) * ref[x + idx + 1] +
++ fact * ref[x + idx + 2] + 16) >> 5;
++ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
++ fact * ref[x + 1 + idx + 2] + 16) >> 5;
++ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
++ fact * ref[x + 2 + idx + 2] + 16) >> 5;
++ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
++ fact * ref[x + 3 + idx + 2] + 16) >> 5;
++ }
++ } else { // zero fraction: plain copy of the reference, four values per access
++ for (x = 0; x < size; x += 4)
++ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
++ }
++ }
++ if (mode == 26 && size < 32) { // mode 26 has angle 0: column 0 is additionally adjusted using left[]
++ for (y = 0; y < size; y++)
++ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
++ }
++
++ } else { // modes below 18 take their reference from left[] and are produced column by column
++ ref = left - 1;
++ if (angle < 0 && last < -1) {
++ for (x = 0; x <= size; x += 4) // copies left[-1] onwards into ref_tmp in groups of four
++ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
++ // Inv angle <= -256 so top offset >= 0
++ for (x = last; x <= -1; x++)
++ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++ ref = ref_tmp;
++ }
++
++ for (x = 0; x < size; x++) {
++ int idx = ((x + 1) * angle) >> 5; // whole-sample offset for this column
++ int fact = ((x + 1) * angle) & 31; // fractional part in 1/32 units
++ if (fact) {
++ for (y = 0; y < size; y++) {
++ POS(x, y) = ((32 - fact) * ref[y + idx + 1] +
++ fact * ref[y + idx + 2] + 16) >> 5;
++ }
++ } else {
++ for (y = 0; y < size; y++)
++ POS(x, y) = ref[y + idx + 1];
++ }
++ }
++ if (mode == 10 && size < 32) { // mode 10 has angle 0: row 0 is additionally adjusted using top[]
++ for (x = 0; x < size; x += 4) {
++ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - left[-1]) >> 1));
++ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1));
++ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1));
++ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1));
++ }
++ }
++ }
++}
++#else
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++ const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride,
++ int mode, int size)
++{ // Angular prediction for two-component elements; unlike the variant above it has no mode 10 / 26 edge adjustment
++ int x, y;
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ c_src_ptr_t top = (c_src_ptr_t)_top;
++ c_src_ptr_t left = (c_src_ptr_t)_left;
++
++ const int angle = intra_pred_angle[mode - 2];
++ cpel ref_array[3 * MAX_TB_SIZE + 4][2];
++ c_dst_ptr_t ref_tmp = ref_array + size; // leaves size elements below ref_tmp[0] for negative indices
++ c_src_ptr_t ref;
++ const int last = (size * angle) >> 5; // lowest reference index reached when angle is negative
++
++ if (mode >= 18) { // reference taken from top[], output produced row by row
++ ref = top - 1;
++ if (angle < 0) {
++ memcpy(ref_tmp + 1, top, size * 2 * PW); // 2 * PW bytes per element (two components)
++ ref_tmp[0][0] = left[-1][0];
++ ref_tmp[0][1] = left[-1][1];
++ for (x = last; x <= -1; x++)
++ {
++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++ }
++ ref = (c_src_ptr_t)ref_tmp;
++ }
++
++ for (y = 0; y < size; y++, src += stride) {
++ const int idx = ((y + 1) * angle) >> 5; // whole-element offset for this row
++ const int fact = ((y + 1) * angle) & 31; // fractional part in 1/32 units
++ if (fact) {
++ for (x = 0; x < size; ++x) {
++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
++ fact * ref[x + idx + 2][0] + 16) >> 5;
++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
++ fact * ref[x + idx + 2][1] + 16) >> 5;
++ }
++ } else { // zero fraction: the whole row is a straight copy of the reference
++ memcpy(src, ref + idx + 1, size * 2 * PW);
++ }
++ }
++ } else { // reference taken from left[], output produced column by column
++ ref = left - 1;
++ if (angle < 0 && last < -1) {
++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
++ for (x = last; x <= -1; x++)
++ {
++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++ }
++ ref = (c_src_ptr_t)ref_tmp;
++ }
++
++ for (x = 0; x < size; x++, src++) { // src steps one element along per column; rows are reached via y * stride
++ const int idx = ((x + 1) * angle) >> 5;
++ const int fact = ((x + 1) * angle) & 31;
++ if (fact) {
++ for (y = 0; y < size; y++) {
++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
++ fact * ref[y + idx + 2][0] + 16) >> 5;
++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
++ fact * ref[y + idx + 2][1] + 16) >> 5;
++ }
++ } else {
++ for (y = 0; y < size; y++)
++ {
++ src[y * stride][0] = ref[y + idx + 1][0];
++ src[y * stride][1] = ref[y + idx + 1][1];
++ }
++ }
++ }
++ }
++}
++#endif
++
++static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int mode)
++{ // fixed-size entry point: 4x4
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
++}
++
++static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int mode)
++{ // fixed-size entry point: 8x8
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
++}
++
++static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int mode)
++{ // fixed-size entry point: 16x16
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
++}
++
++static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int mode)
++{ // fixed-size entry point: 32x32
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
++}
++
++#undef cpel
++#undef c_src_ptr_t
++#undef c_dst_ptr_t
++
++#undef EXTEND
++#undef POS
++#undef PW
++
++#undef filter_light1
++#undef filter_light
++#undef filter_strong
++#undef ref_gen
++
++#ifndef INCLUDED_ONCE
++#define INCLUDED_ONCE
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.c
+@@ -0,0 +1,155 @@
++/*
++Copyright (c) 2012, Broadcom Europe Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <stdio.h>
++#include <string.h>
++#include <stdlib.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <stdint.h>
++#include <sys/ioctl.h>
++
++#include <linux/ioctl.h>
++
++#define MAJOR_NUM 100
++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
++#define DEVICE_FILE_NAME "/dev/vcio"
++
++#include "rpi_mailbox.h"
++//#include <interface/vctypes/vc_image_structs.h>
++
++/*
++ * use ioctl to send mbox property message
++ */
++
++static int mbox_property(int file_desc, void *buf)
++{ // Passes buf to the mailbox device via IOCTL_MBOX_PROPERTY; returns the ioctl() result (negative on failure)
++    int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
++
++    if (ret_val < 0) {
++        printf("ioctl_set_msg failed:%d\n", ret_val);
++    }
++
++#ifdef DEBUG
++    unsigned *p = buf; unsigned i; unsigned size = *(unsigned *)buf; // callers store the total byte size in word 0
++    for (i=0; i<size/4; i++)
++        printf("%04zx: 0x%08x\n", i*sizeof *p, p[i]); // the byte offset is a size_t, so it must be printed with %zx
++#endif
++    return ret_val;
++}
++
++#define GET_VCIMAGE_PARAMS 0x30044
++
++int mbox_get_image_params(int fd, VC_IMAGE_T * img)
++{ // Sends *img in a GET_VCIMAGE_PARAMS property message and copies the payload area back into *img; returns mbox_property()'s result
++ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32]; // payload words plus 32 words of room for header and end tag
++ uint32_t * p = buf;
++ void * rimg;
++ int rv;
++
++ *p++ = 0; // size
++ *p++ = 0; // process request
++ *p++ = GET_VCIMAGE_PARAMS; // tag id
++ *p++ = sizeof(*img); // both length words of the tag are set to the payload size
++ *p++ = sizeof(*img);
++ rimg = p; // remember where the payload sits so it can be read back below
++ memcpy(p, img, sizeof(*img));
++ p += sizeof(*img) / sizeof(*p);
++ *p++ = 0; // End tag
++ buf[0] = (p - buf) * sizeof(*p); // total message size in bytes, now that the length is known
++
++ rv = mbox_property(fd, buf);
++ memcpy(img, rimg, sizeof(*img)); // copied back regardless of rv
++
++ return rv;
++}
++
++
++#define SET_CLOCK_RATE 0x00038002
++#define GET_MAX_CLOCK 0x00030004
++#define CLOCK_HEVC 11
++
++static int mbox_property_generic(int fd, unsigned command, unsigned *word0, unsigned *word1)
++{ // Sends a single tag (command) carrying two 32-bit value words and writes two words of the reply back through word0/word1
++ uint32_t buf[32];
++ uint32_t * p = buf;
++ int rv;
++
++ *p++ = 0; // size
++ *p++ = 0; // process request
++ *p++ = command; // buf[2]: tag id
++ *p++ = 8; // buf[3], buf[4]: both length words are 8 (two 32-bit values)
++ *p++ = 8;
++ *p++ = *word0; // buf[5]
++ *p++ = *word1; // buf[6]
++ *p++ = 0; // End tag
++ buf[0] = (p - buf) * sizeof(*p); // total message size in bytes
++
++ rv = mbox_property(fd, buf);
++ *word0 = buf[6]; // NOTE(review): word0 went out in buf[5] but is read back from buf[6]; mbox_request_clock relies on this
++ *word1 = buf[7]; // NOTE(review): buf[7] is the end-tag slot of the request - confirm this is intended
++ return rv;
++}
++
++int mbox_open(void) { // (void) gives the definition a real prototype, matching the declaration in rpi_mailbox.h
++    int file_desc;
++
++    // Open the character device used to talk to the kernel mailbox driver (read-only access is all ioctl needs here)
++    file_desc = open(DEVICE_FILE_NAME, O_RDONLY);
++    if (file_desc < 0) {
++        printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
++        printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
++    }
++    return file_desc; // descriptor on success, negative open() result on failure
++}
++
++void mbox_close(int file_desc) { // closes a descriptor obtained from mbox_open(); the close() result is not reported
++ close(file_desc);
++}
++
++int mbox_request_clock(int fd) {
++    // Issue GET_MAX_CLOCK for the HEVC clock, then SET_CLOCK_RATE with the value that came back.
++    unsigned int clk_id = CLOCK_HEVC;
++    unsigned int rate = 0;
++    const int err = mbox_property_generic(fd, GET_MAX_CLOCK, &clk_id, &rate);
++
++    if (err != 0)
++        return err;
++    rate = clk_id;  // the helper hands the wanted reply word back through its first word argument
++    clk_id = CLOCK_HEVC;
++    return mbox_property_generic(fd, SET_CLOCK_RATE, &clk_id, &rate);
++}
++
++int mbox_release_clock(int fd) {
++    // Issue SET_CLOCK_RATE for the HEVC clock with a rate word of 0 and
++    // return the helper's status unchanged.
++    unsigned int clk_id = CLOCK_HEVC;
++    unsigned int rate = 0;
++
++    return mbox_property_generic(fd, SET_CLOCK_RATE, &clk_id, &rate);
++}
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.h
+@@ -0,0 +1,58 @@
++#ifndef RPI_MAILBOX_H
++#define RPI_MAILBOX_H
++
++#include <stdint.h>  /* uint32_t / uint8_t used by VC_IMAGE_T below */
++
++/* The image structure. */
++typedef struct vc_image_extra_uv_s {
++    void *u, *v;
++    int vpitch;
++} VC_IMAGE_EXTRA_UV_T;
++
++typedef union {
++    VC_IMAGE_EXTRA_UV_T uv;
++//    VC_IMAGE_EXTRA_RGBA_T rgba;
++//    VC_IMAGE_EXTRA_PAL_T pal;
++//    VC_IMAGE_EXTRA_TF_T tf;
++//    VC_IMAGE_EXTRA_BAYER_T bayer;
++//    VC_IMAGE_EXTRA_MSBAYER_T msbayer;
++//    VC_IMAGE_EXTRA_CODEC_T codec;
++//    VC_IMAGE_EXTRA_OPENGL_T opengl;
++} VC_IMAGE_EXTRA_T;
++
++typedef struct VC_IMAGE_T {
++    unsigned short type; /* should restrict to 16 bits */
++    unsigned short info; /* format-specific info; zero for VC02 behaviour */
++    unsigned short width; /* width in pixels */
++    unsigned short height; /* height in pixels */
++    int pitch; /* pitch of image_data array in bytes */
++    int size; /* number of bytes available in image_data array */
++    void *image_data; /* pixel data */
++    VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */
++    void *metadata; /* metadata header for the image */
++    void *pool_object; /* nonNULL if image was allocated from a vc_pool */
++    int mem_handle; /* the mem handle for relocatable memory storage */
++    int metadata_size; /* size of metadata of each channel in bytes */
++    int channel_offset; /* offset of consecutive channels in bytes */
++    uint32_t video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
++    uint8_t num_channels; /* number of channels (2 for stereo) */
++    uint8_t current_channel;/* the channel this header is currently pointing to */
++    uint8_t linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/
++    uint8_t is_channel_linked; /* Track if the above structure has been used to link the header
++                                   into a linked-multichannel image */
++    uint8_t channel_index; /* index of the channel this header represents while
++                               it is being linked. */
++    uint8_t _dummy[3]; /* pad struct to 64 bytes */
++} VC_IMAGE_T;
++
++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; /* array size is -1 (compile error) unless the struct is 64 bytes; the fields add up to 64 with 4-byte pointers and ints */
++
++
++extern int mbox_open(void);
++extern void mbox_close(int file_desc);
++
++int mbox_get_image_params(int fd, VC_IMAGE_T * img);
++
++int mbox_request_clock(int fd);
++int mbox_release_clock(int fd);
++
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_mem.c
+@@ -0,0 +1,326 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++
++#include <stdlib.h>
++#include <string.h>
++#include <stddef.h>
++#include <stdint.h>
++
++#include "config.h"
++
++#include "libavutil/avassert.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <bcm_host.h>
++#include <interface/vctypes/vc_image_types.h>
++#include <interface/vcsm/user-vcsm.h>
++#pragma GCC diagnostic pop
++
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++
++
++#define OPT_PREFER_CMA 0
++
++struct rpi_cache_flush_env_s { // flush environment: a thin wrapper around the VCSM clean/invalidate request
++ struct vcsm_user_clean_invalid2_s v; // op_count and the block array s[] are filled in by the rpi_cache_flush_add_* functions
++};
++
++
++// GPU memory alloc fns (internal)
++
++static void gpu_free_internal(GPU_MEM_PTR_T * const p)
++{ // Releases whatever parts of *p are set, so it is also usable on a partially set-up allocation
++ if (p->arm != NULL)
++ vcsm_unlock_ptr(p->arm); // drop the ARM-side mapping obtained with vcsm_lock()
++ if (p->vcsm_handle != 0)
++ vcsm_free(p->vcsm_handle);
++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
++}
++
++
++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
++ const int numbytes, const unsigned int cache_type, const char * const name)
++{ // Allocates numbytes (rounded up to a multiple of 256) from VCSM and fills *p; returns 0 or AVERROR(ENOMEM)
++ memset(p, 0, sizeof(*p)); // start from all-zero so the fail path only releases what was actually acquired
++ p->numbytes = (numbytes + 255) & ~255; // Round up
++
++ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0) // NOTE(review): meaning of the 0x80 flag is not visible here
++ {
++ av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name);
++ goto fail;
++ }
++ if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0)
++ {
++ av_log(NULL, AV_LOG_ERROR, "Unable to VC handle from VCSM for %s\n", name);
++ goto fail;
++ }
++ if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL) // ARM-side pointer to the block
++ {
++ av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name);
++ goto fail;
++ }
++ if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
++ {
++ av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name);
++ goto fail;
++ }
++
++ return 0;
++
++fail:
++ gpu_free_internal(p); // releases anything acquired so far and zeroes *p
++ return AVERROR(ENOMEM); // every failure is reported as ENOMEM, whichever step failed
++}
++
++// Public gpu fns
++
++// Allocate memory on GPU with cache type VCSM_CACHE_TYPE_NONE
++// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
++// Returns 0 on success, AVERROR(ENOMEM) on failure (<p> is then zeroed).
++// This allocates memory that will not be cached in ARM's data cache.
++// Therefore safe to use without data cache flushing.
++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
++{
++ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached");
++}
++
++// This allocates data (cache type VCSM_CACHE_TYPE_HOST) that will be
++// Cached in ARM L2
++// Uncached in VPU L2
++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
++{ // same result convention as gpu_malloc_internal(): 0 on success, AVERROR(ENOMEM) on failure
++ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached");
++}
++
++void gpu_free(GPU_MEM_PTR_T * const p) { // public wrapper: releases the allocation and zeroes *p
++ gpu_free_internal(p);
++}
++
++void rpi_mem_gpu_uninit(void)
++{ // Counterpart of rpi_mem_gpu_init(): shuts down VCSM, then bcm_host
++ vcsm_exit();
++ bcm_host_deinit();
++}
++
++int rpi_mem_gpu_init(const unsigned int flags)
++{ // Initialises VCSM and bcm_host. Returns 2 (GPU_INIT_CMA) or 1 (GPU_INIT_GPU) for the mode that initialised, AVERROR(EINVAL) if neither did
++    const int wants_cma = bcm_host_is_fkms_active() ? 1 : 0;  // preferred mode: CMA when FKMS is active
++    int use_cma;
++
++    (void)flags;  // currently unused
++
++    if (vcsm_init_ex(wants_cma, -1) == 0)
++        use_cma = wants_cma;   // preferred mode worked: report that mode, not a fixed 1
++    else if (vcsm_init_ex(!wants_cma, -1) == 0)
++        use_cma = !wants_cma;  // fell back to the other mode: report the fallback, not a fixed 0
++    else
++        return AVERROR(EINVAL);
++
++    bcm_host_init();
++
++    return use_cma + 1;
++}
++
++// ----------------------------------------------------------------------------
++//
++// Cache flush functions
++
++#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s))
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf)
++{ // Turns caller-provided storage into an empty flush environment; no allocation takes place
++ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf; // the environment lives inside *buf
++ *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}}; // no operations queued yet
++ return rfe;
++}
++
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
++{ // Abandons a flush environment without executing it
++ // Nothing needed - this function releases nothing and leaves rfe untouched
++}
++
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
++{ // Submits all queued blocks in one vcsm_clean_invalid2() call; returns 0 or AVERROR(errno). An empty queue is a no-op
++    int status = 0;
++
++    if (rfe->v.op_count == 0)
++        return status;
++    if (vcsm_clean_invalid2(&rfe->v) != 0) {
++        const int saved_errno = errno;  // captured before av_log() gets a chance to change it
++        av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", saved_errno);
++        status = AVERROR(saved_errno);
++    }
++    rfe->v.op_count = 0;  // the queue is emptied whether or not the call succeeded
++    return status;
++}
++
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
++{
++    // Finishing is just one last execute: flush whatever is still queued
++    // and hand its status straight back to the caller.
++    return rpi_cache_flush_execute(rfe);
++}
++
++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
++{ // Queues one entry: `blocks` blocks of block_size bytes, block_stride apart, starting at gm->arm + offset0
++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; // take the next free slot
++
++ av_assert1(rfe->v.op_count <= CACHE_EL_MAX); // checked after the increment but before the slot is written
++
++ b->invalidate_mode = mode;
++ b->block_count = blocks;
++ b->start_address = gm->arm + offset0; // gm is dereferenced unconditionally: it must not be NULL here
++ b->block_size = block_size;
++ b->inter_block_stride = block_stride;
++}
++
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset, const unsigned int size)
++{ // Queues a single contiguous range [offset, offset + size) of gm; silently ignores NULL gm or zero size
++ // Deal with empty pointer trivially
++ if (gm == NULL || size == 0)
++ return;
++
++ av_assert1(offset <= gm->numbytes); // offset and size are checked separately as well as summed
++ av_assert1(size <= gm->numbytes);
++ av_assert1(offset + size <= gm->numbytes);
++
++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); // one block, so the stride is irrelevant
++}
++
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
++{ // Queues the whole of gm as one block; unlike rpi_cache_flush_add_gm_range() there is no NULL check on gm
++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
++}
++
++
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
++{ // Queues every GPU buffer backing frame in its entirety
++#if !RPI_ONE_BUF
++#error Fixme! (NIF)
++#endif
++ if (gpu_is_buf1(frame)) { // presumably: the frame lives in a single GPU buffer - verify against gpu_is_buf1()
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
++ }
++ else
++ { // otherwise three buffers, indices 0..2, each queued whole
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
++ }
++}
++
++// Flush an area of a frame
++// Width, height, x0, y0 in luma pels
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++ const unsigned int uv_shift, const int do_luma, const int do_chroma)
++{ // do_luma / do_chroma select which planes get queued; x0 and width are only used in the sand-frame branch
++ const unsigned int y_offset = frame->linesize[0] * y0; // byte offset of the first luma line
++ const unsigned int y_size = frame->linesize[0] * height; // whole luma lines: the full line width is covered
++ // Round UV up/down to get everything
++ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); // start line rounded down
++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; // end line rounded via uv_rnd
++
++#if 0
++ // *** frame->height is cropped height so not good
++ // As all unsigned they will also reject -ve
++ // Test individually as well as added to reject overflow
++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
++ av_assert0(n <= (unsigned int)frame->height);
++ av_assert0(start_line + n <= (unsigned int)frame->height);
++#endif
++
++ if (!gpu_is_buf1(frame))
++ { // three-buffer frame: offsets are relative to each plane's own buffer
++ if (do_luma) {
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
++ }
++ if (do_chroma) {
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
++ }
++ }
++ else if (!av_rpi_is_sand_frame(frame))
++ { // single buffer, non-sand layout: plane offsets are derived from frame->data[] relative to gm->arm
++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
++ if (do_luma) {
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
++ }
++ if (do_chroma) {
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
++ }
++ }
++ else
++ { // sand frame: one entry per plane type, with one block for each stride1-wide column the area touches
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); // x0 rounded down to a column boundary (assumes stride1 >> xshl is a power of two)
++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
++ av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); // NOTE(review): strict '<' here vs '<=' in rpi_cache_flush_add_gm_blocks; assumes the flags are 0 or 1
++
++ if (do_chroma)
++ {
++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++ b->invalidate_mode = mode;
++ b->block_count = block_count;
++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); // NOTE(review): fixed >> 1 here while the sizes above use uv_shift - confirm
++ b->block_size = uv_size;
++ b->inter_block_stride = stride1 * stride2; // distance between consecutive blocks
++ }
++ if (do_luma)
++ {
++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++ b->invalidate_mode = mode;
++ b->block_count = block_count;
++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
++ b->block_size = y_size;
++ b->inter_block_stride = stride1 * stride2;
++ }
++ }
++}
++
++// Call this to flush (with the given mode) the whole of a single GPU memory block in one go
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
++{
++ rpi_cache_buf_t cbuf; // stack storage for the temporary flush environment
++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
++ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
++ rpi_cache_flush_finish(rfe); // the status returned by the flush is discarded
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_mem.h
+@@ -0,0 +1,88 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_MEM_H
++#define RPI_MEM_H
++
++typedef struct gpu_mem_ptr_s {
++ unsigned char *arm; // Pointer to memory mapped on ARM side
++ int vc_handle; // Videocore handle of relocatable memory
++ int vcsm_handle; // Handle for use by VCSM
++ int vc; // Address for use in GPU code
++ int numbytes; // Size of memory block
++} GPU_MEM_PTR_T;
++
++// General GPU functions
++
++#define GPU_INIT_GPU 1
++#define GPU_INIT_CMA 2
++
++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
++extern void gpu_free(GPU_MEM_PTR_T * const p);
++int rpi_mem_gpu_init(const unsigned int flags);
++void rpi_mem_gpu_uninit(void);
++
++// Cache flush stuff
++
++struct rpi_cache_flush_env_s;
++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
++
++typedef struct {uint32_t t[33];} rpi_cache_buf_t; // Caller-provided storage passed to rpi_cache_flush_init(); NOTE(review): uint32_t needs <stdint.h>, which this header does not include itself
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & clear but do not free the env
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
++
++typedef enum
++{
++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
++} rpi_cache_flush_mode_t;
++
++struct AVFrame;
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++ const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); // NOTE(review): mode is unsigned int here but rpi_cache_flush_mode_t in the sibling declarations - confirm against the definition
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++ const unsigned int uv_shift, const int do_luma, const int do_chroma);
++
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
++
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_qpu.c
+@@ -0,0 +1,776 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <stddef.h>
++#include <stdint.h>
++#include "libavutil/avassert.h"
++
++#include "config.h"
++
++#include <pthread.h>
++#include <time.h>
++
++#include <interface/vcsm/user-vcsm.h>
++
++#include "rpi_mailbox.h"
++#include "rpi_mem.h"
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_transform8.h"
++#include "rpi_hevc_transform10.h"
++#include "libavutil/rpi_sand_fns.h"
++
++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
++
++// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
++// Beware this is expensive and will probably throw off all other timing by >10%
++#define RPI_TRACE_QPU_PROFILE_ALL 0
++
++// QPU "noflush" flags
++// a mixture of flushing & profiling
++
++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
++
++#define vcos_verify_ge0(x) ((x)>=0)
++
++// Size in 32bit words
++#define QPU_CODE_SIZE 4098
++#define VPU_CODE_SIZE 16384
++
++static const short rpi_transMatrix2even[32][16] = { // Even rows first
++{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
++{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
++{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
++{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
++{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
++{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
++{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
++{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
++{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
++{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
++{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
++{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
++{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
++{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
++{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
++{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
++// Odd rows
++{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
++{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
++{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
++{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
++{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
++{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
++{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
++{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
++{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
++{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
++{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
++{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
++{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
++{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
++{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
++{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
++};
++
++// Code/constants on GPU: layout of the "ffmpeg vpu code" allocation (code_gm_ptr) filled in by gpu_init()
++struct GPU
++{
++// unsigned int qpu_code[QPU_CODE_SIZE]; // (disabled) QPU code is held in its own allocation, qpu_code_gm_ptr
++ unsigned int vpu_code8[VPU_CODE_SIZE]; // receives rpi_hevc_transform8; address returned by vpu_get_fn(8)
++ unsigned int vpu_code10[VPU_CODE_SIZE]; // receives rpi_hevc_transform10; address returned by vpu_get_fn(10)
++ short transMatrix2even[16*16*2]; // copy of rpi_transMatrix2even; address returned by vpu_get_constants()
++};
++
++#define WAIT_COUNT_MAX 16
++
++typedef struct trace_time_one_s
++{
++ int count;
++ int64_t start[WAIT_COUNT_MAX];
++ int64_t total[WAIT_COUNT_MAX];
++} trace_time_one_t;
++
++typedef struct trace_time_wait_s
++{
++ unsigned int jcount;
++ int64_t start0;
++ int64_t last_update;
++ trace_time_one_t active;
++ trace_time_one_t wait;
++} trace_time_wait_t;
++
++typedef struct vq_wait_s // One wait object: a semaphore posted when the associated job completes
++{
++ sem_t sem; // initialised to 0 by vq_wait_pool_init(); posted by vq_wait_post(), waited on by vq_wait_wait()
++ struct vq_wait_s * next; // free-list link; NULL while the object is handed out
++} vq_wait_t;
++// Fixed-size pool of wait objects kept as a singly linked free list
++#define VQ_WAIT_POOL_SIZE 16
++typedef struct vq_wait_pool_s
++{
++ vq_wait_t * head; // first free entry, NULL when the pool is exhausted
++ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; // backing storage for every entry
++} vq_wait_pool_t;
++
++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
++
++typedef struct gpu_env_s // Process-wide GPU state; created by gpu_init(), destroyed by gpu_term()
++{
++ int open_count; // references: ++ in gpu_lock_ref(), -- in gpu_unlock_unref(); gpu_term() runs when it reaches 0
++ int init_count; // vpu_qpu_init()/vpu_qpu_term() nesting; gates vc_gpuserv_init()/vc_gpuserv_deinit()
++ int vpu_i_cache_flushed; // 0 until the first VPU job is queued, then 1; ORed into the VPU execute address
++ GPU_MEM_PTR_T qpu_code_gm_ptr; // "ffmpeg qpu code": copy of ff_hevc_rpi_shader
++ GPU_MEM_PTR_T code_gm_ptr; // "ffmpeg vpu code": laid out as struct GPU
++ GPU_MEM_PTR_T dummy_gm_ptr; // "ffmpeg dummy frame": 0x4000 bytes filled with 0x80
++ vq_wait_pool_t wait_pool; // free list of wait objects used by vq_wait_new()/vq_wait_delete()
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ trace_time_wait_t ttw; // timing statistics, only present when tracing is compiled in
++#endif
++} gpu_env_t;
++
++// Stop more than one thread trying to allocate memory or use the processing resources at once
++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
++static gpu_env_t * gpu = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++
++static int64_t ns_time(void)
++{
++ struct timespec ts;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
++}
++
++
++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
++
++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
++#define T_ARG(t) T_SEC(t), T_MS(t)
++#define T_FMT "%u.%03u"
++
++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
++{
++ // Update totals for levels that are still pending
++ for (int i = 0; i < tto->count; ++i) {
++ tto->total[i] += now - tto->start[i];
++ tto->start[i] = now; // restart the interval so the same time is not counted twice
++ }
++ // Only the first four nesting levels are printed; "Idle" is the elapsed time not covered by level 0
++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
++ prefix,
++ T_ARG(now - start0 - tto->total[0]),
++ T_ARG(tto->total[0]),
++ T_ARG(tto->total[1]),
++ T_ARG(tto->total[2]),
++ T_ARG(tto->total[3]));
++}
++
++
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++ av_assert0(tto->count < WAIT_COUNT_MAX);
++ tto->start[tto->count++] = now;
++}
++
++static void tto_end(trace_time_one_t * const tto, const int64_t now) // Close the innermost open interval and add it to that level's total
++{
++ const int n = --tto->count; // level being closed
++ av_assert0(n >= 0); // catches a tto_end() without a matching tto_start()
++ tto->total[n] += now - tto->start[n];
++}
++
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now) // Print the job count, total elapsed time and the active/wait breakdowns
++{
++    printf("Jobs:%u, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0)); // jcount is unsigned int, so %u
++    tto_print(&ttw->active, now, ttw->start0, "Active");
++    tto_print(&ttw->wait, now, ttw->start0, " Wait");
++}
++
++#endif
++
++// GPU memory alloc fns (internal)
++
++static void gpu_free_internal(GPU_MEM_PTR_T * const p) // Release whatever parts of *p were obtained; does nothing but zero an already-zeroed *p
++{
++ if (p->arm != NULL) // only unlock if the ARM mapping was obtained
++ vcsm_unlock_ptr(p->arm);
++ if (p->vcsm_handle != 0) // only free if the allocation succeeded
++ vcsm_free(p->vcsm_handle);
++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
++}
++
++
++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
++ const int numbytes, const unsigned int cache_type, const char * const name) // Returns 0 on success, AVERROR(ENOMEM) with *p zeroed on failure
++{
++ memset(p, 0, sizeof(*p));
++ p->numbytes = (numbytes + 255) & ~255; // Round up to a multiple of 256
++ // Allocate, fetch the VC handle, lock to get the ARM pointer, fetch the VC address; the first step to fail aborts the chain
++ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||
++ (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
++ (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
++ (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
++ {
++ gpu_free_internal(p); // undoes whichever steps succeeded and zeroes *p
++ return AVERROR(ENOMEM);
++ }
++ return 0;
++}
++
++
++// GPU init, free, lock, unlock
++
++static void gpu_term(void) // Tear down the global GPU environment; called from gpu_unlock_unref() when open_count reaches 0
++{
++ gpu_env_t * const ge = gpu;
++
++ // We have to hope that everything has terminated...
++ gpu = NULL;
++
++ vc_gpuserv_deinit(); // NOTE(review): vpu_qpu_term() also calls vc_gpuserv_deinit() - confirm the calls balance vc_gpuserv_init()
++
++ gpu_free_internal(&ge->code_gm_ptr);
++ gpu_free_internal(&ge->qpu_code_gm_ptr);
++ gpu_free_internal(&ge->dummy_gm_ptr);
++
++ vcsm_exit();
++
++ vq_wait_pool_deinit(&ge->wait_pool);
++
++ free(ge);
++}
++
++
++// Connect to QPU: returns 0 with *gpu set on success; on failure returns non-zero, sets *gpu to NULL and releases everything acquired
++static int gpu_init(gpu_env_t ** const gpu) {
++    volatile struct GPU* ptr;
++    gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++    int rv;
++    *gpu = NULL;
++
++    if (ge == NULL)
++        return -1;
++
++    vq_wait_pool_init(&ge->wait_pool);
++    vcsm_init();
++
++    // Now copy over the QPU code into GPU memory
++    if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
++        goto fail;
++    {
++        int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
++        av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
++        memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
++    }
++
++    // And the VPU code
++    if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
++        goto fail;
++    ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
++
++    // Zero everything so we have zeros between the code bits
++    memset((void *)ptr, 0, sizeof(*ptr));
++    av_assert0(sizeof(rpi_hevc_transform8) <= VPU_CODE_SIZE*sizeof(unsigned int));
++    memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, sizeof(rpi_hevc_transform8));
++    av_assert0(sizeof(rpi_hevc_transform10) <= VPU_CODE_SIZE*sizeof(unsigned int));
++    memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, sizeof(rpi_hevc_transform10));
++    // And the transform coefficients
++    memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
++
++    // Generate a dummy "frame" & fill with 0x80
++    // * Could reset to 1 <<bit_depth?
++    if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
++        goto fail;
++    memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
++
++    *gpu = ge;
++    return 0;
++
++fail: // Same teardown as gpu_term() minus gpuserv; gpu_free_internal() is harmless on still-zeroed (calloc'ed) entries
++    gpu_free_internal(&ge->code_gm_ptr);
++    gpu_free_internal(&ge->qpu_code_gm_ptr);
++    gpu_free_internal(&ge->dummy_gm_ptr);
++    vcsm_exit();
++    vq_wait_pool_deinit(&ge->wait_pool);
++    free(ge);
++    return rv;
++}
++
++
++
++static void gpu_unlock(void) { // Release gpu_mutex; the caller must currently hold it
++ pthread_mutex_unlock(&gpu_mutex);
++}
++
++// Take gpu_mutex and return the global GPU env. Does not create the env or take a reference.
++static gpu_env_t * gpu_lock(void) {
++ pthread_mutex_lock(&gpu_mutex);
++ // Only checked when av_assert1 is enabled; otherwise a NULL env is returned as-is
++ av_assert1(gpu != NULL);
++ return gpu;
++}
++
++static gpu_env_t * gpu_lock_ref(void) // Lock, create the env on first use, and take a reference
++{
++ pthread_mutex_lock(&gpu_mutex);
++ // On success the mutex is still HELD on return; on failure it has been released and NULL is returned
++ if (gpu == NULL) {
++ int rv = gpu_init(&gpu);
++ if (rv != 0) {
++ gpu_unlock(); // callers must not unlock again after a NULL return
++ return NULL;
++ }
++ }
++
++ ++gpu->open_count;
++ return gpu;
++}
++
++static void gpu_unlock_unref(gpu_env_t * const ge) // Drop one reference and release gpu_mutex; the caller must hold the mutex
++{
++ if (--ge->open_count == 0)
++ gpu_term(); // last reference gone: frees ge, which must not be touched after this
++
++ gpu_unlock();
++}
++
++static inline gpu_env_t * gpu_ptr(void) // Return the global env without taking gpu_mutex
++{
++ av_assert1(gpu != NULL);
++ return gpu;
++}
++
++unsigned int vpu_get_fn(const unsigned int bit_depth) { // VC address of the VPU code for bit_depth (8 or 10)
++ uint32_t a = 0;
++
++ // Make sure that the gpu is initialized
++ av_assert1(gpu != NULL);
++ switch (bit_depth){
++ case 8:
++ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
++ break;
++ case 10:
++ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
++ break;
++ default:
++ av_assert0(0); // any other bit depth is a caller bug
++ }
++ return a;
++}
++
++unsigned int vpu_get_constants(void) { // VC address of the transform coefficient table inside the VPU code allocation
++ av_assert1(gpu != NULL);
++ return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even));
++}
++
++void gpu_ref(void) // Take a reference on the GPU env, creating it on first use; a failed init is silently ignored
++{
++    if (gpu_lock_ref() != NULL) // on failure gpu_lock_ref() has already released the mutex
++        gpu_unlock();
++}
++
++void gpu_unref(void) // Drop one reference taken by gpu_ref(); the env is torn down when the count reaches 0
++{
++ gpu_env_t * const ge = gpu_lock();
++ gpu_unlock_unref(ge);
++}
++
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++static void vq_wait_pool_init(vq_wait_pool_t * const wp) // Init every semaphore to 0 and chain all entries onto the free list
++{
++    vq_wait_t * const first = wp->pool;
++    vq_wait_t * const last = first + VQ_WAIT_POOL_SIZE - 1;
++    for (vq_wait_t * w = first; w <= last; ++w) {
++        sem_init(&w->sem, 0, 0);
++        w->next = (w != last) ? w + 1 : NULL;  // the final entry terminates the list
++    }
++    wp->head = first;
++}
++
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp) // Destroy every semaphore and empty the free list
++{
++ unsigned int i;
++ wp->head = NULL;
++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++ sem_destroy(&wp->pool[i].sem);
++ wp->pool[i].next = NULL;
++ }
++}
++
++
++// Pop a wait object off the env's free list; takes an env reference that vq_wait_delete() later drops
++static vq_wait_t * vq_wait_new(void)
++{
++ gpu_env_t * const ge = gpu_lock_ref(); // NOTE(review): may return NULL (init failure) - dereferenced unchecked below
++ vq_wait_t * const wait = ge->wait_pool.head; // NOTE(review): NULL if all VQ_WAIT_POOL_SIZE entries are in use - also unchecked
++ ge->wait_pool.head = wait->next;
++ wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ tto_start(&ge->ttw.active, ns_time());
++#endif
++
++ gpu_unlock();
++ return wait;
++}
++
++static void vq_wait_delete(vq_wait_t * const wait) // Return a wait object to the free list and drop the reference taken by vq_wait_new()
++{
++ gpu_env_t * const ge = gpu_lock();
++ wait->next = ge->wait_pool.head; // push onto the head of the free list
++ ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ trace_time_wait_t * const ttw = &ge->ttw;
++ const int64_t now = ns_time();
++ ++ttw->jcount;
++ tto_end(&ttw->wait, now);
++
++ if (ttw->start0 == 0) // first completed job: anchor the statistics at the first active start time
++ {
++ ttw->start0 = ttw->active.start[0];
++ ttw->last_update = ttw->start0;
++ }
++ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD) // print at most once per period
++ {
++ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++ ttw_print(ttw, now);
++ }
++ }
++#endif
++ gpu_unlock_unref(ge); // may tear the env down if this was the last reference
++}
++
++static void vq_wait_wait(vq_wait_t * const wait) // Block until the wait object's semaphore is posted
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ const int64_t now = ns_time();
++ gpu_env_t * const ge = gpu_lock();
++ tto_start(&ge->ttw.wait, now);
++ gpu_unlock();
++ }
++#endif
++ // Retry if the wait is interrupted by a signal; any other sem_wait() error ends the loop
++ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++ /* loop */;
++}
++
++static void vq_wait_post(vq_wait_t * const wait) // Signal completion: wakes the thread blocked in vq_wait_wait()
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ gpu_env_t *const ge = gpu_lock();
++ tto_end(&ge->ttw.active, ns_time());
++ gpu_unlock();
++ }
++#endif
++ // Post last so the trace update above has finished before the waiter can run
++ sem_post(&wait->sem);
++}
++
++
++
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU 1
++#define VPU_QPU_MASK_VPU 2
++
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf) // Reset caller-provided storage to an empty job list and return it
++{
++// vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++ vpu_qpu_job_env_t * vqj = buf;
++// memset(vqj, 0, sizeof(*vqj));
++ vqj->n = 0; // no jobs queued
++ vqj->mask = 0; // no VPU/QPU work awaiting a sync
++ return vqj;
++}
++
++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj) // Currently a no-op: the env lives in caller-provided storage
++{
++// memset(vqj, 0, sizeof(*vqj));
++// free(vqj);
++}
++
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj) // Claim the next free slot in vqj->j
++{
++ struct gpu_job_s * const j = vqj->j + vqj->n++;
++ av_assert1(vqj->n <= VPU_QPU_JOB_MAX); // NOTE(review): overflow is only caught when av_assert1 is enabled
++ return j;
++}
++
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5) // Queue a VPU call of vpu_code with arguments r0..r5; does nothing if vpu_code is 0
++{
++ if (vpu_code != 0) {
++ struct gpu_job_s *const j = new_job(vqj);
++ vqj->mask |= VPU_QPU_MASK_VPU;
++
++ j->command = EXECUTE_VPU;
++ j->callback.func = 0;
++ j->callback.cookie = NULL;
++ // The bottom two bits of the execute address contain no-flush flags
++ // b0 will flush the VPU I-cache if unset so we nearly always want that set
++ // as we never reload code
++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; // NOTE(review): the global is read and written here without taking gpu_mutex
++ j->u.v.q[1] = r0;
++ j->u.v.q[2] = r1;
++ j->u.v.q[3] = r2;
++ j->u.v.q[4] = r3;
++ j->u.v.q[5] = r4;
++ j->u.v.q[6] = r5;
++ gpu->vpu_i_cache_flushed = 1; // later jobs set b0, i.e. skip the I-cache flush
++ }
++}
++
++// Queue n QPU jobs described by mail (QPU_MAIL_EL_VALS words each); the QPU_FLAGS_xxx noflush value is chosen here at compile time
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
++{
++ if (n != 0) {
++ struct gpu_job_s *const j = new_job(vqj);
++ vqj->mask |= VPU_QPU_MASK_QPU;
++
++ j->command = EXECUTE_QPU;
++ j->callback.func = 0;
++ j->callback.cookie = NULL;
++
++ j->u.q.jobs = n;
++#if RPI_TRACE_QPU_PROFILE_ALL
++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
++#else
++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
++#endif
++ j->u.q.timeout = 5000; // presumably milliseconds - TODO confirm against the gpuserv interface
++ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); // NOTE(review): n is not checked against the capacity of control[]
++ }
++}
++
++// Convert callback to sem post: v is the vq_wait_t * stored as the job's callback cookie
++static void vpu_qpu_job_callback_wait(void * v)
++{
++ vq_wait_post(v);
++}
++
++// Poke a user-supplied sem: v is the sem_t * stored as the job's callback cookie
++static void vpu_qpu_job_callback_sem(void * v)
++{
++ sem_post((sem_t *)v);
++}
++
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h) // Arrange for *wait_h to be signalled when everything queued so far completes
++{
++ vq_wait_t * wait;
++
++ if (vqj->mask == 0) { // nothing queued since the last sync: hand back a NULL handle
++ *wait_h = NULL;
++ return;
++ }
++
++ // We are going to want a sync object
++ wait = vq_wait_new();
++
++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++ // If we only posted one thing or only QPU jobs
++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++ {
++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); // attach the callback to the last job instead of adding a sync job
++ av_assert1(j->callback.func == 0);
++
++ j->callback.func = vpu_qpu_job_callback_wait;
++ j->callback.cookie = wait;
++ }
++ else
++ {
++ struct gpu_job_s *const j = new_job(vqj);
++
++ j->command = EXECUTE_SYNC;
++ j->u.s.mask = vqj->mask; // sync on whichever of VPU/QPU have work queued
++ j->callback.func = vpu_qpu_job_callback_wait;
++ j->callback.cookie = wait;
++ }
++
++ vqj->mask = 0; // everything queued so far is now covered by a sync
++ *wait_h = wait;
++}
++
++// Returns 0 if no sync added ('cos Q empty), 1 if sync added
++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem) // sem is posted once when the queued work completes
++{
++ // If nothing on q then just return
++ if (vqj->mask == 0)
++ return 0;
++
++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++ // If we only posted one thing or only QPU jobs
++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++ {
++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); // attach the callback to the last job instead of adding a sync job
++ av_assert1(j->callback.func == 0);
++
++ j->callback.func = vpu_qpu_job_callback_sem;
++ j->callback.cookie = sem;
++ }
++ else
++ {
++ struct gpu_job_s *const j = new_job(vqj);
++
++ j->command = EXECUTE_SYNC;
++ j->u.s.mask = vqj->mask; // sync on whichever of VPU/QPU have work queued
++ j->callback.func = vpu_qpu_job_callback_sem;
++ j->callback.cookie = sem;
++ }
++
++ vqj->mask = 0; // everything queued so far is now covered by a sync
++ return 1;
++}
++
++
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj) // Submit all queued jobs; returns 0 if there are none, else the vc_gpuserv_execute_code() result
++{
++ if (vqj->n == 0)
++ return 0;
++
++ return vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
++
++// Simple wrapper of start + delete; returns the result of vpu_qpu_job_start()
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++ int rv;
++ rv = vpu_qpu_job_start(vqj);
++ vpu_qpu_job_delete(vqj);
++ return rv;
++}
++
++// Block until the job tied to *wait_h has signalled, then return the wait object to the pool.
++// A NULL wait_h or a NULL *wait_h is a no-op; *wait_h is cleared before waiting.
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{
++    vq_wait_t * pending;
++
++    if (wait_h == NULL || (pending = *wait_h) == NULL)
++        return;
++    *wait_h = NULL;
++    vq_wait_wait(pending);
++    vq_wait_delete(pending);
++}
++
++// Take a GPU reference and start the gpuserv service on first use.
++// Returns 0 on success, -1 if the GPU environment could not be created.
++int vpu_qpu_init(void)
++{
++    gpu_env_t * const ge = gpu_lock_ref();
++    if (ge == NULL)
++        return -1;  // gpu_lock_ref() has already released the mutex
++
++    if (ge->init_count++ == 0)
++        vc_gpuserv_init();
++
++    gpu_unlock();
++    return 0;
++}
++
++// Undo one vpu_qpu_init(): stop gpuserv when the last user goes, then drop the GPU reference.
++void vpu_qpu_term(void)
++{
++    gpu_env_t * const ge = gpu_lock();
++
++    if (--ge->init_count == 0) {
++        vc_gpuserv_deinit();
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++        ttw_print(&ge->ttw, ns_time());
++#endif
++    }
++
++    gpu_unlock_unref(ge);  // may tear the env down; ge must not be used afterwards
++}
++
++uint32_t qpu_fn(const int * const mc_fn) // VC address of a shader entry point: its byte offset within ff_hevc_rpi_shader added to the QPU code base
++{
++ return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader);
++}
++
++uint32_t qpu_dummy(void) // VC address of the 0x80-filled dummy frame allocated in gpu_init()
++{
++ return gpu->dummy_gm_ptr.vc;
++}
++
++// Fill *qf with the QPU addresses of the motion-compensation filters for bit_depth; returns 0, or -1 if bit_depth is not 8 or 10.
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
++{
++    // Dummy values we can catch with emulation
++    qf->y_pxx = ~1U;
++    qf->y_bxx = ~2U;
++    qf->y_p00 = ~3U;
++    qf->y_b00 = ~4U;
++    qf->c_pxx = ~5U;
++    qf->c_bxx = ~6U;
++    qf->c_pxx_l1 = ~7U;  // so every field is defined even when bit_depth is unsupported
++    switch (bit_depth) {
++    case 8:
++        qf->y_pxx = qpu_fn(mc_filter_y_pxx);
++        qf->y_bxx = qpu_fn(mc_filter_y_bxx);
++        qf->y_p00 = qpu_fn(mc_filter_y_p00);
++        qf->y_b00 = qpu_fn(mc_filter_y_b00);
++        qf->c_pxx = qpu_fn(mc_filter_c_p);
++        qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
++        qf->c_bxx = qpu_fn(mc_filter_c_b);
++        break;
++    case 10:
++        qf->c_pxx = qpu_fn(mc_filter_c10_p);
++        qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
++        qf->c_bxx = qpu_fn(mc_filter_c10_b);
++        qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
++        qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
++        qf->y_p00 = qpu_fn(mc_filter_y10_p00);
++        qf->y_b00 = qpu_fn(mc_filter_y10_b00);
++        break;
++    default:
++        return -1;
++    }
++    return 0;
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_qpu.h
+@@ -0,0 +1,103 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_QPU_H
++#define RPI_QPU_H
++
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#pragma GCC diagnostic ignored "-Wstrict-prototypes"
++#include "interface/vmcs_host/vc_vchi_gpuserv.h" // for gpu_job_s
++#pragma GCC diagnostic pop
++
++// QPU specific functions
++
++typedef struct HEVCRpiQpu {
++ uint32_t c_pxx;
++ uint32_t c_pxx_l1;
++ uint32_t c_bxx;
++ uint32_t y_pxx;
++ uint32_t y_bxx;
++ uint32_t y_p00;
++ uint32_t y_b00;
++} HEVCRpiQpu;
++
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
++
++uint32_t qpu_fn(const int * const mc_fn);
++uint32_t qpu_dummy(void);
++
++#define QPU_N_GRP 4
++#define QPU_N_MAX 12
++
++#define QPU_MAIL_EL_VALS 2
++
++struct vpu_qpu_wait_s; // NOTE(review): this tag is not the one the handle typedef below uses (vq_wait_s) - looks like a leftover
++typedef struct vq_wait_s * vpu_qpu_wait_h; // Opaque handle to a wait object; NULL means nothing to wait for
++
++// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
++{
++ unsigned int n;
++ unsigned int mask;
++ struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++extern unsigned int vpu_get_fn(const unsigned int bit_depth);
++extern unsigned int vpu_get_constants(void);
++
++// Waits for the job associated with *wait_h to complete and nulls out *wait_h; a NULL wait_h or NULL *wait_h is a no-op
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
++
++void gpu_ref(void);
++void gpu_unref(void);
++
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_zc.c
+@@ -0,0 +1,1227 @@
++#include "config.h"
++
++#include "libavcodec/avcodec.h"
++#include "rpi_mem.h"
++#include "rpi_mailbox.h"
++#include "rpi_zc.h"
++#include "libavutil/avassert.h"
++#include <pthread.h>
++
++#include "libavutil/buffer_internal.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <interface/vctypes/vc_image_types.h>
++#include <interface/vcsm/user-vcsm.h>
++#pragma GCC diagnostic pop
++
++#define TRACE_ALLOC 0
++#define DEBUG_ALWAYS_KEEP_LOCKED 0
++
++struct ZcPoolEnt;
++
++typedef struct ZcPool
++{
++ size_t numbytes;
++ struct ZcPoolEnt * head;
++ pthread_mutex_t lock;
++} ZcPool;
++
++typedef struct ZcPoolEnt
++{
++ size_t numbytes;
++
++ unsigned int vcsm_handle;
++ unsigned int vc_handle;
++ void * map_arm;
++ unsigned int map_vc;
++
++ struct ZcPoolEnt * next;
++ struct ZcPool * pool;
++} ZcPoolEnt;
++
++typedef struct ZcOldCtxVals
++{
++ int thread_safe_callbacks;
++ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
++ void * opaque;
++} ZcOldCtxVals;
++
++typedef struct AVZcEnv
++{
++ unsigned int refcount;
++ ZcOldCtxVals old;
++
++ void * pool_env;
++ av_rpi_zc_alloc_buf_fn_t * alloc_buf;
++ av_rpi_zc_free_pool_fn_t * free_pool;
++
++ unsigned int pool_size;
++} ZcEnv;
++
++typedef struct ZcUserBufEnv {
++ void * v;
++ const av_rpi_zc_buf_fn_tab_t * fn;
++ size_t numbytes;
++ int offset;
++} ZcUserBufEnv;
++
++#define ZC_BUF_INVALID 0
++#define ZC_BUF_VALID 1
++#define ZC_BUF_NEVER 2
++
++typedef struct ZcBufEnv {
++ GPU_MEM_PTR_T gmem;
++ AVZcEnvPtr zc;
++ int is_valid;
++ AVBufferRef * user;
++ AVRpiZcFrameGeometry geo;
++ size_t size_y;
++ size_t size_c;
++ size_t size_pic;
++ ssize_t offset;
++ pthread_mutex_t lock;
++ pthread_cond_t cond;
++} ZcBufEnv;
++
++
++
++
++
++
++#define ALLOC_PAD 0
++#define ALLOC_ROUND 0x1000
++#define STRIDE_ROUND 64
++#define STRIDE_OR 0
++
++#define DEBUG_ZAP0_BUFFERS 0
++
++static inline int av_rpi_is_sand_format(const int format)  // 1 if format is in the SAND128..SAND64_16 range or is RPI4_8/RPI4_10, else 0
++{
++    const int in_sand_range = !(format < AV_PIX_FMT_SAND128 || format > AV_PIX_FMT_SAND64_16);
++    return in_sand_range || format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10;
++}
++
++// Convenience wrapper: applies av_rpi_is_sand_format() to frame->format
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++ return av_rpi_is_sand_format(frame->format);
++}
++
++//----------------------------------------------------------------------------
++//
++// Internal pool stuff
++
++// Pool entry functions
++
++// Allocate a pool entry backed by a new VCSM buffer.
++//
++// pool     - pool the entry will be returned to by zc_pool_put_ent()
++// req_size - size class requested by the pool; the VCSM allocation itself is
++//            (req_size + ALLOC_PAD) rounded up to a multiple of ALLOC_ROUND
++//
++// Returns the new entry, or NULL (after logging) if either the entry or the
++// VCSM buffer cannot be allocated.
++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size)
++{
++    ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt));
++
++    // Add ALLOC_PAD then round up to a multiple of ALLOC_ROUND (4k)
++    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
++
++    if (zp == NULL) {
++        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
++        goto fail0;
++    }
++
++    // The 0x80 here maps all pages here rather than waiting for lazy mapping
++    // BEWARE that in GPU land a later unlock/lock pair will put us back into
++    // lazy mode - which will also break cache invalidate calls.
++    if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0)
++    {
++        // alloc_size is unsigned, so it must be printed with %u
++        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%u) failed\n", alloc_size);
++        goto fail1;
++    }
++
++#if TRACE_ALLOC
++    printf("%s: Alloc %#x bytes @ h=%u\n", __func__, alloc_size, zp->vcsm_handle);
++#endif
++
++    // Record the size class, not the rounded allocation size.
++    // zc_pool_put_ent() recycles an entry only when zp->numbytes equals
++    // pool->numbytes, and the pool stores the unrounded request. Storing the
++    // rounded size here meant any request that was not already a multiple of
++    // ALLOC_ROUND could never be returned to the free list.
++    zp->numbytes = req_size;
++    zp->pool = pool;
++    return zp;
++
++fail1:
++    av_free(zp);
++fail0:
++    return NULL;
++}
++
++// Destroy a pool entry: release its VCSM allocation (if it has one) and free
++// the entry structure itself
++static void zc_pool_ent_free(ZcPoolEnt * const zp)
++{
++#if TRACE_ALLOC
++ printf("%s: Free %#x bytes @ h=%d\n", __func__, zp->numbytes, zp->vcsm_handle);
++#endif
++
++ if (zp->vcsm_handle != 0)
++ {
++ // VC addr & handle need no dealloc
++ // map_arm is only set by a successful vcsm_lock(), so undo that lock first
++ if (zp->map_arm != NULL)
++ vcsm_unlock_hdl(zp->vcsm_handle);
++ vcsm_free(zp->vcsm_handle);
++ }
++ av_free(zp);
++}
++
++//----------------------------------------------------------------------------
++//
++// Pool functions
++
++// Free every entry on the singly linked list starting at p (p may be NULL)
++static void zc_pool_free_ent_list(ZcPoolEnt * p)
++{
++    ZcPoolEnt * following;
++
++    for (; p != NULL; p = following)
++    {
++        // Fetch the link before the entry is destroyed
++        following = p->next;
++        zc_pool_ent_free(p);
++    }
++}
++
++// Detach and free every entry on the pool's free list and reset its size
++// class. Takes no lock itself.
++static void zc_pool_flush(ZcPool * const pool)
++{
++    ZcPoolEnt * const detached = pool->head;
++
++    pool->numbytes = ~0U;
++    pool->head = NULL;
++
++    zc_pool_free_ent_list(detached);
++}
++
++// Get an entry able to hold req_bytes, reusing a free one when the pool's
++// current size class is close enough, otherwise allocating a new one.
++// Returns NULL if a new entry is needed and zc_pool_ent_alloc() fails.
++static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes)
++{
++ ZcPoolEnt * zp = NULL;
++ ZcPoolEnt * flush_list = NULL;
++ size_t numbytes;
++
++ pthread_mutex_lock(&pool->lock);
++
++ numbytes = pool->numbytes;
++
++ // If size isn't close then dump the pool
++ // Close in this context means within 128k
++ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
++ {
++ // Switch the pool to the new size class; old entries are freed below
++ flush_list = pool->head;
++ pool->head = NULL;
++ pool->numbytes = numbytes = req_bytes;
++ }
++ else if (pool->head != NULL)
++ {
++ // Pop the first free entry
++ zp = pool->head;
++ pool->head = zp->next;
++ }
++
++ pthread_mutex_unlock(&pool->lock);
++
++ // Freeing and allocating are done after the lock has been dropped
++ zc_pool_free_ent_list(flush_list);
++
++ // New entries are sized for the pool's class (numbytes), not req_bytes
++ if (zp == NULL)
++ zp = zc_pool_ent_alloc(pool, numbytes);
++
++ return zp;
++}
++
++// Hand an entry back to its pool. It is pushed onto the free list if its size
++// still matches the pool's current size class, otherwise it is freed.
++// A NULL entry is ignored.
++static void zc_pool_put_ent(ZcPoolEnt * const zp)
++{
++    ZcPool * pool;
++    int keep;
++
++    if (zp == NULL)
++        return;
++
++    pool = zp->pool;
++
++    pthread_mutex_lock(&pool->lock);
++#if TRACE_ALLOC
++    printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->numbytes);
++#endif
++
++    keep = (pool->numbytes == zp->numbytes);
++    if (keep)
++    {
++        zp->next = pool->head;
++        pool->head = zp;
++    }
++    pthread_mutex_unlock(&pool->lock);
++
++    // The free happens outside the lock, as in the recycle-miss path
++    if (!keep)
++        zc_pool_ent_free(zp);
++}
++
++// Create an empty pool with no active size class.
++// Returns NULL if the allocation fails.
++static ZcPool *
++zc_pool_new(void)
++{
++    ZcPool * const fresh = av_mallocz(sizeof(*fresh));
++
++    if (fresh != NULL)
++    {
++        fresh->head = NULL;
++        fresh->numbytes = -1;
++        pthread_mutex_init(&fresh->lock, NULL);
++    }
++    return fresh;
++}
++
++// Free all entries still on the pool's free list, then the pool itself.
++// A NULL pool is ignored.
++static void
++zc_pool_delete(ZcPool * const pool)
++{
++    if (pool == NULL)
++        return;
++
++    pool->numbytes = -1;
++    zc_pool_flush(pool);
++    pthread_mutex_destroy(&pool->lock);
++    av_free(pool);
++}
++
++//============================================================================
++//
++// ZC implementation using above pool implementation
++//
++// Fn table fns...
++
++// fn-table "free" entry: v is the ZcPoolEnt to hand back to its pool
++static void zc_pool_free_v(void * v)
++{
++ zc_pool_put_ent(v);
++}
++
++// fn-table "vcsm_handle" entry: returns the entry's VCSM handle
++static unsigned int zc_pool_ent_vcsm_handle_v(void * v)
++{
++ ZcPoolEnt * zp = v;
++ return zp->vcsm_handle;
++}
++
++// fn-table "vc_handle" entry: returns the VC handle for the entry, looking it
++// up from the VCSM handle on first use. Returns 0 (after logging) on failure.
++static unsigned int zc_pool_ent_vc_handle_v(void * v)
++{
++    ZcPoolEnt * const ent = v;
++
++    // Already looked up
++    if (ent->vc_handle != 0)
++        return ent->vc_handle;
++
++    ent->vc_handle = vcsm_vc_hdl_from_hdl(ent->vcsm_handle);
++    if (ent->vc_handle == 0)
++        av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n",
++               __func__, ent->vcsm_handle);
++
++    return ent->vc_handle;
++}
++
++// fn-table "map_arm" entry: returns the ARM-side address of the entry,
++// obtaining it with vcsm_lock() on first use. Returns NULL (after logging)
++// on failure.
++static void * zc_pool_ent_map_arm_v(void * v)
++{
++    ZcPoolEnt * const ent = v;
++
++    // Already mapped
++    if (ent->map_arm != NULL)
++        return ent->map_arm;
++
++    ent->map_arm = vcsm_lock(ent->vcsm_handle);
++    if (ent->map_arm == NULL)
++        av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n",
++               __func__, ent->vcsm_handle);
++
++    return ent->map_arm;
++}
++
++// fn-table "map_vc" entry: returns the VC-side address of the entry, looking
++// it up from the VCSM handle on first use. Returns 0 (after logging) on
++// failure.
++static unsigned int zc_pool_ent_map_vc_v(void * v)
++{
++    ZcPoolEnt * const ent = v;
++
++    // Already looked up
++    if (ent->map_vc != 0)
++        return ent->map_vc;
++
++    ent->map_vc = vcsm_vc_addr_from_hdl(ent->vcsm_handle);
++    if (ent->map_vc == 0)
++        av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n",
++               __func__, ent->vcsm_handle);
++
++    return ent->map_vc;
++}
++
++// Function table passed to av_rpi_zc_buf() for buffers that come from the
++// internal pool; every entry receives the ZcPoolEnt as its void * argument
++static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = {
++ .free = zc_pool_free_v,
++ .vcsm_handle = zc_pool_ent_vcsm_handle_v,
++ .vc_handle = zc_pool_ent_vc_handle_v,
++ .map_arm = zc_pool_ent_map_arm_v,
++ .map_vc = zc_pool_ent_map_vc_v,
++};
++
++// ZC Env fns
++
++// Delete pool
++// All buffers guaranteed freed by now
++// Used as the free_pool callback of the environment built by
++// av_rpi_zc_int_env_alloc(); v is the ZcPool. The rpi_mem_gpu_uninit() call
++// pairs with the rpi_mem_gpu_init() made there.
++static void
++zc_pool_delete_v(void * v)
++{
++ zc_pool_delete((ZcPool *)v);
++ rpi_mem_gpu_uninit();
++}
++
++// Allocate a new ZC buffer
++//
++// alloc_buf callback of the internal pool allocator.
++//   v    - the ZcPool to take the entry from
++//   size - number of bytes required
++//   geo  - frame geometry (not used by this allocator)
++//
++// Returns an AVBufferRef wrapping a pool entry, or NULL (after logging) if
++// either the entry or the wrapper cannot be obtained.
++static AVBufferRef *
++zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo)
++{
++    ZcPool * const pool = v;
++    ZcPoolEnt *const zp = zc_pool_get_ent(pool, size);
++    AVBufferRef * buf;
++
++    (void)geo; // geo ignored here
++
++    if (zp == NULL) {
++        // size is a size_t: %d would read the wrong width on 64-bit builds
++        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%zu) failed\n", size);
++        goto fail0;
++    }
++
++    if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n");
++        goto fail2;
++    }
++
++    return buf;
++
++fail2:
++    // The wrapper was never created so its free callback will not run:
++    // return the entry to the pool by hand
++    zc_pool_put_ent(zp);
++fail0:
++    return NULL;
++}
++
++// Init wrappers - the public fns
++
++// Create a ZC environment that uses the internal VCSM pool allocator.
++// logctx is passed on to av_rpi_zc_env_alloc() for error logging.
++// Returns NULL if GPU memory init, pool creation or environment allocation
++// fails; anything set up before the failure is undone.
++AVZcEnvPtr
++av_rpi_zc_int_env_alloc(void * logctx)
++{
++ ZcEnv * zc;
++ ZcPool * pool_env;
++
++ if (rpi_mem_gpu_init(0) < 0)
++ return NULL;
++
++ if ((pool_env = zc_pool_new()) == NULL)
++ goto fail1;
++
++ if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL)
++ goto fail2;
++
++ return zc;
++
++fail2:
++ // zc_pool_delete() rather than zc_pool_delete_v(): the uninit is done at fail1
++ zc_pool_delete(pool_env);
++fail1:
++ rpi_mem_gpu_uninit();
++ return NULL;
++}
++
++// Release the caller's reference on *zcp and set *zcp to NULL.
++// zcp itself must not be NULL; *zcp may be.
++void
++av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp)
++{
++    const AVZcEnvPtr env = *zcp;
++
++    *zcp = NULL;
++    if (env == NULL)
++        return;
++
++    av_rpi_zc_env_release(env);
++}
++
++//============================================================================
++//
++// Geometry
++//
++// This is a separate chunk from the rest
++
++// Get mailbox fd - should be in a lock when called
++// Rely on process close to close it
++// The descriptor is cached in a function-static; while it is still -1
++// (never opened, or mbox_open() returned -1) the open is attempted again.
++static int mbox_fd(void)
++{
++    static int fd = -1;
++
++    if (fd == -1)
++        fd = mbox_open();
++
++    return fd;
++}
++
++// Work out the buffer geometry (strides, plane heights, stripe count) used to
++// hold a video_width x video_height picture in the given pixel format.
++//
++// YUV420P / YUV420P10 geometry is computed locally. For the SAND / RPI4
++// formats the layout comes from mbox_get_image_params(); the latest answer
++// for each format group is kept in a function-static VC_IMAGE_T and reused
++// while width and height are unchanged. Any other format returns a geometry
++// with only format, video_width and video_height set (the rest is zero).
++//
++// Every mailbox query and cache access is serialised by the one
++// function-level sand_lock. mbox_fd() lazily initialises a shared static
++// descriptor, so it must always be entered under the same mutex; per-case
++// locks would let two formats race on that initialisation.
++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++    const int format, const unsigned int video_width, const unsigned int video_height)
++{
++    static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
++
++    AVRpiZcFrameGeometry geo = {
++        .format = format,
++        .video_width = video_width,
++        .video_height = video_height
++    };
++
++    switch (format)
++    {
++        case AV_PIX_FMT_YUV420P:
++            // 32 extra pels / lines, then round the stride to STRIDE_ROUND and the height to 32
++            geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++            geo.stride_c = geo.stride_y / 2;
++            geo.height_y = (video_height + 32 + 31) & ~31;
++            geo.height_c = geo.height_y / 2;
++            geo.planes_c = 2;
++            geo.stripes = 1;
++            geo.bytes_per_pel = 1;
++            geo.stripe_is_yc = 1;
++            break;
++
++        case AV_PIX_FMT_YUV420P10:
++            // As YUV420P but with 2 bytes per pel
++            geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++            geo.stride_c = geo.stride_y / 2;
++            geo.height_y = (video_height + 32 + 31) & ~31;
++            geo.height_c = geo.height_y / 2;
++            geo.planes_c = 2;
++            geo.stripes = 1;
++            geo.bytes_per_pel = 2;
++            geo.stripe_is_yc = 1;
++            break;
++
++        case AV_PIX_FMT_SAND128:
++        case AV_PIX_FMT_RPI4_8:
++        {
++            const unsigned int stripe_w = 128;
++
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV_UV,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                mbox_get_image_params(mbox_fd(), &new_img);
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.stripe_is_yc = 1;
++            if (geo.height_y * stripe_w > img.pitch)
++            {
++                // "tall" sand - all C blocks now follow Y
++                geo.height_y = img.pitch / stripe_w;
++                geo.height_c = geo.height_y;
++                geo.stripe_is_yc = 0;
++            }
++            geo.planes_c = 1;
++            geo.stripes = (video_width + stripe_w - 1) / stripe_w;
++            geo.bytes_per_pel = 1;
++
++            pthread_mutex_unlock(&sand_lock);
++#if 0
++            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
++                   video_width, video_height,
++                   geo.stride_y, geo.stride_c,
++                   geo.height_y, geo.height_c,
++                   geo.stripes, img.pitch);
++#endif
++            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
++            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
++            break;
++        }
++
++        case AV_PIX_FMT_RPI4_10:
++        {
++            const unsigned int stripe_w = 128; // bytes
++
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV10COL,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                mbox_get_image_params(mbox_fd(), &new_img);
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.planes_c = 1;
++            // 3 pels are packed into 4 bytes
++            geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w;
++            geo.bytes_per_pel = 1;
++            geo.stripe_is_yc = 1;
++
++            pthread_mutex_unlock(&sand_lock);
++
++#if 0
++            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
++                   video_width, video_height,
++                   geo.stride_y, geo.stride_c,
++                   geo.height_y, geo.height_c,
++                   geo.stripes, img.pitch);
++#endif
++            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
++            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
++            break;
++        }
++
++        case AV_PIX_FMT_SAND64_16:
++        case AV_PIX_FMT_SAND64_10:
++        {
++            const unsigned int stripe_w = 128; // bytes
++
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV_UV_16,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                mbox_get_image_params(mbox_fd(), &new_img);
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.planes_c = 1;
++            geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
++            geo.bytes_per_pel = 2;
++            geo.stripe_is_yc = 1;
++
++            pthread_mutex_unlock(&sand_lock);
++            break;
++        }
++
++        default:
++            break;
++    }
++    return geo;
++}
++
++//============================================================================
++//
++// ZC Env fns
++//
++// Frame copy fns
++
++// Copy the three planes of src into a newly allocated ZC buffer.
++//
++// The destination takes its format, width and height from src. Each luma row
++// copies `width` bytes and each of the height/2 chroma rows copies width/2
++// bytes from data[1] and data[2].
++// NOTE(review): this row arithmetic matches 8-bit planar 4:2:0 only - confirm
++// that callers never reach this with another layout.
++//
++// Returns the new buffer reference (owned by the caller), or NULL if the
++// buffer cannot be created or resolved.
++static AVBufferRef * zc_copy(const AVZcEnvPtr zc,
++    const AVFrame * const src)
++{
++    AVFrame dest_frame;
++    AVFrame * const dest = &dest_frame;
++    unsigned int i;
++    uint8_t * psrc, * pdest;
++
++    dest->format = src->format;
++    dest->width = src->width;
++    dest->height = src->height;
++
++    if (av_rpi_zc_get_buffer(zc, dest) != 0 ||
++        av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0)
++    {
++        // If get_buffer succeeded but resolve failed, buf[0] still holds a
++        // reference (and a count on zc) that nobody else will drop.
++        // If get_buffer itself failed buf[0] is NULL and this is a no-op.
++        av_buffer_unref(&dest->buf[0]);
++        return NULL;
++    }
++
++    for (i = 0, psrc = src->data[0], pdest = dest->data[0];
++         i != dest->height;
++         ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
++    {
++        memcpy(pdest, psrc, dest->width);
++    }
++    for (i = 0, psrc = src->data[1], pdest = dest->data[1];
++         i != dest->height / 2;
++         ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
++    {
++        memcpy(pdest, psrc, dest->width / 2);
++    }
++    for (i = 0, psrc = src->data[2], pdest = dest->data[2];
++         i != dest->height / 2;
++         ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
++    {
++        memcpy(pdest, psrc, dest->width / 2);
++    }
++
++    // Only the buffer reference leaves this function; dest_frame is a scratch frame
++    return dest->buf[0];
++}
++
++
++// Placeholder: the conversion is not implemented. Trips assert(0) and
++// otherwise returns NULL.
++static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc,
++ const AVFrame * const src)
++{
++ assert(0);
++ return NULL;
++}
++
++
++// Placeholder: the conversion is not implemented. Trips assert(0) and
++// otherwise returns NULL.
++static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc,
++ const AVFrame * const src, const unsigned int src_bits)
++{
++ assert(0);
++ return NULL;
++}
++
++//----------------------------------------------------------------------------
++//
++// Public info extraction calls
++
++static void zc_buf_env_free_cb(void * opaque, uint8_t * data);
++
++// Return the ZcBufEnv behind buf, or NULL if buf is NULL or is not a buffer
++// created by this file
++static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf)
++{
++    // Kludge where we check the free fn to check this is really
++    // one of our buffers - can't think of a better way
++    if (buf == NULL || buf->buffer->free != zc_buf_env_free_cb)
++        return NULL;
++
++    return av_buffer_get_opaque(buf);
++}
++
++// Return the GPU memory descriptor of a ZC buffer; callers compare the
++// result against NULL to detect buffers that are not ours
++static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
++{
++ // As gmem is the first el NULL should be preserved
++ return &pic_zbe_ptr(buf)->gmem;
++}
++
++// VCSM handle recorded for the buffer, or 0 if fr_ref is not a ZC buffer
++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref)
++{
++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++ return p == NULL ? 0 : p->vcsm_handle;
++}
++
++// VC handle recorded for the buffer, or -1 if fr_ref is not a ZC buffer
++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
++{
++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++ return p == NULL ? -1 : p->vc_handle;
++}
++
++// Offset recorded for the buffer, or 0 if fr_ref is not a ZC buffer
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
++{
++ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
++ return zbe == NULL ? 0 : zbe->offset;
++}
++
++// Picture size in bytes (size_pic), or 0 if fr_ref is not a ZC buffer
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
++{
++ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
++ return zbe == NULL ? 0 : zbe->size_pic;
++}
++
++// Size recorded for the backing allocation (gmem.numbytes), or 0 if fr_ref
++// is not a ZC buffer
++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
++{
++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++ return p == NULL ? 0 : p->numbytes;
++}
++
++// Geometry stored with the buffer, or NULL if fr_ref is not a ZC buffer.
++// The pointer refers to storage inside the buffer's ZcBufEnv.
++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref)
++{
++ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
++ return zbe == NULL ? NULL : &zbe->geo;
++}
++
++// Obtain a ZC buffer reference for frame.
++//
++// logctx          - context used for av_log() messages
++// zc              - environment used for copies; must be non-NULL if maycopy
++// frame           - source frame
++// expected_format - pixel format the caller wants
++// maycopy         - if non-zero a frame that cannot be referenced directly
++//                   is copied / converted into a new ZC buffer instead
++//
++// Returns a new reference (release with av_rpi_zc_unref()), or NULL if the
++// format is unsupported, or the frame cannot be referenced and either
++// maycopy is 0 or the copy fails.
++AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc,
++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
++{
++ av_assert0(!maycopy || zc != NULL);
++
++ // Only YUV420P, YUV420P10 and the sand formats are handled at all
++ if (frame->format != AV_PIX_FMT_YUV420P &&
++ frame->format != AV_PIX_FMT_YUV420P10 &&
++ !av_rpi_is_sand_frame(frame))
++ {
++ av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
++ return NULL;
++ }
++
++ // A frame with more than one buffer, or in the wrong format, cannot be
++ // referenced directly
++ if (frame->buf[1] != NULL || frame->format != expected_format)
++ {
++#if RPI_ZC_SAND_8_IN_10_BUF
++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
++ {
++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
++ }
++#endif
++
++ if (maycopy)
++ {
++ if (frame->buf[1] != NULL)
++ av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
++ else
++ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
++
++ // The converter is chosen from the source format only
++ switch (frame->format)
++ {
++ case AV_PIX_FMT_YUV420P10:
++ return zc_420p10_to_sand128(zc, frame);
++
++ case AV_PIX_FMT_SAND64_10:
++ return zc_sand64_16_to_sand128(zc, frame, 10);
++
++ default:
++ return zc_copy(zc, frame);
++ }
++ }
++ else
++ {
++ if (frame->buf[1] != NULL)
++ av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
++ else
++ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
++ return NULL;
++ }
++ }
++
++ // Single buffer in the right format, but not allocated by this code
++ if (pic_gm_ptr(frame->buf[0]) == NULL)
++ {
++ if (maycopy)
++ {
++ av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
++ return zc_copy(zc, frame);
++ }
++ else
++ {
++ av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
++ return NULL;
++ }
++ }
++
++ // One of ours: just take another reference
++ return av_buffer_ref(frame->buf[0]);
++}
++
++// Drop a reference obtained from av_rpi_zc_ref(); NULL is ignored.
++// fr_ref is passed by value so the caller's own pointer is not cleared.
++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
++{
++ if (fr_ref != NULL)
++ {
++ av_buffer_unref(&fr_ref);
++ }
++}
++
++//----------------------------------------------------------------------------
++
++// Extract user environment from an AVBufferRef
++// Returns the `v` given to av_rpi_zc_buf() for the backing allocation, or
++// NULL if buf is not a ZC buffer or has no backing allocation yet
++void * av_rpi_zc_buf_v(AVBufferRef * const buf)
++{
++    const ZcBufEnv * const zbe = pic_zbe_ptr(buf);
++    const ZcUserBufEnv * user_env;
++
++    if (zbe == NULL || zbe->user == NULL)
++        return NULL;
++
++    user_env = (const ZcUserBufEnv *)zbe->user->data;
++    if (user_env == NULL)
++        return NULL;
++
++    return user_env->v;
++}
++
++// AV buffer pre-free callback
++// Runs when the last reference to a buffer made by av_rpi_zc_buf() goes:
++// calls the allocator's free entry (if it has one) and frees the wrapper.
++// data is unused - opaque points at the same ZcUserBufEnv.
++static void zc_user_buf_free_cb(void * opaque, uint8_t * data)
++{
++    ZcUserBufEnv * const env = opaque;
++
++    if (env == NULL)
++        return;
++
++    if (env->fn->free)
++        env->fn->free(env->v);
++
++    av_free(env);
++}
++
++// AV buffer free callback for the per-frame ZcBufEnv: drops the backing
++// allocation, releases the reference held on the environment, destroys the
++// synchronisation objects and frees the structure.
++// data is unused - opaque points at the same ZcBufEnv.
++static void zc_buf_env_free_cb(void * opaque, uint8_t * data)
++{
++    ZcBufEnv * const env = opaque;
++
++    if (env == NULL)
++        return;
++
++    av_buffer_unref(&env->user);
++
++    if (env->zc != NULL)
++        av_rpi_zc_env_release(env->zc);
++
++    pthread_cond_destroy(&env->cond);
++    pthread_mutex_destroy(&env->lock);
++    av_free(env);
++}
++
++
++// Wrap an allocator's buffer object in an AVBufferRef.
++//
++// numbytes    - size of the allocation
++// addr_offset - offset recorded with the buffer
++// v           - allocator-private buffer object, passed to each fn_tab entry
++// fn_tab      - accessors / free callback for v; the pointer is stored, not copied
++//
++// Returns the new AVBufferRef, or NULL on allocation failure. On failure
++// fn_tab->free is NOT called - v remains the caller's to dispose of.
++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab)
++{
++ AVBufferRef *buf;
++ ZcUserBufEnv * zub;
++
++ if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL)
++ return NULL;
++
++ zub->fn = fn_tab;
++ zub->v = v;
++ zub->numbytes = numbytes;
++ zub->offset = addr_offset;
++
++ // zub is both the buffer's data and the opaque given to the free callback
++ if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL)
++ {
++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n");
++ av_free(zub);
++ return NULL;
++ }
++
++ return buf;
++}
++
++// Make sure the ZC buffer behind buf has backing memory and that its
++// data / size fields point at it.
++//
++// alloc_mode:
++//   ZC_RESOLVE_FAIL        - return AVERROR(EAGAIN) if not yet marked valid
++//   ZC_RESOLVE_WAIT_VALID  - block until the buffer is marked valid or broken
++//   ZC_RESOLVE_ALLOC       - allocate backing memory if there is none
++//   ZC_RESOLVE_ALLOC_VALID - as ALLOC, and mark the buffer valid when it allocates
++//
++// Returns 0 on success, AVERROR(EINVAL) if buf is not a ZC buffer or has been
++// marked broken, AVERROR(ENOMEM) if allocating or mapping the memory fails.
++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
++
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid)
++        return AVERROR(EAGAIN);
++
++    if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid)
++    {
++        pthread_mutex_lock(&zbe->lock);
++        while (!zbe->is_valid)
++            pthread_cond_wait(&zbe->cond, &zbe->lock);
++        pthread_mutex_unlock(&zbe->lock);
++    }
++
++    if (zbe->is_valid == ZC_BUF_NEVER)
++        return AVERROR(EINVAL);
++
++    // Do alloc if we need it
++    if (zbe->user == NULL)
++    {
++        ZcEnv * const zc = zbe->zc;
++        const ZcUserBufEnv * zub;
++
++        av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID);
++
++        if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL)
++        {
++            av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
++            goto fail;
++        }
++        zub = (const ZcUserBufEnv *)zbe->user->data;
++
++        // Track
++
++        zbe->offset = zub->offset;
++        zbe->gmem.numbytes = zub->numbytes;
++        if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL)
++        {
++            // gmem.vcsm_handle has not been filled in yet, so ask the
++            // allocator for the handle instead of printing a stale value
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zub->fn->vcsm_handle(zub->v));
++            goto fail;
++        }
++
++        if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n");
++            goto fail;
++        }
++
++        if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
++            goto fail;
++        }
++        if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
++            goto fail;
++        }
++
++        // Point the underlying AVBuffer at the picture memory as well
++        buf->buffer->data = zbe->gmem.arm + zbe->offset;
++        buf->buffer->size = zbe->size_pic;
++
++        // In this mode we shouldn't have anyone waiting for us
++        // so no need to signal
++        if (alloc_mode == ZC_RESOLVE_ALLOC_VALID)
++            zbe->is_valid = ZC_BUF_VALID;
++    }
++
++    // Just overwrite - no point in testing
++    buf->data = zbe->gmem.arm + zbe->offset;
++    buf->size = zbe->size_pic;
++    return 0;
++
++fail:
++    // Drops the backing allocation (if any) and leaves zbe->user NULL
++    av_buffer_unref(&zbe->user);
++    return AVERROR(ENOMEM);
++}
++
++// Resolve frame->buf[0] (see av_rpi_zc_resolve_buffer() for may_alloc) and,
++// if the frame's plane pointers have not been set yet, fill in data[],
++// linesize[] and extended_data from the buffer's geometry.
++// Returns 0 on success or the error from av_rpi_zc_resolve_buffer().
++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc)
++{
++ int rv;
++
++ // Do alloc if we need it
++ if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0)
++ return rv;
++
++ // If we are a framebuf copy then the alloc can be done but we haven't
++ // imported its results yet
++ if (frame->data[0] == NULL)
++ {
++ // Cannot be NULL here: av_rpi_zc_resolve_buffer() rejects non-ZC buffers
++ const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++
++ frame->linesize[0] = zbe->geo.stride_y;
++ frame->linesize[1] = zbe->geo.stride_c;
++ frame->linesize[2] = zbe->geo.stride_c;
++ // abuse: linesize[3] = "stripe stride"
++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
++ // In a general case this makes the calculation an xor and multiply rather
++ // than a divide and multiply
++ if (zbe->geo.stripes > 1)
++ frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y;
++
++ frame->data[0] = frame->buf[0]->data;
++ // Chroma follows the luma of one stripe, or of all stripes when Y and C are not interleaved per stripe
++ frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? zbe->size_y : zbe->size_y * zbe->geo.stripes);
++ if (zbe->geo.planes_c > 1)
++ frame->data[2] = frame->data[1] + zbe->size_c;
++
++ frame->extended_data = frame->data;
++ // Leave extended buf alone
++ }
++
++ return 0;
++}
++
++// Mark the frame's ZC buffer as holding valid data and wake every thread
++// blocked on it in av_rpi_zc_resolve_buffer(..., ZC_RESOLVE_WAIT_VALID).
++// Returns 0, or AVERROR(EINVAL) if frame->buf[0] is not a ZC buffer.
++int av_rpi_zc_set_valid_frame(AVFrame * const frame)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    // Change the predicate with the waiters' mutex held. Without it the
++    // store and broadcast can land between a waiter's test of is_valid and
++    // its pthread_cond_wait(), and that waiter would then sleep forever.
++    pthread_mutex_lock(&zbe->lock);
++    zbe->is_valid = ZC_BUF_VALID;
++    pthread_cond_broadcast(&zbe->cond);
++    pthread_mutex_unlock(&zbe->lock);
++
++    return 0;
++}
++
++// Mark the frame's ZC buffer as one that will never become valid and wake
++// every thread blocked on it in
++// av_rpi_zc_resolve_buffer(..., ZC_RESOLVE_WAIT_VALID); those calls then
++// fail with AVERROR(EINVAL).
++// Returns 0, or AVERROR(EINVAL) if frame->buf[0] is not a ZC buffer.
++int av_rpi_zc_set_broken_frame(AVFrame * const frame)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    // Change the predicate with the waiters' mutex held. Without it the
++    // store and broadcast can land between a waiter's test of is_valid and
++    // its pthread_cond_wait(), and that waiter would then sleep forever.
++    pthread_mutex_lock(&zbe->lock);
++    zbe->is_valid = ZC_BUF_NEVER;
++    pthread_cond_broadcast(&zbe->cond);
++    pthread_mutex_unlock(&zbe->lock);
++
++    return 0;
++}
++
++// Store pool_size in the environment; nothing in this file acts on the value
++void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size)
++{
++ zc->pool_size = pool_size;
++}
++
++// Return the value last stored by av_rpi_zc_set_decoder_pool_size()
++// (0 for a new environment)
++unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc)
++{
++ return zc->pool_size;
++}
++
++// Attach a new, not yet allocated, ZC buffer to frame.
++//
++// frame->format, width and height must be set on entry; every buf[], data[]
++// and linesize[] slot is cleared. On success buf[0] holds a ZcBufEnv
++// carrying the computed geometry and a reference on zc; the memory itself is
++// allocated later by av_rpi_zc_resolve_buffer() / av_rpi_zc_resolve_frame().
++//
++// Returns 0 on success or AVERROR(ENOMEM).
++int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame)
++{
++#if 1
++ ZcBufEnv * zbe = av_mallocz(sizeof(*zbe));
++
++ for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) {
++ frame->buf[i] = NULL;
++ frame->data[i] = NULL;
++ frame->linesize[i] = 0;
++ }
++
++ if (zbe == NULL)
++ return AVERROR(ENOMEM);
++
++ // zbe is both the buffer's data and the opaque given to the free callback
++ if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL)
++ {
++ av_free(zbe);
++ return AVERROR(ENOMEM);
++ }
++
++ pthread_mutex_init(&zbe->lock, NULL);
++ pthread_cond_init(&zbe->cond, NULL);
++ // This reference is released by zc_buf_env_free_cb()
++ zbe->zc = zc;
++ atomic_fetch_add(&zc->refcount, 1);
++
++ zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); // Note geometry for later use
++ zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y;
++ zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c;
++ zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes;
++
++#else
++ // Disabled alternative that allocates immediately - never compiled
++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
++ const unsigned int size_y = geo.stride_y * geo.height_y;
++ const unsigned int size_c = geo.stride_c * geo.height_c;
++ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
++ AVBufferRef * buf;
++ unsigned int i;
++
++// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
++
++ if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL)
++ {
++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
++ return AVERROR(ENOMEM);
++ }
++
++ // Track
++ atomic_fetch_add(&zc->refcount, 1);
++ pic_zbe_ptr(buf)->zc = zc;
++
++ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
++ frame->buf[i] = NULL;
++ frame->data[i] = NULL;
++ frame->linesize[i] = 0;
++ }
++
++ frame->buf[0] = buf;
++
++ frame->linesize[0] = geo.stride_y;
++ frame->linesize[1] = geo.stride_c;
++ frame->linesize[2] = geo.stride_c;
++ // abuse: linesize[3] = "stripe stride"
++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
++ // In a general case this makes the calculation an xor and multiply rather
++ // than a divide and multiply
++ if (geo.stripes > 1)
++ frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y;
++
++ frame->data[0] = buf->data;
++ frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? size_y : size_y * geo.stripes);
++ if (geo.planes_c > 1)
++ frame->data[2] = frame->data[1] + size_c;
++
++ frame->extended_data = frame->data;
++ // Leave extended buf alone
++
++#if RPI_ZC_SAND_8_IN_10_BUF != 0
++ // *** If we intend to use this for real we will want a 2nd buffer pool
++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge
++#endif
++#endif
++
++ return 0;
++}
++
++// Drop one reference on zc. When the last reference goes the allocator's
++// free_pool callback is called with pool_env and the environment is freed.
++void av_rpi_zc_env_release(const AVZcEnvPtr zc)
++{
++    // atomic_fetch_add() yields the count as it was before the decrement
++    if (atomic_fetch_add(&zc->refcount, -1) != 1)
++        return;
++
++    zc->free_pool(zc->pool_env);
++    av_free(zc);
++}
++
++// Create a ZC environment around a caller-supplied allocator.
++//
++// logctx       - context used for the failure log message
++// pool_env     - opaque allocator state passed to both callbacks
++// alloc_buf_fn - called to obtain backing memory for a frame
++// free_pool_fn - called with pool_env when the last reference is released
++//
++// Returns the environment with a reference count of 1, or NULL if it cannot
++// be allocated (free_pool_fn is not called in that case).
++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
++ void * pool_env,
++ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++ av_rpi_zc_free_pool_fn_t * free_pool_fn)
++{
++ ZcEnv * zc;
++
++ if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL)
++ {
++ av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
++ return NULL;
++ }
++
++ // Members not named here (old) are zero-initialised by the compound literal
++ *zc = (ZcEnv){
++ .refcount = ATOMIC_VAR_INIT(1),
++ .pool_env = pool_env,
++ .alloc_buf = alloc_buf_fn,
++ .free_pool = free_pool_fn,
++ .pool_size = 0
++ };
++
++ return zc;
++}
++
++//============================================================================
++//
++// External ZC initialisation
++
++#define RPI_GET_BUFFER2 1
++
++
++// get_buffer2 callback installed by av_rpi_zc_init2().
++// Uses a ZC buffer when the codec has AV_CODEC_CAP_DR1 and the frame is
++// YUV420P or a sand format; every other case is passed to
++// avcodec_default_get_buffer2(). s->opaque must be the ZcEnv.
++static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
++{
++#if !RPI_GET_BUFFER2
++ return avcodec_default_get_buffer2(s, frame, flags);
++#else
++ int rv;
++
++ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
++ {
++// printf("Do default alloc: format=%#x\n", frame->format);
++ rv = avcodec_default_get_buffer2(s, frame, flags);
++ }
++ else if (frame->format == AV_PIX_FMT_YUV420P ||
++ av_rpi_is_sand_frame(frame))
++ {
++ // Create the buffer, then allocate it and mark it valid straight away
++ if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0)
++ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);
++ }
++ else
++ {
++ rv = avcodec_default_get_buffer2(s, frame, flags);
++ }
++
++#if 0
++ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
++ frame->format, frame->width, frame->height,
++ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
++ frame->data[0], frame->data[1], frame->data[2],
++ frame->buf[0], frame->buf[1], frame->buf[2],
++ av_buffer_get_opaque(frame->buf[0]));
++#endif
++ return rv;
++#endif
++}
++
++// Non-zero if ZC is installed on s, i.e. its get_buffer2 is zc_get_buffer2
++int av_rpi_zc_in_use(const struct AVCodecContext * const s)
++{
++ return s->get_buffer2 == zc_get_buffer2;
++}
++
++// Install ZC buffer allocation on a codec context using the given allocator.
++// The context's previous opaque, get_buffer2 and thread_safe_callbacks are
++// saved so that av_rpi_zc_uninit2() can put them back. ZC must not already
++// be installed on s (asserted).
++// Returns 0 on success or AVERROR(ENOMEM).
++int av_rpi_zc_init2(struct AVCodecContext * const s,
++ void * pool_env,
++ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++ av_rpi_zc_free_pool_fn_t * free_pool_fn)
++{
++ ZcEnv * zc;
++
++ av_assert0(!av_rpi_zc_in_use(s));
++
++ if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL)
++ return AVERROR(ENOMEM);
++
++ zc->old = (ZcOldCtxVals){
++ .opaque = s->opaque,
++ .get_buffer2 = s->get_buffer2,
++ .thread_safe_callbacks = s->thread_safe_callbacks
++ };
++
++ // zc_get_buffer2() finds the environment through s->opaque
++ s->opaque = zc;
++ s->get_buffer2 = zc_get_buffer2;
++ s->thread_safe_callbacks = 1;
++ return 0;
++}
++
++// Undo av_rpi_zc_init2(): restore the saved context fields and drop the
++// context's reference on the environment. ZC must be installed on s
++// (asserted). The environment is only freed once every buffer that still
++// references it has been released.
++void av_rpi_zc_uninit2(struct AVCodecContext * const s)
++{
++ ZcEnv * const zc = s->opaque;
++
++ av_assert0(av_rpi_zc_in_use(s));
++
++ s->get_buffer2 = zc->old.get_buffer2;
++ s->opaque = zc->old.opaque;
++ s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
++
++ av_rpi_zc_env_release(zc);
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_zc.h
+@@ -0,0 +1,228 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef LIBAVCODEC_RPI_ZC_H
++#define LIBAVCODEC_RPI_ZC_H
++
++// Zero-Copy frame code for RPi
++// RPi needs Y/U/V planes to be contiguous for display. By default
++// ffmpeg will allocate separated planes so a memcpy is needed before
++// display. This code provides a method of making ffmpeg allocate a single
++// bit of memory for the frame which can then be reference counted until
++// display has finished with it.
++
++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
++// 0 disables
++// *** This option still in development
++// Only works if SAO active
++// Allocates buffers that are twice the required size
++#define RPI_ZC_SAND_8_IN_10_BUF 0
++
++struct AVBufferRef;
++struct AVFrame;
++struct AVCodecContext;
++enum AVPixelFormat;
++
++// "Opaque" pointer to whatever we are using as a buffer reference
++typedef struct AVBufferRef * AVRpiZcRefPtr;
++
++struct AVZcEnv;
++typedef struct AVZcEnv * AVZcEnvPtr;
++
++typedef struct AVRpiZcFrameGeometry
++{
++ unsigned int stride_y; // Luma stride (bytes)
++ unsigned int height_y; // Luma height (lines)
++ unsigned int stride_c; // Chroma stride (bytes)
++ unsigned int height_c; // Chroma height (lines)
++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1)
++ unsigned int stripes; // Number of stripes (sand)
++ unsigned int bytes_per_pel;
++ int stripe_is_yc; // A single stripe is Y then C (false for tall sand)
++
++ int format; // Requested format
++ unsigned int video_width; // Requested width
++ unsigned int video_height; // Requested height
++} AVRpiZcFrameGeometry;
++
++// Get expected MMAL geometry for a given format, width & height
++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++ const int format,
++ const unsigned int video_width, const unsigned int video_height);
++
++//----------------------------------------------------------------------------
++//
++// Calls that extract info from a ZC frame whether internally or externally
++// allocated
++
++// Generate a ZC reference to the buffer(s) in this frame
++// If the buffer doesn't appear to be one allocated by ZC
++// then the behaviour depends on maycopy:
++// If maycopy=0 then return NULL
++// If maycopy=1 && the src frame is in a form where we can easily copy
++// the data, then allocate a new buffer and copy the data into it
++// Otherwise return NULL
++// If maycopy == 0 then ZC may be NULL
++AVRpiZcRefPtr av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc,
++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
++
++// Unreference the buffer refed/allocated by _zc_ref
++// If fr_ref is NULL then this will NOP
++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
++
++// Get the vc_handle from the frame ref
++// Returns -1 if ref doesn't look valid
++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
++// Get the vcsm_handle from the frame ref
++// Returns 0 if ref doesn't look valid
++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref);
++// Get offset from the start of the memory referenced
++// by the vc_handle to valid data
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
++// Length of buffer data
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
++// Get the number of bytes allocated from the frame ref
++// Returns 0 if ref doesn't look valid
++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
++// Geometry this frame was allocated with
++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref);
++
++//----------------------------------------------------------------------------
++//
++// Calls for external frame allocation
++
++// Callbacks registered in av_rpi_zc_init2
++
++// Callback to allocate a buf for a frame
++// The frame itself is generated in the calling code
++//
++// Parameters:
++// pool_env value passed to av_rpi_zc_init2
++// size size wanted
++// geo geometry of the frame to be allocated
++// Returns:
++// NULL Alloc failed
++// ptr AVBufferRef* of allocated buffer
++// In most cases av_rpi_zc_buf will be called by this function
++// and this will be the buf returned by that.
++typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size,
++ const AVRpiZcFrameGeometry * geo);
++
++// Callback once ffmpeg is completely done with this pool
++// Called once all allocated buffers have been derefed and ffmpegs ref to this
++// pool has been dropped
++typedef void av_rpi_zc_free_pool_fn_t(void * pool_env);
++
++// Init ZC into a context
++// Sets opaque, get_buffer2, thread_safe_callbacks
++// Use if you want to allocate your own pools and/or create ZC buffers for
++// all decoders
++// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken
++// apart by av_rpi_zc_xxx calls without this
++int av_rpi_zc_init2(struct AVCodecContext * const s,
++ void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++ av_rpi_zc_free_pool_fn_t * free_pool_fn);
++
++// Free ZC from a context
++void av_rpi_zc_uninit2(struct AVCodecContext * const s);
++
++// Get minimum pool size in frames - valid by the time the first alloc request
++// occurs. Takes into account thread requests and DPB sizes derived from SPS
++// rather than just adding a worst case DPB size.
++unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc);
++
++typedef struct av_rpi_zc_buf_fn_tab_s {
++ // This AVBuffer is being freed by ffmpeg - return memory
++ // to external pool. Memory may be, but need not be, unmapped.
++ // v is the ptr passed in av_rpi_zc_buf
++ void (* free)(void * v);
++
++ // Return appropriate handles / mappings
++ // v is the ptr passed in av_rpi_zc_buf
++ unsigned int (* vcsm_handle)(void * v);
++ unsigned int (* vc_handle)(void * v);
++ void * (* map_arm)(void * v);
++ unsigned int (* map_vc)(void * v);
++} av_rpi_zc_buf_fn_tab_t;
++
++// Allocate a ZC AVBufferRef and set its callback table
++// Doesn't take a buffer address directly - relies on callbacks to return
++// addresses as they are required. Mappings need not be generated until
++// the map callbacks are called but they should persist from then until
++// the buffer is freed.
++//
++// Parameters:
++// numbytes Size of the buffer
++// addr_offset Offset to first usable byte of buffer (for alignment)
++// normally 0
++// v Pointer passed to callbacks
++// fn_tab Function table
++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab);
++
++// Get v ptr set in av_rpi_zc_buf
++void * av_rpi_zc_buf_v(AVBufferRef * const buf);
++
++//----------------------------------------------------------------------------
++//
++// Mostly internal calls but might possibly be wanted by outside code
++
++void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc);
++AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx);
++void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size);
++
++// Test to see if the context is using zc (checks get_buffer2)
++int av_rpi_zc_in_use(const struct AVCodecContext * const s);
++
++// Get buffer generates placeholders for later alloc
++int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame);
++// Resolve actually does the alloc (noop if already alloced)
++// Set data pointers on a buffer/frame that was copied before the alloc
++// occurred
++#define ZC_RESOLVE_FAIL 0 // return error on invalid
++#define ZC_RESOLVE_ALLOC 1 // alloc as invalid
++#define ZC_RESOLVE_WAIT_VALID 2 // wait for valid
++#define ZC_RESOLVE_ALLOC_VALID 3 // alloc as valid
++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc);
++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc);
++
++int av_rpi_zc_set_valid_frame(AVFrame * const frame);
++int av_rpi_zc_set_broken_frame(AVFrame * const frame);
++
++
++
++
++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
++ void * pool_env,
++ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++ av_rpi_zc_free_pool_fn_t * free_pool_fn);
++void av_rpi_zc_env_release(const AVZcEnvPtr zc);
++
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_zc_frames.h
+@@ -0,0 +1,142 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_ZC_FRAMES_H
++#define RPI_ZC_FRAMES_H
++
++#define RPI_ONE_BUF 1
++
++#include "rpi_mem.h" // for GPU_MEM_PTR_T
++#include "libavutil/frame.h"
++
++#if !RPI_ONE_BUF
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]);
++ return p->vc;
++}
++
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]);
++ return p->vc;
++}
++
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]);
++ return p->vc;
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]);
++}
++
++#else
++
++static inline int gpu_is_buf1(const AVFrame * const frame) // Non-zero when the frame has no second buffer attached
++{
++ return frame->buf[1] == NULL; // only buf[1] is inspected; buf[0] is not checked here
++}
++
++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
++{
++ return av_buffer_get_opaque(frame->buf[0]);
++}
++
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
++{
++ return av_buffer_pool_buffer_get_opaque(frame->buf[n]);
++}
++
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) // vc-side address of frame->data[n]
++{
++ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n); // one-buffer frames use buf[0]'s descriptor, otherwise buf[n]'s
++ return gm->vc + (frame->data[n] - gm->arm); // descriptor's vc value plus the offset of data[n] from its arm pointer
++}
++
++
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
++ return get_vc_address3(frame, 0);
++}
++
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
++ return get_vc_address3(frame, 1);
++}
++
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
++ return get_vc_address3(frame, 2);
++}
++
++#if 0
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.numbytes = frame->data[1] - frame->data[0];
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 0);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.arm += frame->data[1] - frame->data[0];
++ g.vc += frame->data[1] - frame->data[0];
++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 1);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.arm += frame->data[2] - frame->data[0];
++ g.vc += frame->data[2] - frame->data[0];
++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 2);
++}
++#endif
++#endif
++
++#endif
+--- /dev/null
++++ b/libavcodec/rpivid_hevc.c
+@@ -0,0 +1,2128 @@
++// FFMPEG HEVC decoder hardware accelerator
++// Andrew Holme, Argon Design Ltd
++// Copyright (c) June 2017 Raspberry Pi Ltd
++
++#include <stdio.h>
++#include <fcntl.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <unistd.h>
++#include <sys/mman.h>
++
++#include "fftools/ffmpeg.h"
++#include "libavutil/avassert.h"
++#include "libavutil/imgutils.h"
++#include "avcodec.h"
++#include "hwconfig.h"
++#include "decode.h"
++
++#include "hevc.h"
++#include "hevcdec.h"
++#include "rpi_zc.h"
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++#include "rpi_mailbox.h"
++
++
++#define OPT_PHASE_TIMING 0 // Generate stats for phase usage
++
++#define OPT_EMU 0
++
++#define TRACE_DEV 0
++#define TRACE_ENTRY 0
++
++#define NUM_SCALING_FACTORS 4064
++
++#define AXI_BASE64 0
++
++#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0))
++#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6))
++
++#define RPIVID_COL_PICS 17 // 16 ref & current
++
++#define RPIVID_BITBUFS 2 // Bit + Cmd bufs (phase 0 & 1)
++#define RPIVID_BITBUF_SIZE (4 << 20) // Bit + Cmd buf size
++
++#define RPIVID_COEFFBUFS 3 // PU + Coeff bufs (phase 1 & 2)
++#define RPIVID_COEFFBUF_SIZE (16 << 20) // PU + Coeff buf size
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// Register offsets
++
++#define RPI_SPS0 0
++#define RPI_SPS1 4
++#define RPI_PPS 8
++#define RPI_SLICE 12
++#define RPI_TILESTART 16
++#define RPI_TILEEND 20
++#define RPI_SLICESTART 24
++#define RPI_MODE 28
++#define RPI_LEFT0 32
++#define RPI_LEFT1 36
++#define RPI_LEFT2 40
++#define RPI_LEFT3 44
++#define RPI_QP 48
++#define RPI_CONTROL 52
++#define RPI_STATUS 56
++#define RPI_VERSION 60
++#define RPI_BFBASE 64
++#define RPI_BFNUM 68
++#define RPI_BFCONTROL 72
++#define RPI_BFSTATUS 76
++#define RPI_PUWBASE 80
++#define RPI_PUWSTRIDE 84
++#define RPI_COEFFWBASE 88
++#define RPI_COEFFWSTRIDE 92
++#define RPI_SLICECMDS 96
++#define RPI_BEGINTILEEND 100
++#define RPI_TRANSFER 104
++#define RPI_CFBASE 108
++#define RPI_CFNUM 112
++#define RPI_CFSTATUS 116
++
++#define RPI_PURBASE 0x8000
++#define RPI_PURSTRIDE 0x8004
++#define RPI_COEFFRBASE 0x8008
++#define RPI_COEFFRSTRIDE 0x800C
++#define RPI_NUMROWS 0x8010
++#define RPI_CONFIG2 0x8014
++#define RPI_OUTYBASE 0x8018
++#define RPI_OUTYSTRIDE 0x801C
++#define RPI_OUTCBASE 0x8020
++#define RPI_OUTCSTRIDE 0x8024
++#define RPI_STATUS2 0x8028
++#define RPI_FRAMESIZE 0x802C
++#define RPI_MVBASE 0x8030
++#define RPI_MVSTRIDE 0x8034
++#define RPI_COLBASE 0x8038
++#define RPI_COLSTRIDE 0x803C
++#define RPI_CURRPOC 0x8040
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Unused but left here to illustrate the differences between FFmpeg's prob
++// structure and the rpivid one
++
++struct FFM_PROB {
++ uint8_t sao_merge_flag [ 1];
++ uint8_t sao_type_idx [ 1];
++ uint8_t split_coding_unit_flag [ 3];
++ uint8_t cu_transquant_bypass_flag [ 1];
++ uint8_t skip_flag [ 3];
++ uint8_t cu_qp_delta [ 3];
++ uint8_t pred_mode_flag [ 1];
++ uint8_t part_mode [ 4];
++ uint8_t prev_intra_luma_pred_flag [ 1];
++ uint8_t intra_chroma_pred_mode [ 2];
++ uint8_t merge_flag [ 1];
++ uint8_t merge_idx [ 1];
++ uint8_t inter_pred_idc [ 5];
++ uint8_t ref_idx_l0 [ 2];
++ uint8_t ref_idx_l1 [ 2];
++ uint8_t abs_mvd_greater0_flag [ 2];
++ uint8_t abs_mvd_greater1_flag [ 2];
++ uint8_t mvp_lx_flag [ 1];
++ uint8_t no_residual_data_flag [ 1];
++ uint8_t split_transform_flag [ 3];
++ uint8_t cbf_luma [ 2];
++ uint8_t cbf_cb_cr [ 4];
++ uint8_t transform_skip_flag/*[][]*/ [ 2];
++ uint8_t explicit_rdpcm_flag/*[][]*/ [ 2];
++ uint8_t explicit_rdpcm_dir_flag/*[][]*/ [ 2];
++ uint8_t last_significant_coeff_x_prefix [18];
++ uint8_t last_significant_coeff_y_prefix [18];
++ uint8_t significant_coeff_group_flag [ 4];
++ uint8_t significant_coeff_flag [44];
++ uint8_t coeff_abs_level_greater1_flag [24];
++ uint8_t coeff_abs_level_greater2_flag [ 6];
++ uint8_t log2_res_scale_abs [ 8];
++ uint8_t res_scale_sign_flag [ 2];
++ uint8_t cu_chroma_qp_offset_flag [ 1];
++ uint8_t cu_chroma_qp_offset_idx [ 1];
++} __attribute__((packed));
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_PROB {
++ uint8_t SAO_MERGE_FLAG [ 1];
++ uint8_t SAO_TYPE_IDX [ 1];
++ uint8_t SPLIT_FLAG [ 3];
++ uint8_t CU_SKIP_FLAG [ 3];
++ uint8_t CU_TRANSQUANT_BYPASS_FLAG [ 1];
++ uint8_t PRED_MODE [ 1];
++ uint8_t PART_SIZE [ 4];
++ uint8_t INTRA_PRED_MODE [ 1];
++ uint8_t CHROMA_PRED_MODE [ 1];
++ uint8_t MERGE_FLAG_EXT [ 1];
++ uint8_t MERGE_IDX_EXT [ 1];
++ uint8_t INTER_DIR [ 5];
++ uint8_t REF_PIC [ 2];
++ uint8_t MVP_IDX [ 1];
++ uint8_t MVD [ 2];
++ uint8_t QT_ROOT_CBF [ 1];
++ uint8_t TRANS_SUBDIV_FLAG [ 3];
++ uint8_t QT_CBF [ 6];
++ uint8_t DQP [ 2];
++ uint8_t ONE_FLAG [24];
++ uint8_t LASTX [18];
++ uint8_t LASTY [18];
++ uint8_t SIG_CG_FLAG [ 4];
++ uint8_t ABS_FLAG [ 6];
++ uint8_t TRANSFORMSKIP_FLAG [ 2];
++ uint8_t SIG_FLAG [42];
++ uint8_t SIG_FLAG_unused [ 2];
++} __attribute__((packed));
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_CMD {
++ uint32_t addr;
++ uint32_t data;
++} __attribute__((packed));
++
++struct RPI_BIT {
++ int cmd;
++ const void *ptr;
++ int len;
++};
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_T;
++
++// Actual addressability is 38bits but we can only alloc in the bottom 32
++// currently - when passed to rpivid h/w the address is always >> 6 so will
++// fit in 32 bit there
++// At some point we may want to make this uint64_t
++typedef uint32_t vid_vc_addr_t;
++
++typedef enum rpivid_decode_state_e {
++ RPIVID_DECODE_NEW = 0,
++ RPIVID_DECODE_START,
++ RPIVID_DECODE_SLICE,
++ RPIVID_DECODE_END,
++} rpivid_decode_state_t;
++
++#define RPI_PROB_VALS 154U
++#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
++
++typedef struct dec_env_s {
++ const AVCodecContext * avctx;
++
++ rpivid_decode_state_t state;
++ unsigned int decode_order;
++
++ int phase_no; // Current phase (i.e. the last one we waited for)
++ struct dec_env_s * phase_wait_q_next;
++ sem_t phase_wait;
++
++ struct RPI_BIT *bit_fifo;
++ struct RPI_CMD *cmd_fifo;
++ unsigned int bit_len, bit_max;
++ unsigned int cmd_len, cmd_max;
++ unsigned int num_slice_msgs;
++ unsigned int PicWidthInCtbsY;
++ unsigned int PicHeightInCtbsY;
++ unsigned int dpbno_col;
++ uint32_t reg_slicestart;
++ unsigned int wpp_entry_x;
++ unsigned int wpp_entry_y;
++
++ const uint8_t * nal_buffer;
++ size_t nal_size;
++
++ uint16_t slice_msgs[2*HEVC_MAX_REFS*8+3];
++ uint8_t scaling_factors[NUM_SCALING_FACTORS];
++// unsigned int RefPicList[2][HEVC_MAX_REFS];
++} dec_env_t;
++
++#define RPIVID_PHASES 3
++#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have inced decode order
++#define RPIVID_PHASE_START (-1) // Phase after we have inced decode_order
++
++#if OPT_PHASE_TIMING
++static const unsigned int time_thresholds[8] = {
++ 10, 15, 20, 30, 45, 60, 75, 90
++};
++#endif
++
++typedef struct phase_wait_env_s {
++ unsigned int last_order;
++ dec_env_t * q;
++#if OPT_PHASE_TIMING
++ uint64_t phase_time;
++ uint64_t max_phase_time;
++ uint64_t time_in_phase;
++ uint64_t time_out_phase;
++ unsigned int max_time_decode_order;
++ unsigned int time_bins[9];
++ unsigned int time_bins3[9];
++ unsigned int time_bins5[9];
++ uint64_t time_stash[16];
++ unsigned int i3;
++#endif
++} phase_wait_env_t; // Single linked list of threads waiting for this phase
++
++typedef struct RPI_T {
++ atomic_int ref_count;
++ sem_t ref_zero;
++
++ dec_env_t ** dec_envs;
++ AVZcEnvPtr zc;
++
++ pthread_mutex_t phase_lock;
++ phase_wait_env_t phase_reqs[RPIVID_PHASES];
++
++ volatile uint32_t * regs;
++ volatile uint32_t * ints;
++
++ GPU_MEM_PTR_T gcolbuf;
++ unsigned int col_stride;
++ size_t col_picsize;
++
++ unsigned int bitbuf_no;
++ sem_t bitbuf_sem;
++ GPU_MEM_PTR_T gbitbufs[RPIVID_BITBUFS];
++
++ unsigned int max_pu_msgs;
++ unsigned int coeffbuf_no;
++ sem_t coeffbuf_sem;
++ GPU_MEM_PTR_T gcoeffbufs[RPIVID_COEFFBUFS];
++
++ unsigned int decode_order;
++ int mbox_fd;
++ int gpu_init_type;
++} RPI_T;
++
++#if OPT_PHASE_TIMING
++static uint64_t tus64(void)
++{
++ struct timespec ts;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
++}
++#endif
++
++static inline unsigned int rnd64(unsigned int x)
++{
++ return (x + 63) & ~63;
++}
++
++static inline int rpi_sem_wait(sem_t * const sem) // sem_wait() that is retried when it fails with EINTR
++{
++ int rv;
++ while ((rv = sem_wait(sem)) != 0 && errno == EINTR)
++ /* Loop: retry only while the failure was EINTR */;
++ return rv; // 0 on success, otherwise sem_wait's result for a non-EINTR failure
++}
++
++//============================================================================
++
++#define REGS_NAME "/dev/rpivid-hevcmem"
++#define REGS_SIZE 0x10000
++#define INTS_NAME "/dev/rpivid-intcmem"
++#define INTS_SIZE 0x10000 // 4 is probably enough but we are going to alloc a page anyway
++
++static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size)
++{ // Returns a read/write shared mapping of the first size bytes of dev_name, or NULL (after logging) on failure
++ void *gpio_map;
++ int mem_fd;
++
++ /* Open the device node named by dev_name */
++ if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) {
++ av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name);
++ return NULL;
++ }
++
++ // Now map it
++ gpio_map = mmap(
++ NULL,
++ size,
++ PROT_READ|PROT_WRITE,
++ MAP_SHARED,
++ mem_fd,
++ 0
++ );
++
++ close(mem_fd); // No longer need the FD
++
++ if (gpio_map == MAP_FAILED) {
++ av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed\n"); // newline added so the next log line is not appended to this one
++ return NULL;
++ }
++
++ return (volatile uint32_t *)gpio_map;
++}
++
++static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size)
++{
++ volatile uint32_t * const gpio_map = *p_gpio_map;
++ if (gpio_map != NULL) {
++ *p_gpio_map = NULL;
++ munmap((void *)gpio_map, size);
++ }
++}
++
++#define MANGLE(x) ((x) &~0xc0000000) // ** If x is ever a 64 bit thing this will need fixing!
++#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6)
++
++static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data)
++{
++#if TRACE_DEV
++ printf("W %x %08x\n", addr, MANGLE64(data));
++#endif
++
++ rpi->regs[addr >> 2] = MANGLE64(data);
++}
++
++static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data)
++{
++#if TRACE_DEV
++ printf("W %x %08x\n", addr, data >> 6);
++#endif
++
++ rpi->regs[addr >> 2] = data >> 6; // ?? rnd64 - but not currently needed
++}
++
++static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data)
++{
++#if TRACE_DEV
++ printf("W %x %08x\n", addr, data);
++#endif
++
++ rpi->regs[addr >> 2] = data;
++}
++
++static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr)
++{
++ const uint32_t v = rpi->regs[addr >> 2];
++#if TRACE_DEV
++ printf("R %x (=%x)\n", addr, v);
++#endif
++ return v;
++}
++
++#define ARG_IC_ICTRL_ACTIVE1_INT_SET 0x00000001
++#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET 0x00000002
++#define ARG_IC_ICTRL_ACTIVE1_EN_SET 0x00000004
++#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET 0x00000008
++#define ARG_IC_ICTRL_ACTIVE2_INT_SET 0x00000010
++#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET 0x00000020
++#define ARG_IC_ICTRL_ACTIVE2_EN_SET 0x00000040
++#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET 0x00000080
++
++static inline void int_wait(const RPI_T * const rpi, const unsigned int phase) // Poll rpi->ints[0] until the bit selected for this phase is set
++{
++ const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET; // bit removed from the value written back
++ const uint32_t mask_done = phase == 1 ? ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET; // bit waited for: ACTIVE1 for phase 1, ACTIVE2 otherwise
++ uint32_t ival;
++ while (((ival = rpi->ints[0]) & mask_done) == 0) {
++ usleep(1000); // sleep 1000us between polls
++ }
++ rpi->ints[0] = ival & mask_reset; // NOTE(review): presumably write-1-to-clear, with the other phase's INT bit masked out so it stays pending - confirm against h/w docs
++}
++
++#if TRACE_DEV && 0
++static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) {
++ int i;
++
++ for (i=0; i<num; i++)
++ {
++ if ((i%4)==0)
++ printf("%08x: ", 0x7eb00000 + addr + 4*i);
++
++ printf("%08x", rpi->regs[(addr>>2)+i]);
++
++ if ((i%4)==3 || i+1 == num)
++ printf("\n");
++ else
++ printf(" ");
++ }
++}
++
++static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) {
++ int i;
++
++ for (i=0; i<size>>2; i++)
++ {
++ if ((i%4)==0)
++ printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i);
++
++ printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]);
++
++ if ((i%4)==3 || i+1 == size>>2)
++ printf("\n");
++ else
++ printf(" ");
++ }
++}
++#endif
++
++//////////////////////////////////////////////////////////////////////////////
++
++static inline size_t round_up_size(const size_t x)
++{
++ /* Admit no size < 256: n is pinned to 8 for such x, otherwise it is av_log2(x) - 1 */
++ const unsigned int n = x < 256 ? 8 : av_log2(x) - 1; // NOTE(review): x < 256 thus yields 768, above the 384 that x == 256 gets if av_log2 is floor(log2) - intended?
++
++ return x >= (3 << n) ? 4 << n : (3 << n); // 3 << n, or 4 << n once x has reached 3 << n
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Scaling factors
++
++static void expand_scaling_list(
++ const unsigned int sizeID,
++ const unsigned int matrixID,
++ uint8_t * const dst0,
++ const uint8_t * const src0,
++ uint8_t dc)
++{
++ switch (sizeID) {
++ case 0:
++ memcpy(dst0, src0, 16);
++ break;
++ case 1:
++ memcpy(dst0, src0, 64);
++ break;
++ case 2:
++ {
++ uint8_t * d = dst0;
++ for (unsigned int y=0; y != 16; y++) {
++ const uint8_t * s = src0 + (y >> 1) * 8;
++ for (unsigned int x = 0; x != 8; ++x) {
++ *d++ = *s;
++ *d++ = *s++;
++ }
++ }
++ dst0[0] = dc;
++ break;
++ }
++ default:
++ {
++ uint8_t * d = dst0;
++ for (unsigned int y=0; y != 32; y++) {
++ const uint8_t * s = src0 + (y >> 2) * 8;
++ for (unsigned int x = 0; x != 8; ++x) {
++ *d++ = *s;
++ *d++ = *s;
++ *d++ = *s;
++ *d++ = *s++;
++ }
++ }
++ dst0[0] = dc;
++ break;
++ }
++ }
++}
++
++static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) {
++ // Array of constants for scaling factors
++ static const uint32_t scaling_factor_offsets[4][6] = {
++ // MID0 MID1 MID2 MID3 MID4 MID5
++ {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050}, // SID0 (4x4)
++ {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0}, // SID1 (8x8)
++ {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0}, // SID2 (16x16)
++ {0x07E0, 0, 0, 0x0BE0, 0, 0}}; // SID3 (32x32)
++
++ // ffmpeg places SID3,MID1 where matrixID 3 normally is
++ const ScalingList * const sl =
++ s->ps.pps->scaling_list_data_present_flag ? &s->ps.pps->scaling_list
++ : &s->ps.sps->scaling_list;
++ unsigned int mid;
++
++ for (mid=0; mid<6; mid++)
++ expand_scaling_list(0, mid,
++ de->scaling_factors + scaling_factor_offsets[0][mid],
++ sl->sl[0][mid], 0);
++ for (mid=0; mid<6; mid++)
++ expand_scaling_list(1, mid,
++ de->scaling_factors + scaling_factor_offsets[1][mid],
++ sl->sl[1][mid], 0);
++ for (mid=0; mid<6; mid++)
++ expand_scaling_list(2, mid,
++ de->scaling_factors + scaling_factor_offsets[2][mid],
++ sl->sl[2][mid],
++ sl->sl_dc[0][mid]);
++ // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg
++ for (mid=0; mid<6; mid += 3)
++ expand_scaling_list(3, mid,
++ de->scaling_factors + scaling_factor_offsets[3][mid],
++ sl->sl[3][mid],
++ sl->sl_dc[1][mid]);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Probabilities
++
++static const uint8_t prob_init[3][156] = {
++ {
++ 153, 200, 139, 141, 157, 154, 154, 154,
++ 154, 154, 184, 154, 154, 154, 184, 63,
++ 154, 154, 154, 154, 154, 154, 154, 154,
++ 154, 154, 154, 154, 154, 153, 138, 138,
++ 111, 141, 94, 138, 182, 154, 154, 154,
++ 140, 92, 137, 138, 140, 152, 138, 139,
++ 153, 74, 149, 92, 139, 107, 122, 152,
++ 140, 179, 166, 182, 140, 227, 122, 197,
++ 110, 110, 124, 125, 140, 153, 125, 127,
++ 140, 109, 111, 143, 127, 111, 79, 108,
++ 123, 63, 110, 110, 124, 125, 140, 153,
++ 125, 127, 140, 109, 111, 143, 127, 111,
++ 79, 108, 123, 63, 91, 171, 134, 141,
++ 138, 153, 136, 167, 152, 152, 139, 139,
++ 111, 111, 125, 110, 110, 94, 124, 108,
++ 124, 107, 125, 141, 179, 153, 125, 107,
++ 125, 141, 179, 153, 125, 107, 125, 141,
++ 179, 153, 125, 140, 139, 182, 182, 152,
++ 136, 152, 136, 153, 136, 139, 111, 136,
++ 139, 111, 0, 0, },
++ {
++ 153, 185, 107, 139, 126, 197, 185, 201,
++ 154, 149, 154, 139, 154, 154, 154, 152,
++ 110, 122, 95, 79, 63, 31, 31, 153,
++ 153, 168, 140, 198, 79, 124, 138, 94,
++ 153, 111, 149, 107, 167, 154, 154, 154,
++ 154, 196, 196, 167, 154, 152, 167, 182,
++ 182, 134, 149, 136, 153, 121, 136, 137,
++ 169, 194, 166, 167, 154, 167, 137, 182,
++ 125, 110, 94, 110, 95, 79, 125, 111,
++ 110, 78, 110, 111, 111, 95, 94, 108,
++ 123, 108, 125, 110, 94, 110, 95, 79,
++ 125, 111, 110, 78, 110, 111, 111, 95,
++ 94, 108, 123, 108, 121, 140, 61, 154,
++ 107, 167, 91, 122, 107, 167, 139, 139,
++ 155, 154, 139, 153, 139, 123, 123, 63,
++ 153, 166, 183, 140, 136, 153, 154, 166,
++ 183, 140, 136, 153, 154, 166, 183, 140,
++ 136, 153, 154, 170, 153, 123, 123, 107,
++ 121, 107, 121, 167, 151, 183, 140, 151,
++ 183, 140, 0, 0, },
++ {
++ 153, 160, 107, 139, 126, 197, 185, 201,
++ 154, 134, 154, 139, 154, 154, 183, 152,
++ 154, 137, 95, 79, 63, 31, 31, 153,
++ 153, 168, 169, 198, 79, 224, 167, 122,
++ 153, 111, 149, 92, 167, 154, 154, 154,
++ 154, 196, 167, 167, 154, 152, 167, 182,
++ 182, 134, 149, 136, 153, 121, 136, 122,
++ 169, 208, 166, 167, 154, 152, 167, 182,
++ 125, 110, 124, 110, 95, 94, 125, 111,
++ 111, 79, 125, 126, 111, 111, 79, 108,
++ 123, 93, 125, 110, 124, 110, 95, 94,
++ 125, 111, 111, 79, 125, 126, 111, 111,
++ 79, 108, 123, 93, 121, 140, 61, 154,
++ 107, 167, 91, 107, 107, 167, 139, 139,
++ 170, 154, 139, 153, 139, 123, 123, 63,
++ 124, 166, 183, 140, 136, 153, 154, 166,
++ 183, 140, 136, 153, 154, 166, 183, 140,
++ 136, 153, 154, 170, 153, 138, 138, 122,
++ 121, 122, 121, 167, 151, 183, 140, 151,
++ 183, 140, 0, 0, },
++};
++
++
++//////////////////////////////////////////////////////////////////////////////
++// Phase 1 command and bit FIFOs
++
++// ???? uint16_t addr - put in uint32_t
++static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) { // Append (addr, data) to de->cmd_fifo; returns the index it was stored at
++ if (de->cmd_len==de->cmd_max)
++ av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD))); // fifo full: double cmd_max and realloc; assumes cmd_max is non-zero here - TODO confirm at init
++
++#if TRACE_DEV
++ printf("[%02x] %x %x\n", de->cmd_len, addr, data);
++#endif
++
++ de->cmd_fifo[de->cmd_len].addr = addr;
++ de->cmd_fifo[de->cmd_len].data = data;
++ return de->cmd_len++; // post-increment: the value returned is the slot just filled
++}
++
++static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) {
++ if (de->bit_len==de->bit_max)
++ av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT)));
++ de->bit_fifo[de->bit_len].cmd = cmd_idx;
++ de->bit_fifo[de->bit_len].ptr = ptr;
++ de->bit_fifo[de->bit_len].len = len;
++ de->bit_len++;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Write probability and scaling factor memories
++
++#if 0
++static void WriteProb(dec_env_t * const de) {
++ int i;
++ const uint8_t *p = (uint8_t *) &de->probabilities;
++ for (i=0; i<sizeof(struct RPI_PROB); i+=4, p+=4)
++ p1_apb_write(de, 0x1000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
++}
++#endif
++
++// Compute the probability table for the current slice and queue it for
++// writing at addresses 0x1000, 0x1004, ...
++// The table row is chosen from prob_init[] by slice type and cabac_init_flag;
++// each entry is derived from the row's init value and the slice QP (clipped
++// to 0..51). Entries from RPI_PROB_VALS up to RPI_PROB_ARRAY_SIZE are zero.
++// NOTE(review): the per-entry formula looks like the HEVC context variable
++// initialisation process - confirm against the spec before changing it.
++static void WriteProb(dec_env_t * const de, const HEVCContext * const s) {
++ uint8_t dst[RPI_PROB_ARRAY_SIZE];
++
++ // Non-I slices with cabac_init_flag use slice_type + 1, everything else 2 - slice_type
++ const unsigned int init_type = (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ?
++ s->sh.slice_type + 1 : 2 - s->sh.slice_type;
++ const uint8_t * p = prob_init[init_type];
++ const int q = av_clip(s->sh.slice_qp, 0, 51);
++ unsigned int i;
++
++ for (i = 0; i < RPI_PROB_VALS; i++) {
++ int init_value = p[i];
++ // High nibble gives the slope, low nibble the offset
++ int m = (init_value >> 4) * 5 - 45;
++ int n = ((init_value & 15) << 3) - 16;
++ int pre = 2 * (((m * q) >> 4) + n) - 127;
++
++ // One's complement negative values (pre >> 31 is all ones when pre < 0)
++ pre ^= pre >> 31;
++ // Cap at 124/125, keeping the low bit
++ if (pre > 124)
++ pre = 124 + (pre & 1);
++ dst[i] = pre;
++ }
++ // Zero the padding between the real values and the end of the array
++ for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) {
++ dst[i] = 0;
++ }
++
++ // Pack four bytes per 32-bit word, first byte in the low 8 bits
++ for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4)
++ p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24));
++
++}
++
++
++// Queue the scaling factor table for writing at addresses 0x2000, 0x2004, ...
++// de->scaling_factors is read as bytes and packed four at a time into 32-bit
++// words, first byte in the low 8 bits.
++static void WriteScalingFactors(dec_env_t * const de) {
++    const uint8_t *p = (const uint8_t *) de->scaling_factors;
++    int i;
++
++    for (i = 0; i < NUM_SCALING_FACTORS; i += 4, p += 4) {
++        // Widen each byte before shifting: a uint8_t is promoted to signed
++        // int, so p[3] << 24 overflows (undefined behaviour) for bytes >= 0x80.
++        const uint32_t word = (uint32_t)p[0]
++                            + ((uint32_t)p[1] << 8)
++                            + ((uint32_t)p[2] << 16)
++                            + ((uint32_t)p[3] << 24);
++        p1_apb_write(de, 0x2000 + i, word);
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Map a CTB column (or row) index to the index of the tile that contains it.
++// bd[] has num+1 ascending boundaries with bd[0]=0 (see hevc_ps.c); tile t
++// covers bd[t] <= ctb < bd[t+1].
++// The scan is bounded by num, so an index at or beyond bd[num] yields the
++// last tile (num-1) instead of reading past the end of bd[].
++static int ctb_to_tile (unsigned int ctb, unsigned int *bd, int num) {
++    int i;
++    for (i = 1; i < num && ctb >= bd[i]; i++)
++        ;
++    return i - 1;
++}
++
++// Return the CTB size to use along one axis for the CTB at index ctb.
++// CTBs before the start of the last tile (bd[num-1]) always get ctb_size.
++// From the last tile onwards the result is width % ctb_size when that is
++// non-zero, otherwise ctb_size.
++static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) {
++    if (ctb >= bd[num-1]) {
++        const int remainder = width % ctb_size;
++        if (remainder != 0)
++            return remainder;
++    }
++    return ctb_size;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Handle PU and COEFF stream overflow
++
++
++// Returns:
++// -2 Other error
++// -1 Out of coeff space
++// 0 OK
++// 1 Out of PU space
++
++// Classify the outcome of phase 1 from the hardware registers.
++// Returns 0 on success, -1 if status bit 3 is set, 1 if status bit 4 is set,
++// and -2 for any other failure. (de is not used.)
++static int check_status(const RPI_T * const rpi, dec_env_t * const de) {
++    // Successful completion of phase 1 is defined as CFSTATUS == CFNUM:
++    // it assures that the status register is zero and all blocks in each
++    // tile have completed
++    if (apb_read(rpi, RPI_CFSTATUS) != apb_read(rpi, RPI_CFNUM)) {
++        const uint32_t hw_status = apb_read(rpi, RPI_STATUS);
++
++        if (hw_status & 8)
++            return -1;
++        if (hw_status & 0x10)
++            return 1;
++        return -2;
++    }
++    return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Write STATUS register with expected end CTU address of previous slice
++
++// Queue a STATUS write carrying the raster position (x at bit 5, y at bit 18)
++// of the CTB that precedes ctb_addr_ts in tile-scan order.
++static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) {
++    const int prev_x = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
++    const int prev_y = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
++
++    p1_apb_write(de, RPI_STATUS, 1 + (prev_x<<5) + (prev_y<<18));
++}
++
++// Queue the four-register sequence used to pause wavefront decoding on
++// ctb_row: STATUS, a PROB_BACKUP transfer, MODE (a different value on the
++// last CTB row of the picture) and CONTROL.
++static void wpp_pause(dec_env_t * const de, int ctb_row) {
++    const int on_last_row = (ctb_row==de->PicHeightInCtbsY-1);
++
++    p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25);
++    p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++    if (on_last_row)
++        p1_apb_write(de, RPI_MODE, 0x70000);
++    else
++        p1_apb_write(de, RPI_MODE, 0x30000);
++    p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2);
++}
++
++// Wavefront-mode counterpart of end_previous_slice().
++// Optionally queues a wpp_pause() on the row of the previous CTB, then the
++// STATUS write for that CTB, then optionally a PROB_BACKUP transfer. Both
++// options depend on where the new slice starts relative to the last WPP
++// entry point and on the picture width in CTBs.
++static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
++    const int start_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
++    const int start_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
++    const int prev_x = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
++    const int prev_y = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
++
++    if (de->wpp_entry_x<2 && (de->wpp_entry_y<start_y || start_x>2) && de->PicWidthInCtbsY>2) {
++        wpp_pause(de, prev_y);
++    }
++
++    p1_apb_write(de, RPI_STATUS, 1 + (prev_x<<5) + (prev_y<<18));
++
++    // && binds tighter than ||; the grouping is written out explicitly here
++    if (start_x==2 || (de->PicWidthInCtbsY==2 && de->wpp_entry_y<start_y)) {
++        p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Queue the per-slice-segment parameter registers: SPS0, SPS1 and PPS are
++// packed from the active SPS/PPS (plus the slice QP offsets), the scaling
++// factors are written if scaling lists are enabled, and SLICESTART is written.
++// de->reg_slicestart is only recomputed for independent slice segments, so a
++// dependent segment re-sends the value stored by the preceding independent one.
++static void new_slice_segment(dec_env_t * const de, const HEVCContext * const s)
++{
++ const HEVCSPS *sps = s->ps.sps;
++ const HEVCPPS *pps = s->ps.pps;
++
++ // The same bit_depth is written to both the bit 16 and bit 20 fields
++ p1_apb_write(de, RPI_SPS0,
++ (sps->log2_min_cb_size << 0) +
++ (sps->log2_ctb_size << 4) +
++ (sps->log2_min_tb_size << 8) +
++ (sps->log2_max_trafo_size << 12) +
++ (sps->bit_depth << 16) +
++ (sps->bit_depth << 20) +
++ (sps->max_transform_hierarchy_depth_intra << 24) +
++ (sps->max_transform_hierarchy_depth_inter << 28));
++
++ // The chroma format field is forced to 0 when separate_colour_plane_flag is set
++ // ("<<" binds tighter than "?:", so only chroma_format_idc is shifted)
++ p1_apb_write(de, RPI_SPS1,
++ (sps->pcm.bit_depth << 0) +
++ (sps->pcm.bit_depth_chroma << 4) +
++ (sps->pcm.log2_min_pcm_cb_size << 8) +
++ (sps->pcm.log2_max_pcm_cb_size << 12) +
++ (sps->separate_colour_plane_flag? 0:sps->chroma_format_idc << 16) +
++ (sps->amp_enabled_flag << 18) +
++ (sps->pcm_enabled_flag << 19) +
++ (sps->scaling_list_enable_flag << 20) +
++ (sps->sps_strong_intra_smoothing_enable_flag << 21));
++
++ // Cb/Cr QP offsets are the PPS + slice offsets, each masked to 8 bits
++ p1_apb_write(de, RPI_PPS,
++ (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth << 0) +
++ (pps->cu_qp_delta_enabled_flag << 4) +
++ (pps->transquant_bypass_enable_flag << 5) +
++ (pps->transform_skip_enabled_flag << 6) +
++ (pps->sign_data_hiding_flag << 7) +
++ (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) << 8) +
++ (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) +
++ (pps->constrained_intra_pred_flag << 24));
++
++ if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de);
++
++ if (!s->sh.dependent_slice_segment_flag) {
++ // Column in the low half, row in the high half
++ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
++ int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
++ de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16);
++ }
++
++ p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Build and queue the RPI_SLICE register value for the current slice.
++// Always present: slice type (bit 12), the two SAO flags (bits 14, 15),
++// slice_w (bit 17) and slice_h (bit 24). P and B slices also carry the merge
++// candidate count and both reference counts; B slices add mvd_l1_zero_flag.
++static void write_slice(dec_env_t * const de, const HEVCContext * const s,
++                        const unsigned int slice_w, const unsigned int slice_h) {
++    const int is_b = (s->sh.slice_type==HEVC_SLICE_B);
++    const int is_inter = is_b || (s->sh.slice_type==HEVC_SLICE_P);
++    uint32_t reg =
++          (s->sh.slice_type << 12)
++        + (s->sh.slice_sample_adaptive_offset_flag[0] << 14)
++        + (s->sh.slice_sample_adaptive_offset_flag[1] << 15)
++        + (slice_w << 17)
++        + (slice_h << 24);
++
++    if (is_inter) {
++        reg |= (s->sh.max_num_merge_cand << 0)
++             + (s->sh.nb_refs[L0] << 4)
++             + (s->sh.nb_refs[L1] << 8);
++    }
++
++    if (is_b) {
++        reg |= s->sh.mvd_l1_zero_flag<<16;
++    }
++
++    p1_apb_write(de, RPI_SLICE, reg);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Wavefront mode
++
++// Queue the register writes that start decoding at a wavefront entry point.
++// ctb_addr_ts is the entry point in tile-scan order; its raster column/row
++// are also recorded in de->wpp_entry_x / de->wpp_entry_y.
++// do_bte:   also write BEGINTILEEND
++// resetQPY: also write the QP register (qp_bd_offset + slice_qp)
++static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s,
++ const int do_bte, const int resetQPY, const int ctb_addr_ts) {
++ const HEVCSPS * const sps = s->ps.sps;
++ const HEVCPPS * const pps = s->ps.pps;
++
++ int ctb_size = 1<<sps->log2_ctb_size;
++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++
++ int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY;
++ int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY;
++
++ // The region programmed here ends at the last CTB of the entry point's own row
++ int endx = de->PicWidthInCtbsY-1;
++ int endy = ctb_row;
++
++ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
++ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
++
++ p1_apb_write(de, RPI_TILESTART, 0);
++ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
++
++ if (do_bte)
++ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
++
++ // Only the last CTB row uses the computed slice_h; other rows use ctb_size
++ write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size);
++
++ if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
++
++ // MODE differs on the last CTB row of the picture
++ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 0x60001 : 0x20001);
++ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Tiles mode
++
++// Queue the register writes that start decoding at an entry point in tiles
++// mode. ctb_addr_ts is the entry point in tile-scan order; the enclosing
++// tile is looked up from the PPS column/row boundaries.
++// do_bte:   also write BEGINTILEEND
++// resetQPY: also write the QP register (qp_bd_offset + slice_qp)
++static void new_entry_point(dec_env_t * const de, const HEVCContext * const s,
++ const int do_bte, const int resetQPY, const int ctb_addr_ts) {
++ const HEVCSPS * const sps = s->ps.sps;
++ const HEVCPPS * const pps = s->ps.pps;
++
++ int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY;
++ int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY;
++
++ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
++ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
++
++ // Last CTB column/row inside this tile (the next boundary minus one)
++ int endx = pps->col_bd[tile_x+1] - 1;
++ int endy = pps->row_bd[tile_y+1] - 1;
++
++ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<<sps->log2_ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
++ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<<sps->log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
++
++ p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16));
++ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
++
++ if (do_bte)
++ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
++
++ write_slice(de, s, slice_w, slice_h);
++
++ if (resetQPY)
++ p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
++
++ // Bits 17 and 18 flag the last tile column and last tile row respectively
++ p1_apb_write(de, RPI_MODE, (0xFFFF << 0)
++ + (0x0 << 16)
++ + ((tile_x==pps->num_tile_columns-1) << 17)
++ + ((tile_y==pps->num_tile_rows-1) << 18));
++
++ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Doesn't attempt to remove from context as we should only do this at the end
++// of time or on create error
++// Release everything owned by a decode env: both FIFOs, the phase_wait
++// semaphore and the struct itself. de must not be used afterwards.
++// NOTE(review): the FIFOs are allocated with malloc()/realloc() elsewhere in
++// this file but released here with av_freep() - confirm the two allocators
++// are interchangeable in this build.
++static void
++dec_env_delete(dec_env_t * const de)
++{
++// gpu_free(&de->gbuf);
++
++ av_freep(&de->cmd_fifo);
++ av_freep(&de->bit_fifo);
++
++ sem_destroy(&de->phase_wait);
++ av_free(de);
++}
++
++// Allocate and initialise a decode env for avctx and register it in the
++// first free slot of rpi->dec_envs[].
++// Both FIFOs start with room for 1024 entries. Returns the new env, or NULL
++// if any allocation fails or no slot is free; on failure everything
++// allocated so far is released via dec_env_delete().
++static dec_env_t *
++dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi)
++{
++ dec_env_t * const de = av_mallocz(sizeof(*de));
++ int i;
++
++ if (de == NULL)
++ return NULL;
++
++ de->avctx = avctx;
++ de->phase_no = RPIVID_PHASE_NEW;
++
++ // Initialised before the first "goto fail" because dec_env_delete() destroys it
++ sem_init(&de->phase_wait, 0, 0);
++
++ if ((de->cmd_fifo = malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL)
++ goto fail;
++
++ if ((de->bit_fifo = malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL)
++ goto fail;
++
++ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
++ for (i = 0; i != avctx->thread_count; ++i) {
++ if (rpi->dec_envs[i] == NULL)
++ {
++ rpi->dec_envs[i] = de;
++ break;
++ }
++ }
++ pthread_mutex_unlock(&rpi->phase_lock);
++
++ // The loop ran to completion without finding an empty slot
++ if (i == avctx->thread_count) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n");
++ goto fail;
++ }
++
++ return de;
++
++fail:
++ dec_env_delete(de);
++ return NULL;
++}
++
++
++// Take a reference on rpi and return the decode env belonging to avctx.
++// The slots of rpi->dec_envs[] are scanned in order: an env whose avctx
++// matches is returned; reaching an empty slot first creates a new env with
++// dec_env_new(). Returns NULL if the reference count was already <= 0, if
++// no env is found within thread_count slots, or if creation fails.
++// The reference count is incremented on every call, including failing ones.
++static dec_env_t *
++dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi)
++{
++    int slot;
++
++    if (atomic_fetch_add(&rpi->ref_count, 1) <= 0) {
++        // Already dead
++        av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");
++        return NULL;
++    }
++
++    for (slot = 0; slot != avctx->thread_count; ++slot) {
++        if (rpi->dec_envs[slot] == NULL)
++            return dec_env_new(avctx, rpi);
++        if (rpi->dec_envs[slot]->avctx == avctx)
++            return rpi->dec_envs[slot];
++    }
++
++    return NULL;
++}
++
++// Call at end of fn
++// Used to ensure we aren't in a worker thread when killed
++// Drop one reference on rpi; the caller that drops the last one (count
++// going from 1 to 0) posts rpi->ref_zero. (de is not used.)
++static void
++dec_env_release(RPI_T * const rpi, dec_env_t * const de)
++{
++    if (atomic_fetch_sub(&rpi->ref_count, 1) == 1)
++        sem_post(&rpi->ref_zero);
++}
++
++//----------------------------------------------------------------------------
++
++// Wait for a slot in the given phase
++// Any error return is probably fatal
++// Block until it is de's turn to enter phase phase_no.
++// A decode env may enter when its decode_order is exactly one more than the
++// phase's last_order. Otherwise it is pushed onto the phase's wait list
++// (under phase_lock) and sleeps on its own phase_wait semaphore.
++// Returns 0 on success (de->phase_no is then set to phase_no), or
++// AVERROR(errno) if sem_wait() fails with anything other than EINTR.
++static int
++wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
++{
++ int needs_wait = 0;
++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++
++ pthread_mutex_lock(&rpi->phase_lock);
++ if (p->last_order + 1 != de->decode_order) {
++ // Not our turn yet - link ourselves at the head of the wait list
++ de->phase_wait_q_next = p->q;
++ p->q = de;
++ needs_wait = 1;
++ }
++ pthread_mutex_unlock(&rpi->phase_lock);
++
++ // Sleep outside the lock; retry if interrupted by a signal
++ if (needs_wait) {
++ while (sem_wait(&de->phase_wait) == -1)
++ {
++ int err;
++ if ((err = errno) != EINTR)
++ return AVERROR(err);
++ }
++ }
++
++ de->phase_no = phase_no;
++ return 0;
++}
++
++// Mark phase phase_no as finished for de and wake the next env in line.
++// Records de->decode_order as the phase's last_order, then searches the
++// phase's wait list for the env whose decode_order is last_order + 1. If one
++// is found it is unlinked and its phase_wait semaphore is posted after the
++// lock has been released.
++static void
++post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
++{
++ dec_env_t * next_de = NULL;
++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++ // q always points at the link that refers to the env being examined
++ dec_env_t ** q = &p->q;
++
++ pthread_mutex_lock(&rpi->phase_lock);
++
++ p->last_order = de->decode_order;
++ while (*q != NULL) {
++ dec_env_t * const t_de = *q;
++
++ if (t_de->decode_order == p->last_order + 1) {
++ // This is us - remove from Q
++ *q = t_de->phase_wait_q_next;
++ t_de->phase_wait_q_next = NULL; // Tidy
++ next_de = t_de;
++ break;
++ }
++ q = &t_de->phase_wait_q_next;
++ }
++
++ pthread_mutex_unlock(&rpi->phase_lock);
++
++ if (next_de != NULL)
++ sem_post(&next_de->phase_wait);
++}
++
++// Wait & signal stuff s.t. threads in other phases can continue
++// Pass through every phase after de's current one without doing any work,
++// so that envs queued behind this one are released in order, then mark de
++// as being in RPIVID_PHASE_NEW. Errors from wait_phase() are ignored.
++static void
++abort_phases(RPI_T * const rpi, dec_env_t * const de)
++{
++    int phase = de->phase_no + 1;
++
++    while (phase < RPIVID_PHASE_NEW) {
++        wait_phase(rpi, de, phase);
++        post_phase(rpi, de, phase);
++        ++phase;
++    }
++
++    de->phase_no = RPIVID_PHASE_NEW;
++}
++
++// Start timing for phase
++// Stats only - no actual effect
++// Record entry into phase phase_no (only when OPT_PHASE_TIMING is set).
++// The time since the previous timestamp, if there was one, is added to
++// time_out_phase, and the timestamp is reset to now.
++static inline void tstart_phase(RPI_T * const rpi, const int phase_no)
++{
++#if OPT_PHASE_TIMING
++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++ const int64_t now = tus64();
++ // phase_time == 0 means no previous timestamp has been recorded
++ if (p->phase_time != 0)
++ p->time_out_phase += now - p->phase_time;
++ p->phase_time = now;
++#endif
++}
++
++#if OPT_PHASE_TIMING
++// Return the histogram bin (0..9) for the sum of the avg_n most recent
++// entries of p->time_stash (a 16-entry ring indexed by p->i3).
++// The bin is the index of the first of the nine time_thresholds[] entries
++// that, multiplied by 1000 * avg_n, exceeds the sum; 9 if none does.
++static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n)
++{
++ uint64_t tsum = 0;
++ unsigned int i;
++ // Walk backwards from the newest entry; "& 15" wraps around the ring
++ for (i = 0; i != avg_n; ++i)
++ tsum += p->time_stash[(p->i3 - i) & 15];
++ for (i = 0; i != 9; ++i) {
++ if (time_thresholds[i] * 1000 * avg_n > tsum)
++ break;
++ }
++ return i;
++}
++#endif
++
++// End timing for phase
++// Stats only - no actual effect
++// Record exit from phase phase_no (only when OPT_PHASE_TIMING is set).
++// The time since the last timestamp is added to time_in_phase, stored in
++// the time_stash ring, compared against the running maximum, and counted in
++// the 1-, 3- and 5-sample histograms.
++static inline void tend_phase(RPI_T * const rpi, const int phase_no)
++{
++#if OPT_PHASE_TIMING
++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++ const uint64_t now = tus64();
++ const uint64_t in_time = now - p->phase_time;
++
++ p->time_in_phase += in_time;
++ p->phase_time = now;
++ p->time_stash[p->i3] = in_time;
++ // Remember the longest time seen and the phase's last_order at that point
++ if (in_time > p->max_phase_time) {
++ p->max_phase_time = in_time;
++ p->max_time_decode_order = p->last_order;
++ }
++ ++p->time_bins[tavg_bin_phase(p, 1)];
++ ++p->time_bins3[tavg_bin_phase(p, 3)];
++ ++p->time_bins5[tavg_bin_phase(p, 5)];
++
++ // Advance the 16-entry ring index
++ p->i3 = (p->i3 + 1) & 15;
++#endif
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Start frame
++
++// Begin a new frame on this thread's decode env.
++// Assigns the frame its decode order, lets the next frame thread proceed,
++// checks that the env is in a state from which a frame may start, and
++// resets the picture size in CTBs and both FIFOs.
++// buffer and size are not used.
++// Returns 0 on success, -1 if no env is available or the env is in an
++// unexpected state.
++static int rpi_hevc_start_frame(
++ AVCodecContext * avctx,
++ const uint8_t *buffer,
++ uint32_t size) {
++
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++ dec_env_t * const de = dec_env_get(avctx, rpi);
++ const HEVCContext * const s = avctx->priv_data;
++ const HEVCSPS * const sps = s->ps.sps;
++ const unsigned int CtbSizeY = 1U << sps->log2_ctb_size;
++
++#if TRACE_ENTRY
++ printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++ if (de == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++ return -1;
++ }
++
++ de->phase_no = RPIVID_PHASE_START;
++ de->decode_order = ++rpi->decode_order; // *** atomic?
++
++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
++
++ if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state);
++ // Drop the reference taken by dec_env_get(); without this the
++ // reference count can never return to zero after this error
++ dec_env_release(rpi, de);
++ return -1;
++ }
++ de->state = RPIVID_DECODE_START;
++
++ // Picture size in CTBs, rounded up
++ de->PicWidthInCtbsY = (sps->width + CtbSizeY - 1) / CtbSizeY; //7-15
++ de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY; //7-17
++ de->bit_len = 0;
++ de->cmd_len = 0;
++
++#if TRACE_ENTRY
++ printf(">>> %s[%p]\n", __func__, de);
++#endif
++
++ dec_env_release(rpi, de);
++ return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Slice messages
++
++// Append one 16-bit message to de->slice_msgs[] and bump the count.
++// NOTE(review): there is no bounds check here - confirm slice_msgs[] is sized
++// for the largest sequence pre_slice_decode() can generate.
++static void msg_slice(dec_env_t * const de, const uint16_t msg) {
++ de->slice_msgs[de->num_slice_msgs++] = msg;
++}
++
++// Queue the accumulated slice messages: first SLICECMDS with the message
++// count in the low byte and sliceid from bit 8, then each message (low 16
++// bits) at consecutive word addresses starting from 0x4000.
++static void program_slicecmds(dec_env_t * const de, const int sliceid) {
++    int msg_no = 0;
++
++    p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8));
++
++    while (msg_no < de->num_slice_msgs) {
++        p1_apb_write(de, 0x4000+4*msg_no, de->slice_msgs[msg_no] & 0xffff);
++        msg_no++;
++    }
++}
++
++// Build the list of slice messages (de->slice_msgs[]) for the current slice
++// and, when temporal MVP is enabled in the SPS, record the DPB index of the
++// collocated picture in de->dpbno_col.
++// Message order: the slice command word; for P/B slices, one description per
++// reference picture (with weights and offsets when weighted prediction is
++// on); then the deblock word and the QP offset word.
++static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) {
++ const HEVCSPS * const sps = s->ps.sps;
++ const HEVCPPS * const pps = s->ps.pps;
++ const SliceHeader *sh = &s->sh;
++
++ int weightedPredFlag, i, rIdx;
++ uint16_t cmd_slice;
++ unsigned int collocated_from_l0_flag;
++
++ de->num_slice_msgs=0;
++ de->dpbno_col = 0;
++ // Low bits of the command word encode the slice type: I=1, P=2, B=3
++ cmd_slice = 0;
++ if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1;
++ if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2;
++ if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3;
++
++ // Reference counts for both lists
++ if (sh->slice_type!=HEVC_SLICE_I) {
++ cmd_slice += sh->nb_refs[L0]<<2;
++ cmd_slice += sh->nb_refs[L1]<<6;
++ }
++
++ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B)
++ cmd_slice |= sh->max_num_merge_cand<<11;
++
++ // 0 without slice-level temporal MVP; otherwise for B slices it follows
++ // collocated_list, and it is 1 for P slices
++ collocated_from_l0_flag =
++ !sh->slice_temporal_mvp_enabled_flag ?
++ 0 :
++ sh->slice_type == HEVC_SLICE_B ?
++ (sh->collocated_list == L0) :
++ (sh->slice_type==HEVC_SLICE_P);
++ cmd_slice |= collocated_from_l0_flag<<14;
++
++ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) {
++
++ int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past
++ for(i=L0; i<=L1; i++) {
++ for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
++ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
++ HEVCFrame *c = s->ref; // CurrentPicture
++ // Any reference with a later POC than the current picture clears the flag
++ if (c->poc < f->poc) NoBackwardPredFlag = 0;
++ }
++ }
++
++ if (sps->sps_temporal_mvp_enabled_flag)
++ {
++ // List 0 unless this is a B slice whose collocated picture is in list 1
++ const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ?
++ s->ref->refPicList + 0 :
++ s->ref->refPicList + 1;
++ de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB;
++ }
++
++ cmd_slice += NoBackwardPredFlag<<10;
++ msg_slice(de, cmd_slice);
++
++ // Write reference picture descriptions
++ weightedPredFlag = sh->slice_type==HEVC_SLICE_P? pps->weighted_pred_flag : pps->weighted_bipred_flag;
++
++ for(i=L0; i<=L1; i++)
++ for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
++ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
++ HEVCFrame *c = s->ref; // CurrentPicture
++ int pic = f - s->DPB;
++ // Make sure pictures are in range 0 to 15
++ // (indices above the current picture's slot are shifted down by one)
++ int adjusted_pic = f<c? pic : pic-1;
++ int lt = s->ref->refPicList[i].isLongTerm[rIdx];
++ msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6));
++ msg_slice(de, f->poc);
++ if (weightedPredFlag) {
++ // Luma, then chroma component [0], then [1]: weight word followed by offset word.
++ // i selects the l0 or l1 tables.
++ msg_slice(de, s->sh.luma_log2_weight_denom+(((i?s-> sh.luma_weight_l1: s->sh.luma_weight_l0)[rIdx] &0x1ff)<<3));
++ msg_slice(de, (i?s-> sh.luma_offset_l1: s->sh.luma_offset_l0)[rIdx] & 0xff);
++ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3));
++ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff);
++ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3));
++ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff);
++ }
++ }
++ }
++ else
++ msg_slice(de, cmd_slice);
++
++ msg_slice(de, ((sh->beta_offset/2)&15)
++ + (((sh->tc_offset/2)&15) << 4)
++ + (sh->disable_deblocking_filter_flag << 8)
++ + (sh->slice_loop_filter_across_slices_enabled_flag << 9)
++ + (pps->loop_filter_across_tiles_enabled_flag << 10)); // CMD_DEBLOCK
++
++ msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF
++}
++
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Abandon the frame currently owned by this thread's decode env.
++// Logs according to the state the env was in, passes through any phases the
++// frame had not yet completed (so later frames are not left waiting), and
++// returns the env to RPIVID_DECODE_NEW.
++static void rpi_hevc_abort_frame(AVCodecContext * const avctx) {
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++ dec_env_t * const de = dec_env_get(avctx, rpi);
++
++#if TRACE_ENTRY
++ printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++ if (de == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++ return;
++ }
++
++ switch (de->state) {
++ case RPIVID_DECODE_NEW:
++ case RPIVID_DECODE_END:
++ // Expected transition
++ break;
++
++ case RPIVID_DECODE_SLICE:
++ // Error transition
++ av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n");
++ break;
++
++ case RPIVID_DECODE_START:
++ default:
++ // Newline added so the next log message starts on its own line
++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state);
++ break;
++ }
++
++ abort_phases(rpi, de);
++ de->state = RPIVID_DECODE_NEW;
++
++ dec_env_release(rpi, de);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// End frame
++
++// Finish the current frame: terminate the command list, then run the three
++// ordered hardware phases for it.
++//   Phase 0: copy the command FIFO and the bitstream segments into the next
++//            GPU-side bit buffer.
++//   Phase 1: run the command FIFO, growing the PU/coeff buffer and retrying
++//            if the hardware reports it ran out of space.
++//   Phase 2: program output, reference and collocated buffers and run the
++//            second hardware pass.
++// Returns 0 on success or a negative AVERROR code. On failure the frame is
++// marked broken and any phases not yet passed are stepped through so that
++// later frames are not blocked.
++static int rpi_hevc_end_frame(AVCodecContext * const avctx) {
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++ const HEVCContext * const s = avctx->priv_data;
++ const HEVCPPS * const pps = s->ps.pps;
++ const HEVCSPS * const sps = s->ps.sps;
++ dec_env_t * const de = dec_env_get(avctx, rpi);
++ AVFrame * const f = s->ref->frame;
++ const unsigned int dpbno_cur = s->ref - s->DPB;
++ vid_vc_addr_t cmds_vc;
++ vid_vc_addr_t pu_base_vc;
++ unsigned int pu_stride;
++ vid_vc_addr_t coeff_base_vc;
++ unsigned int coeff_stride;
++ unsigned int i;
++ int rv = 0;
++ int status = 0;
++ int coeffbuf_sem_claimed = 0;
++
++#if TRACE_ENTRY
++ // Was fprintf() with the format string in the FILE* position
++ printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++ if (de == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++ return AVERROR_BUG; // Should never happen
++ }
++
++ if (de->state != RPIVID_DECODE_SLICE) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
++ rv = AVERROR_UNKNOWN;
++ goto fail;
++ }
++ de->state = RPIVID_DECODE_END;
++
++ // End of command compilation
++ {
++ const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1;
++ const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1;
++ if (pps->entropy_coding_sync_enabled_flag) {
++ if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2)
++ wpp_pause(de, last_y);
++ }
++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
++ }
++
++ // Phase 0 ---------------------------------------------------------------
++
++ wait_phase(rpi, de, 0);
++ rpi_sem_wait(&rpi->bitbuf_sem);
++ tstart_phase(rpi, 0);
++
++ // Copy cmds & bits into gpu side buffer
++ // Layout: CMDS, BITS
++ {
++ uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm;
++ vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc;
++ unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD);
++
++ uint8_t * p = armbase + rnd64(cmd_bytes);
++ uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes;
++
++ cmds_vc = vcbase;
++
++ // The command block itself must fit before any bitstream is appended;
++ // previously it was copied without a size check
++ if (rnd64(cmd_bytes) > rpi->gbitbufs[rpi->bitbuf_no].numbytes)
++ status = -1;
++
++ // Copy all the bits & update bitstream cmds to point at the right bits
++ for (i = 0; status == 0 && i < de->bit_len; ++i)
++ {
++ const unsigned int seg_len = de->bit_fifo[i].len;
++
++ if (p + seg_len > eobits) {
++ status = -1;
++ break;
++ }
++
++ memcpy(p, de->bit_fifo[i].ptr, seg_len);
++ de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase);
++
++ p += rnd64(seg_len);
++ }
++
++ // Only publish the commands if everything fitted; on failure the
++ // buffer is not handed to the hardware
++ if (status == 0)
++ memcpy(armbase, de->cmd_fifo, cmd_bytes);
++ }
++
++ if (status == 0)
++ {
++ if (++rpi->bitbuf_no >= RPIVID_BITBUFS)
++ rpi->bitbuf_no = 0;
++ }
++ else
++ {
++ sem_post(&rpi->bitbuf_sem);
++ av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n");
++ rv = AVERROR_BUFFER_TOO_SMALL;
++ }
++
++ tend_phase(rpi, 0);
++ post_phase(rpi, de, 0);
++
++ if (status < 0)
++ goto fail;
++
++ // Phase 1 ---------------------------------------------------------------
++
++ wait_phase(rpi, de, 1);
++ rpi_sem_wait(&rpi->coeffbuf_sem);
++ coeffbuf_sem_claimed = 1;
++ tstart_phase(rpi, 1);
++
++ status = 0;
++ for (;;)
++ {
++ // (Re-)allocate PU/COEFF stream space
++ const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes;
++ unsigned int pu_size;
++
++ pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc;
++ pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY);
++ pu_size = pu_stride * de->PicHeightInCtbsY;
++
++ // Grow the buffer if the PU area alone does not fit, or if the
++ // previous attempt reported running out of coeff space (status -1)
++ if (pu_size >= total_size || status == -1) {
++ GPU_MEM_PTR_T newbuf;
++
++ if (gpu_malloc_uncached(round_up_size(total_size + 1), &newbuf) != 0)
++ {
++ av_log(avctx, AV_LOG_ERROR, "Failed to reallocate coeffbuf\n");
++ status = -1;
++ break;
++ }
++ gpu_free(rpi->gcoeffbufs + rpi->coeffbuf_no);
++ rpi->gcoeffbufs[rpi->coeffbuf_no] = newbuf;
++ status = 0;
++ continue;
++ }
++
++ // Allocate all remaining space to coeff
++ coeff_base_vc = pu_base_vc + pu_size;
++ coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63; // Round down to multiple of 64
++
++ apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc);
++ apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride);
++ apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc);
++ apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride);
++
++ // Trigger command FIFO
++ apb_write(rpi, RPI_CFNUM, de->cmd_len);
++#if TRACE_DEV && 0
++ apb_dump_regs(rpi, 0x0, 32);
++ apb_dump_regs(rpi, 0x8000, 24);
++ axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD));
++#endif
++ apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc);
++
++ int_wait(rpi, 1);
++
++ status = check_status(rpi, de);
++
++ if (status == -1)
++ continue;
++ else if (status != 1)
++ break;
++
++ // Status 1 means out of PU space so try again with more
++ // If we ran out of Coeff space then we are out of memory - we could possibly realloc?
++ rpi->max_pu_msgs += rpi->max_pu_msgs / 2;
++ }
++
++ // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we
++ // may reuse a live buffer when we kick the coeff sem
++ if (status == 0)
++ {
++ if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS)
++ rpi->coeffbuf_no = 0;
++ }
++ else
++ {
++ if (status == -1)
++ {
++ av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs);
++ rv = AVERROR_BUFFER_TOO_SMALL;
++ }
++ else
++ {
++ av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n");
++ rv = AVERROR_INVALIDDATA;
++ }
++ }
++
++ tend_phase(rpi, 1);
++ sem_post(&rpi->bitbuf_sem);
++ post_phase(rpi, de, 1);
++
++ if (status != 0)
++ goto fail;
++
++ // Phase 2 ---------------------------------------------------------------
++
++ wait_phase(rpi, de, 2);
++
++ if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0)
++ {
++ // As we are in phase 2 already here we don't need to worry about
++ // coeffbuf_no despite the early exit
++ post_phase(rpi, de, 2);
++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n");
++ goto fail;
++ }
++
++ tstart_phase(rpi, 2);
++
++ apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc);
++ apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride);
++ apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc);
++ apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride);
++
++ apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f));
++ apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f));
++ apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128);
++ apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128);
++
++ // Keep the last thing we resolved as fallback for any ref we fail to
++ // resolve. As a final fallback use our current frame. The pels might
++ // not be there yet but at least the memory is valid.
++ //
++ // Attempt to resolve the entire DPB - we could note what we have used
++ // in ref lists but probably simpler and more reliable to set the whole thing
++ {
++ AVFrame * fallback_frame = f;
++ for (i = 0; i != 16; ++i) {
++ // Avoid current frame
++ const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i;
++ AVFrame * fr = hevc_fr->frame;
++
++ if (fr != NULL &&
++ av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0)
++ {
++ fallback_frame = fr;
++ }
++ else
++ {
++ fr = fallback_frame;
++ }
++
++ apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr));
++ apb_write(rpi, 0x9004+16*i, 0);
++ apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr));
++ apb_write(rpi, 0x900C+16*i, 0);
++ }
++ }
++
++ apb_write(rpi, RPI_CONFIG2,
++ (sps->bit_depth << 0) // BitDepthY
++ + (sps->bit_depth << 4) // BitDepthC
++ + ((sps->bit_depth>8) << 8) // BitDepthY
++ + ((sps->bit_depth>8) << 9) // BitDepthC
++ + (sps->log2_ctb_size <<10)
++ + (pps->constrained_intra_pred_flag <<13)
++ + (sps->sps_strong_intra_smoothing_enable_flag<<14)
++ + (sps->sps_temporal_mvp_enabled_flag <<15)
++ + (pps->log2_parallel_merge_level <<16)
++ + (s->sh.slice_temporal_mvp_enabled_flag <<19)
++ + (sps->pcm.loop_filter_disable_flag <<20)
++ + ((pps->cb_qp_offset&31) <<21)
++ + ((pps->cr_qp_offset&31) <<26));
++
++ apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width);
++ apb_write(rpi, RPI_CURRPOC, s->poc);
++
++ // collocated reads/writes
++ if (sps->sps_temporal_mvp_enabled_flag) {
++ av_assert0(de->dpbno_col < RPIVID_COL_PICS);
++ av_assert0(dpbno_cur < RPIVID_COL_PICS);
++
++ apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride);
++ apb_write_vc_len(rpi, RPI_MVSTRIDE, rpi->col_stride);
++ apb_write_vc_addr(rpi, RPI_MVBASE, rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize);
++ apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize);
++ }
++
++#if TRACE_DEV && 0
++ apb_dump_regs(rpi, 0x0, 32);
++ apb_dump_regs(rpi, 0x8000, 24);
++#endif
++
++ apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY);
++ apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block
++
++ int_wait(rpi, 2);
++
++ tend_phase(rpi, 2);
++ coeffbuf_sem_claimed = 0;
++ sem_post(&rpi->coeffbuf_sem);
++ // Set valid here to avoid race in resolving in any pending phase 2
++ av_rpi_zc_set_valid_frame(f);
++
++ post_phase(rpi, de, 2);
++
++ // Flush frame for CPU access
++ // Arguably the best place would be at the start of phase 2 but here
++ // will overlap with the wait
++ //
++ // * Even better would be to have better lock/unlock control in ZC for external access
++ if (rpi->gpu_init_type == GPU_INIT_GPU) // * CMA is currently always uncached
++ {
++ rpi_cache_buf_t cbuf;
++ rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf);
++ rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE);
++ rpi_cache_flush_finish(fe);
++ }
++
++#if TRACE_ENTRY
++ printf(">>> %s[%p] OK\n", __func__, de);
++#endif
++
++ dec_env_release(rpi, de);
++ return 0;
++
++fail:
++ av_rpi_zc_set_broken_frame(f);
++ // Give back the coeff buffer semaphore if we were holding it when we failed
++ if (coeffbuf_sem_claimed)
++ sem_post(&rpi->coeffbuf_sem);
++ abort_phases(rpi, de); // Dummy any unresolved phases
++
++#if TRACE_ENTRY
++ printf(">>> %s[%p] FAIL\n", __func__, de);
++#endif
++
++ dec_env_release(rpi, de);
++ return rv;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++
++#if TRACE_DEV
++// Hex-dump len bytes starting at p to stdout, up to 16 bytes per row.
++// Each row starts with the offset of its first byte; a '-' precedes the
++// ninth byte of a row and a space precedes every other byte.
++static void dump_data(const uint8_t * p, size_t len)
++{
++    size_t i;
++    for (i = 0; i < len; i += 16) {
++        size_t j;
++        // i is a size_t, which plain %x does not match
++        printf("%04zx", i);
++        // Stop at len so a final partial row does not read past the buffer
++        for (j = 0; j != 16 && i + j < len; ++j) {
++            // The separator depends on the column (j); the row offset i is
++            // always a multiple of 16 and so could never equal 8
++            printf("%c%02x", j == 8 ? '-' : ' ', p[i+j]);
++        }
++        printf("\n");
++    }
++}
++#endif
++
++#if OPT_EMU
++// Advance b by idx steps and return the resulting pointer.
++// Each step consumes one byte; when that byte is the second (or later)
++// consecutive zero and the byte after it is 3, the 3 is skipped as well
++// without counting as a step, and the zero run is reset.
++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
++{
++    unsigned int zero_run = 0;
++
++    for (; idx != 0; --idx) {
++        if (*b++ != 0) {
++            zero_run = 0;
++            continue;
++        }
++        if (++zero_run >= 2 && *b == 3) {
++            ++b;
++            zero_run = 0;
++        }
++    }
++    return b;
++}
++#endif
++
++// Queue the remaining slice bitstream for the hardware.
++// Records the data pointer and length in the bit FIFO, tied to a BFBASE
++// command whose data is written as 0 here and filled in later, then queues
++// BFNUM and two BFCONTROL writes.
++// With OPT_EMU the data is located in de->nal_buffer via ptr_from_index();
++// otherwise it is taken from the GetBitContext buffer at the current byte.
++static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) {
++ const int rpi_use_emu = OPT_EMU; // FFmpeg removes emulation prevention bytes
++ const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware
++ const GetBitContext *gb = &s->HEVClc->gb;
++
++#if OPT_EMU
++ const uint8_t *ptr = ptr_from_index(de->nal_buffer, gb->index/8 + 1);
++ const int len = de->nal_size - (ptr - de->nal_buffer);
++#else
++ // From the byte holding the current bit position through the byte at size_in_bits/8
++ const int len = 1 + gb->size_in_bits/8 - gb->index/8;
++ const void *ptr = &gb->buffer[gb->index/8];
++#endif
++
++#if TRACE_DEV
++ printf("Index=%d, /8=%#x\n", gb->index, gb->index/8);
++ dump_data(de->nal_buffer, 128);
++#endif
++
++ // p1_apb_write() returns the command index, which p1_axi_write() stores with the segment
++ p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later
++ p1_apb_write(de, RPI_BFNUM, len);
++ p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop
++ p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Wavefront mode
++
++// Queue all commands for one slice segment in wavefront mode.
++// ctb_addr_ts is the tile-scan address of the segment's first CTB. After
++// the common per-slice setup, one entry point is programmed for the start
++// of the segment and one for each of its num_entry_point_offsets.
++static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts)
++{
++ const HEVCPPS * const pps = s->ps.pps;
++
++ int i, resetQPY=1;
++ int indep = !s->sh.dependent_slice_segment_flag;
++ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
++
++ // Anything other than the first CTB of the picture has a previous slice to close
++ if (ctb_addr_ts)
++ wpp_end_previous_slice(de, s, ctb_addr_ts);
++ pre_slice_decode(de, s);
++ WriteBitstream(de, s);
++ // Probabilities: fresh table for the first CTB, an independent segment or
++ // a one-CTB-wide picture; reload at the start of a row; otherwise leave
++ // them alone and do not reset QP either
++ if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1)
++ WriteProb(de, s);
++ else if (ctb_col==0)
++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
++ else
++ resetQPY=0;
++ program_slicecmds(de, s->slice_idx);
++ new_slice_segment(de, s);
++ wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts);
++ for (i=0; i<s->sh.num_entry_point_offsets; i++) {
++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
++ int last_x = de->PicWidthInCtbsY-1;
++ // The probability hand-over between rows depends on the picture width in CTBs
++ if (de->PicWidthInCtbsY>2)
++ wpp_pause(de, ctb_row);
++ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2);
++ if (de->PicWidthInCtbsY==2)
++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++ if (de->PicWidthInCtbsY==1)
++ WriteProb(de, s);
++ else
++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
++ // Step forward by the width of tile column 0 to reach the next entry point
++ ctb_addr_ts += pps->column_width[0];
++ wpp_entry_point(de, s, 0, 1, ctb_addr_ts);
++ }
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Tiles mode
++
++// Tiles-mode slice programming. Emits the commands for the slice segment
++// that starts at ctb_addr_ts, then one further entry point for each of
++// sh.num_entry_point_offsets, advancing ctb_addr_ts by the CTB count of the
++// tile just handled.
++static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
++ const HEVCPPS * const pps = s->ps.pps;
++ int i, resetQPY;
++
++ // A non-zero start address means an earlier slice has to be closed off first
++ if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts);
++ pre_slice_decode(de, s);
++ WriteBitstream(de, s);
++ // Reset at address 0, when this CTB lies in a different tile from the
++ // previous one, or for an independent slice segment. The ==0 test comes
++ // first so tile_id[ctb_addr_ts-1] is never read at index -1.
++ resetQPY = ctb_addr_ts==0
++ || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1]
++ || !s->sh.dependent_slice_segment_flag;
++ if (resetQPY) WriteProb(de, s);
++ program_slicecmds(de, s->slice_idx);
++ new_slice_segment(de, s);
++ new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts);
++ for (i=0; i<s->sh.num_entry_point_offsets; i++) {
++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY;
++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
++ // Locate the tile holding the current entry point and its last column/row
++ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
++ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
++ int last_x = pps->col_bd[tile_x+1]-1;
++ int last_y = pps->row_bd[tile_y+1]-1;
++ // Status word packs the tile's last column (<<5) and last row (<<18)
++ p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18));
++ WriteProb(de, s);
++ ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y];
++ new_entry_point(de, s, 0, 1, ctb_addr_ts);
++ }
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Skip one bit, byte-align the slice bit reader and (re)initialise the CABAC
++// decoder on whatever data is left. Returns ff_init_cabac_decoder()'s result.
++static int cabac_start_align(HEVCContext *s)
++{
++    GetBitContext * const bits = &s->HEVClc->gb;
++    const uint8_t *start;
++    int nbytes;
++
++    skip_bits(bits, 1);
++    align_get_bits(bits);
++
++    // Should look at getting rid of this
++    start  = bits->buffer + get_bits_count(bits) / 8;
++    nbytes = (get_bits_left(bits) + 7) / 8;
++    return ff_init_cabac_decoder(&s->HEVClc->cc, start, nbytes);
++}
++
++// hwaccel decode_slice callback. Records the NAL buffer for this slice in the
++// thread's decode environment and programs the slice, using the wavefront
++// path when pps->entropy_coding_sync_enabled_flag is set and the tiles path
++// otherwise.
++//
++// Returns 0 on success, or -1 if no decode environment could be obtained or
++// the environment is not in the DECODE_START / DECODE_SLICE state.
++static int rpi_hevc_decode_slice(
++    AVCodecContext *avctx,
++    const uint8_t *buffer,
++    uint32_t size)
++{
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    HEVCContext * const s = avctx->priv_data;
++    dec_env_t * const de = dec_env_get(avctx, rpi);
++    const HEVCPPS *pps = s->ps.pps;
++    int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
++
++#if TRACE_ENTRY
++    printf("<<< %s[%p]\n", __func__, de);
++#endif
++    if (de == NULL) {
++        // NOTE(review): whether a failed dec_env_get() still needs a matching
++        // dec_env_release() cannot be told from here - confirm against dec_env_get
++        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++        return -1;
++    }
++
++    if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
++        // Balance the successful dec_env_get() above, as the normal exit does
++        dec_env_release(rpi, de);
++        return -1;
++    }
++    de->state = RPIVID_DECODE_SLICE;
++
++    de->nal_buffer = buffer;
++    de->nal_size   = size;
++
++#if !OPT_EMU
++//    ff_hevc_cabac_init(s, ctb_addr_ts);
++    cabac_start_align(s);
++#endif
++    if (s->ps.sps->scaling_list_enable_flag)
++        populate_scaling_factors(de, s);
++
++    if (pps->entropy_coding_sync_enabled_flag)
++        wpp_decode_slice(de, s, ctb_addr_ts);
++    else
++        decode_slice(de, s, ctb_addr_ts);
++
++#if TRACE_ENTRY
++    printf(">>> %s[%p]\n", __func__, de);
++#endif
++    dec_env_release(rpi, de);
++    return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Frame post-process hook: resolves the frame with ZC_RESOLVE_WAIT_VALID and
++// passes the result code back, logging an error when it is non-zero.
++static int rpivid_retrieve_data(void *logctx, AVFrame *frame)
++{
++    const int err = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID);
++
++    if (err != 0)
++        av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n");
++    return err;
++}
++
++// hwaccel alloc_frame callback. Gets a frame buffer from the user's zero-copy
++// environment when one is in use, otherwise from an internal environment
++// that is created on first use, then attaches decode data to the frame and
++// installs rpivid_retrieve_data as its post_process hook.
++//
++// Returns 0 on success, AVERROR(ENOMEM) if the internal environment cannot be
++// allocated, or the error from the buffer / decode-data helpers.
++static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    HEVCContext * const s = avctx->priv_data;
++    // Frame buffering + 1 output.  Would need thread_count extra but we now
++    // alloc at the start of phase 2 so that is the only thread we need the
++    // extra buffer for.
++    const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1;
++    int rv;
++
++    if (av_rpi_zc_in_use(avctx))
++    {
++        const AVZcEnvPtr zc = avctx->opaque;
++        av_rpi_zc_set_decoder_pool_size(zc, pool_req);
++        rv = av_rpi_zc_get_buffer(zc, frame);   // get_buffer2 would alloc
++    }
++    else
++    {
++        if (rpi->zc == NULL) {
++            pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
++            // Alloc inside lock to make sure we only ever alloc one
++            if (rpi->zc == NULL) {
++                rpi->zc = av_rpi_zc_int_env_alloc(s);
++            }
++            pthread_mutex_unlock(&rpi->phase_lock);
++        }
++
++        if (rpi->zc == NULL) {
++            // Allocation failed: never hand a NULL env to the zc helpers
++            rv = AVERROR(ENOMEM);
++        }
++        else {
++            av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-)
++            rv = av_rpi_zc_get_buffer(rpi->zc, frame);
++        }
++    }
++
++    // Drop the freshly obtained buffer again if decode data cannot be attached
++    if (rv == 0 &&
++        (rv = ff_attach_decode_data(frame)) < 0)
++    {
++        av_frame_unref(frame);
++    }
++
++    if (rv == 0)
++    {
++        FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
++        fdd->post_process = rpivid_retrieve_data;
++    }
++
++    return rv;
++}
++
++#if OPT_PHASE_TIMING
++// Log the nine counters bins[0]..bins[8] on a single INFO line.
++static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins)
++{
++    av_log(avctx, AV_LOG_INFO,
++           "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n",
++           bins[0], bins[1], bins[2],
++           bins[3], bins[4], bins[5],
++           bins[6], bins[7], bins[8]);
++}
++#endif
++
++//////////////////////////////////////////////////////////////////////////////
++
++// hwaccel uninit callback, also used as the failure path of rpi_hevc_init.
++// Drops this caller's reference, waits up to 2 seconds for ref_zero to be
++// posted, then frees the decode environments, GPU buffers, device mappings,
++// mailbox and semaphores.
++//
++// Returns 0 on success, or -1 (releasing nothing) if the wait times out.
++static int rpi_hevc_free(AVCodecContext *avctx) {
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++
++#if TRACE_ENTRY
++    printf("<<< %s\n", __func__);
++#endif
++
++    dec_env_release(rpi, NULL);
++
++    // Wait for everything else to stop
++    {
++        struct timespec tt;
++        clock_gettime(CLOCK_REALTIME, &tt);
++        tt.tv_sec += 2;
++        while (sem_timedwait(&rpi->ref_zero, &tt) == -1) {
++            const int err = errno;
++            if (err == ETIMEDOUT) {
++                av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n");
++                return -1;
++            }
++            // EINTR just retries the wait; anything else gives up waiting
++            if (err != EINTR) {
++                av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err);
++                break;
++            }
++        }
++    }
++
++#if OPT_PHASE_TIMING
++    {
++        unsigned int i;
++        for (i = 0; i != RPIVID_PHASES; ++i) {
++            const phase_wait_env_t * const p = rpi->phase_reqs + i;
++            av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i,
++                   (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000),
++                   (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000));
++            av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d        >\n",
++                   time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3],
++                   time_thresholds[4], time_thresholds[5], time_thresholds[6], time_thresholds[7]);
++            log_bin_phase(avctx, p->time_bins);
++            log_bin_phase(avctx, p->time_bins3);
++            log_bin_phase(avctx, p->time_bins5);
++            av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n",
++                   (unsigned int)(p->max_phase_time / 1000),
++                   p->max_time_decode_order);
++        }
++        av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs);
++    }
++#endif
++
++    if (rpi->dec_envs != NULL)
++    {
++        // The array is filled from the front, so stop at the first empty
++        // slot. The index must start at 0: it was previously uninitialised.
++        for (int i = 0; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) {
++            dec_env_delete(rpi->dec_envs[i]);
++        }
++        av_freep(&rpi->dec_envs);
++    }
++
++    av_rpi_zc_int_env_freep(&rpi->zc);
++
++    gpu_free(&rpi->gcolbuf);
++
++    for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
++        gpu_free(rpi->gbitbufs + i);
++    }
++    for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
++        gpu_free(rpi->gcoeffbufs + i);
++    }
++
++    unmap_devp(&rpi->regs, REGS_SIZE);
++    unmap_devp(&rpi->ints, INTS_SIZE);
++
++    if (rpi->gpu_init_type > 0)
++        rpi_mem_gpu_uninit();
++
++    if (rpi->mbox_fd >= 0) {
++        mbox_release_clock(rpi->mbox_fd);
++        mbox_close(rpi->mbox_fd);
++    }
++
++    sem_destroy(&rpi->ref_zero);
++    sem_destroy(&rpi->coeffbuf_sem);
++    sem_destroy(&rpi->bitbuf_sem);
++
++#if TRACE_ENTRY
++    printf(">>> %s\n", __func__);
++#endif
++    return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// hwaccel init callback. Rejects pictures larger than 4096x4096, then zeroes
++// the private context and sets up its semaphores and mutex, the mailbox,
++// the register/interrupt mappings, GPU memory, the per-thread decode env
++// array and the col-mv, bitstream and coefficient buffers.
++//
++// Returns 0 on success, AVERROR(ENOTSUP) for an oversized picture, or
++// AVERROR_EXTERNAL after tearing down via rpi_hevc_free() on any other failure.
++static int rpi_hevc_init(AVCodecContext *avctx) {
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++//    const char *err;
++
++#if TRACE_ENTRY
++    printf("<<< %s\n", __func__);
++#endif
++
++    if (avctx->width>4096 || avctx->height>4096) {
++        // Log against avctx like every other message in this function
++        av_log(avctx, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height);
++        return AVERROR(ENOTSUP);
++    }
++
++    memset(rpi, 0, sizeof(*rpi));
++
++    // Set before anything can fail so that rpi_hevc_free() sees "not open"
++    rpi->mbox_fd = -1;
++    rpi->decode_order = 0;
++
++    // Initial PU/COEFF stream buffer split chosen as worst case seen so far
++    rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU
++
++
++    // This reference is the one rpi_hevc_free() drops before waiting on ref_zero
++    atomic_store(&rpi->ref_count, 1);
++    sem_init(&rpi->ref_zero, 0, 0);
++
++    sem_init(&rpi->bitbuf_sem,   0, RPIVID_BITBUFS);
++    sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS);
++
++    pthread_mutex_init(&rpi->phase_lock, NULL);
++
++    if ((rpi->mbox_fd = mbox_open()) < 0)
++    {
++        av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n");
++        goto fail;
++    }
++    mbox_request_clock(rpi->mbox_fd);
++
++    if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL ||
++        (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n");
++        goto fail;
++    }
++
++    if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n");
++        goto fail;
++    }
++
++    if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count);
++        goto fail;
++    }
++
++    // Stride is rnd64(width); picture size uses the height rounded up to a
++    // multiple of 64 and then divided by 16
++    rpi->col_stride = rnd64(avctx->width);
++    rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4);
++    if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0)
++    {
++        av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n");
++        goto fail;
++    }
++
++    for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
++        if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0)
++        {
++            av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i);
++            goto fail;
++        }
++    }
++
++    for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
++        if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0)
++        {
++            av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i);
++            goto fail;
++        }
++    }
++
++    av_log(avctx, AV_LOG_INFO, "RPI HEVC h/w accel init OK\n");
++
++    return 0;
++
++fail:
++    rpi_hevc_free(avctx);
++    return AVERROR_EXTERNAL;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// hwaccel descriptor for HEVC with pixel format AV_PIX_FMT_RPI4_8. It uses
++// the same callbacks, private data size and caps as ff_hevc_rpi4_10_hwaccel;
++// only .name and .pix_fmt differ.
++const AVHWAccel ff_hevc_rpi4_8_hwaccel = {
++ .name = "hevc_rpi4_8",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .pix_fmt = AV_PIX_FMT_RPI4_8,
++ .alloc_frame = rpivid_hevc_alloc_frame,
++ .start_frame = rpi_hevc_start_frame,
++ .end_frame = rpi_hevc_end_frame,
++ .abort_frame = rpi_hevc_abort_frame,
++ .decode_slice = rpi_hevc_decode_slice,
++ .init = rpi_hevc_init,
++ .uninit = rpi_hevc_free,
++ .priv_data_size = sizeof(RPI_T),
++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
++
++// hwaccel descriptor for HEVC with pixel format AV_PIX_FMT_RPI4_10. It uses
++// the same callbacks, private data size and caps as ff_hevc_rpi4_8_hwaccel;
++// only .name and .pix_fmt differ.
++const AVHWAccel ff_hevc_rpi4_10_hwaccel = {
++ .name = "hevc_rpi4_10",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .pix_fmt = AV_PIX_FMT_RPI4_10,
++ .alloc_frame = rpivid_hevc_alloc_frame,
++ .start_frame = rpi_hevc_start_frame,
++ .end_frame = rpi_hevc_end_frame,
++ .abort_frame = rpi_hevc_abort_frame,
++ .decode_slice = rpi_hevc_decode_slice,
++ .init = rpi_hevc_init,
++ .uninit = rpi_hevc_free,
++ .priv_data_size = sizeof(RPI_T),
++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
++
+--- a/libavcodec/v4l2_buffers.c
++++ b/libavcodec/v4l2_buffers.c
+@@ -21,6 +21,7 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include <drm_fourcc.h>
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
+ #include <sys/mman.h>
+@@ -29,57 +30,82 @@
+ #include <poll.h>
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
++#include "libavutil/avassert.h"
+ #include "libavutil/pixdesc.h"
++#include "libavutil/hwcontext.h"
+ #include "v4l2_context.h"
+ #include "v4l2_buffers.h"
+ #include "v4l2_m2m.h"
++#include "weak_link.h"
+
+ #define USEC_PER_SEC 1000000
+-static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
++static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
+
+-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
++/* Return the V4L2m2mContext that holds buf->context as its output member
++ * (when the context type is an OUTPUT type) or as its capture member. */
++static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
+ {
+ return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
+ container_of(buf->context, V4L2m2mContext, output) :
+ container_of(buf->context, V4L2m2mContext, capture);
+ }
+
+-static inline AVCodecContext *logger(V4L2Buffer *buf)
++/* Return the AVCodecContext of the m2m context owning buf, for use as the
++ * av_log() context. */
++static inline AVCodecContext *logger(const V4L2Buffer * const buf)
+ {
+ return buf_to_m2mctx(buf)->avctx;
+ }
+
+-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
++/* Pick the timebase for avbuf's codec context: pkt_timebase when its
++ * numerator is non-zero, otherwise time_base. If the chosen timebase has a
++ * zero numerator or denominator, fall back to the microsecond v4l2_timebase. */
++static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
+ {
+- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+-
+- if (s->avctx->pkt_timebase.num)
+- return s->avctx->pkt_timebase;
+- return s->avctx->time_base;
++ const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
++ const AVRational tb = s->avctx->pkt_timebase.num ?
++ s->avctx->pkt_timebase :
++ s->avctx->time_base;
++ return tb.num && tb.den ? tb : v4l2_timebase;
+ }
+
+-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
++/* Split a microsecond count t into a struct timeval: whole seconds in
++ * tv_sec and the remainder in tv_usec. */
++static inline struct timeval tv_from_int(const int64_t t)
+ {
+- int64_t v4l2_pts;
++ return (struct timeval){
++ .tv_usec = t % USEC_PER_SEC,
++ .tv_sec = t / USEC_PER_SEC
++ };
++}
+
+- if (pts == AV_NOPTS_VALUE)
+- pts = 0;
++/* Collapse a struct timeval into a single signed count of microseconds. */
++static inline int64_t int_from_tv(const struct timeval t)
++{
++    const int64_t whole_secs_us = (int64_t)t.tv_sec * USEC_PER_SEC;
++
++    return whole_secs_us + t.tv_usec;
++}
+
++/* Store pts in out's V4L2 timestamp. The value is rescaled from the buffer's
++ * timebase to microseconds; AV_NOPTS_VALUE is stored as 0. */
++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
++{
+ /* convert pts to v4l2 timebase */
+- v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
+- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
+- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
++ const int64_t v4l2_pts =
++ pts == AV_NOPTS_VALUE ? 0 :
++ av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
++ out->buf.timestamp = tv_from_int(v4l2_pts);
+ }
+
+-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
++/* Return avbuf's V4L2 timestamp as microseconds, with no rescaling; a
++ * timestamp of 0 is reported as AV_NOPTS_VALUE. The rescaling variant below
++ * is compiled out by #if 0. */
++static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
+ {
+- int64_t v4l2_pts;
+-
++ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
++ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
++#if 0
+ /* convert pts back to encoder timebase */
+- v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
+- avbuf->buf.timestamp.tv_usec;
++ return
++ avbuf->context->no_pts_rescale ? v4l2_pts :
++ v4l2_pts == 0 ? AV_NOPTS_VALUE :
++ av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
++#endif
++}
+
+- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
++/* Record bytesused and length for one plane of out: in planes[plane] for a
++ * multiplanar buffer type, otherwise in the v4l2_buffer itself (plane is
++ * then ignored). */
++static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
++{
++    if (!V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
++        out->buf.bytesused = bytesused;
++        out->buf.length = length;
++        return;
++    }
++
++    out->planes[plane].bytesused = bytesused;
++    out->planes[plane].length = length;
+ }
+
+ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
+@@ -116,49 +142,176 @@ static enum AVColorPrimaries v4l2_get_co
+ return AVCOL_PRI_UNSPECIFIED;
+ }
+
+-static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
+-{
+- enum v4l2_quantization qt;
++/* Translate FFmpeg colour properties into V4L2 ones and store them in the
++ * format of buf's context (pix_mp for multiplanar types, pix otherwise).
++ *
++ * avcp (primaries) selects a colorspace and, for BT709/BT470M, a Y'CbCr
++ * encoding; avcs (matrix) then overrides the colorspace where it maps to
++ * one; avxc (transfer) selects the transfer function. Anything without a
++ * mapping is left at the V4L2 *_DEFAULT value. */
++static void v4l2_set_color(V4L2Buffer *buf,
++ const enum AVColorPrimaries avcp,
++ const enum AVColorSpace avcs,
++ const enum AVColorTransferCharacteristic avxc)
++{
++ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
++ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
++ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
++
++ switch (avcp) {
++ case AVCOL_PRI_BT709:
++ cs = V4L2_COLORSPACE_REC709;
++ ycbcr = V4L2_YCBCR_ENC_709;
++ break;
++ case AVCOL_PRI_BT470M:
++ cs = V4L2_COLORSPACE_470_SYSTEM_M;
++ ycbcr = V4L2_YCBCR_ENC_601;
++ break;
++ case AVCOL_PRI_BT470BG:
++ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++ break;
++ case AVCOL_PRI_SMPTE170M:
++ cs = V4L2_COLORSPACE_SMPTE170M;
++ break;
++ case AVCOL_PRI_SMPTE240M:
++ cs = V4L2_COLORSPACE_SMPTE240M;
++ break;
++ case AVCOL_PRI_BT2020:
++ cs = V4L2_COLORSPACE_BT2020;
++ break;
++ case AVCOL_PRI_SMPTE428:
++ case AVCOL_PRI_SMPTE431:
++ case AVCOL_PRI_SMPTE432:
++ case AVCOL_PRI_EBU3213:
++ case AVCOL_PRI_RESERVED:
++ case AVCOL_PRI_FILM:
++ case AVCOL_PRI_UNSPECIFIED:
++ default:
++ break;
++ }
+
+- qt = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+- buf->context->format.fmt.pix_mp.quantization :
+- buf->context->format.fmt.pix.quantization;
++ switch (avcs) {
++ case AVCOL_SPC_RGB:
++ cs = V4L2_COLORSPACE_SRGB;
++ break;
++ case AVCOL_SPC_BT709:
++ cs = V4L2_COLORSPACE_REC709;
++ break;
++ case AVCOL_SPC_FCC:
++ cs = V4L2_COLORSPACE_470_SYSTEM_M;
++ break;
++ case AVCOL_SPC_BT470BG:
++ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++ break;
++ case AVCOL_SPC_SMPTE170M:
++ cs = V4L2_COLORSPACE_SMPTE170M;
++ break;
++ case AVCOL_SPC_SMPTE240M:
++ cs = V4L2_COLORSPACE_SMPTE240M;
++ break;
++ case AVCOL_SPC_BT2020_CL:
++ cs = V4L2_COLORSPACE_BT2020;
++ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
++ break;
++ case AVCOL_SPC_BT2020_NCL:
++ cs = V4L2_COLORSPACE_BT2020;
++ break;
++ default:
++ break;
++ }
+
+- switch (qt) {
+- case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG;
+- case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG;
++ /* Switch on the requested AV transfer characteristic (avxc). Switching on
++ * the local xfer, which is still V4L2_XFER_FUNC_DEFAULT here, could never
++ * select the caller's transfer function. */
++ switch (avxc) {
++ case AVCOL_TRC_BT709:
++ xfer = V4L2_XFER_FUNC_709;
++ break;
++ case AVCOL_TRC_IEC61966_2_1:
++ xfer = V4L2_XFER_FUNC_SRGB;
++ break;
++ case AVCOL_TRC_SMPTE240M:
++ xfer = V4L2_XFER_FUNC_SMPTE240M;
++ break;
++ case AVCOL_TRC_SMPTE2084:
++ xfer = V4L2_XFER_FUNC_SMPTE2084;
++ break;
+ default:
+ break;
+ }
+
+- return AVCOL_RANGE_UNSPECIFIED;
++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
++ buf->context->format.fmt.pix_mp.colorspace = cs;
++ buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr;
++ buf->context->format.fmt.pix_mp.xfer_func = xfer;
++ } else {
++ buf->context->format.fmt.pix.colorspace = cs;
++ buf->context->format.fmt.pix.ycbcr_enc = ycbcr;
++ buf->context->format.fmt.pix.xfer_func = xfer;
++ }
+ }
+
+-static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
++/* Read the quantization field of buf's context format, from pix_mp for
++ * multiplanar buffer types and from pix otherwise. */
++static inline enum v4l2_quantization
++buf_quantization(const V4L2Buffer * const buf)
+ {
+- enum v4l2_ycbcr_encoding ycbcr;
+- enum v4l2_colorspace cs;
++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
++ buf->context->format.fmt.pix_mp.quantization :
++ buf->context->format.fmt.pix.quantization;
++}
+
+- cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
++/* Read the colorspace field of buf's context format, from pix_mp for
++ * multiplanar buffer types and from pix otherwise. */
++static inline enum v4l2_colorspace
++buf_colorspace(const V4L2Buffer * const buf)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+ buf->context->format.fmt.pix_mp.colorspace :
+ buf->context->format.fmt.pix.colorspace;
++}
+
+- ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
++/* Read the ycbcr_enc field of buf's context format, from pix_mp for
++ * multiplanar buffer types and from pix otherwise. */
++static inline enum v4l2_ycbcr_encoding
++buf_ycbcr_enc(const V4L2Buffer * const buf)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+ buf->context->format.fmt.pix_mp.ycbcr_enc:
+ buf->context->format.fmt.pix.ycbcr_enc;
++}
+
+- switch(cs) {
+- case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB;
++/* Map the V4L2 quantization of buf's format to an AVColorRange. Limited
++ * and full range map directly; the default quantization is treated as full
++ * range only for the JPEG colorspace; anything else is unspecified. */
++static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
++{
++    const enum v4l2_quantization quant = buf_quantization(buf);
++
++    if (quant == V4L2_QUANTIZATION_LIM_RANGE)
++        return AVCOL_RANGE_MPEG;
++
++    if (quant == V4L2_QUANTIZATION_FULL_RANGE)
++        return AVCOL_RANGE_JPEG;
++
++    if (quant == V4L2_QUANTIZATION_DEFAULT) {
++        // If YUV (which we assume for all video decode) then, from the header
++        // comments, range is limited unless CS is JPEG
++        if (buf_colorspace(buf) == V4L2_COLORSPACE_JPEG)
++            return AVCOL_RANGE_JPEG;
++        return AVCOL_RANGE_MPEG;
++    }
++
++    return AVCOL_RANGE_UNSPECIFIED;
++}
++
++/* Store the V4L2 quantization matching avcr in the format of buf's context:
++ * MPEG -> limited range, JPEG -> full range, anything else -> default. */
++static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr)
++{
++    enum v4l2_quantization quant;
++
++    switch (avcr) {
++    case AVCOL_RANGE_MPEG:
++        quant = V4L2_QUANTIZATION_LIM_RANGE;
++        break;
++    case AVCOL_RANGE_JPEG:
++        quant = V4L2_QUANTIZATION_FULL_RANGE;
++        break;
++    default:
++        quant = V4L2_QUANTIZATION_DEFAULT;
++        break;
++    }
++
++    if (!V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type))
++        buf->context->format.fmt.pix.quantization = quant;
++    else
++        buf->context->format.fmt.pix_mp.quantization = quant;
++}
++
++/* Map the V4L2 colorspace of buf's format to an AVColorSpace. JPEG is
++ * treated like SRGB; for BT2020 the Y'CbCr encoding chooses between the
++ * constant and non-constant luminance variants. */
++static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
++{
++ switch (buf_colorspace(buf)) {
++ case V4L2_COLORSPACE_JPEG: // JPEG -> SRGB
++ case V4L2_COLORSPACE_SRGB:
++ return AVCOL_SPC_RGB;
+ case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709;
+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC;
+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG;
+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M;
+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M;
+ case V4L2_COLORSPACE_BT2020:
+- if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM)
+- return AVCOL_SPC_BT2020_CL;
+- else
+- return AVCOL_SPC_BT2020_NCL;
++ return buf_ycbcr_enc(buf) == V4L2_YCBCR_ENC_BT2020_CONST_LUM ?
++ AVCOL_SPC_BT2020_CL : AVCOL_SPC_BT2020_NCL;
+ default:
+ break;
+ }
+@@ -168,17 +321,9 @@ static enum AVColorSpace v4l2_get_color_
+
+ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
+ {
+- enum v4l2_ycbcr_encoding ycbcr;
++ const enum v4l2_ycbcr_encoding ycbcr = buf_ycbcr_enc(buf);
+ enum v4l2_xfer_func xfer;
+- enum v4l2_colorspace cs;
+-
+- cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+- buf->context->format.fmt.pix_mp.colorspace :
+- buf->context->format.fmt.pix.colorspace;
+-
+- ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+- buf->context->format.fmt.pix_mp.ycbcr_enc:
+- buf->context->format.fmt.pix.ycbcr_enc;
++ const enum v4l2_colorspace cs = buf_colorspace(buf);
+
+ xfer = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+ buf->context->format.fmt.pix_mp.xfer_func:
+@@ -210,73 +355,165 @@ static enum AVColorTransferCharacteristi
+ return AVCOL_TRC_UNSPECIFIED;
+ }
+
+-static void v4l2_free_buffer(void *opaque, uint8_t *unused)
++/* Non-zero when V4L2_FIELD_IS_INTERLACED() holds for the buffer's field. */
++static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf)
+ {
+- V4L2Buffer* avbuf = opaque;
+- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+-
+- if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
+- atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
++ return V4L2_FIELD_IS_INTERLACED(buf->buf.field);
++}
+
+- if (s->reinit) {
+- if (!atomic_load(&s->refcount))
+- sem_post(&s->refsync);
+- } else {
+- if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) {
+- /* no need to queue more buffers to the driver */
+- avbuf->status = V4L2BUF_AVAILABLE;
+- }
+- else if (avbuf->context->streamon)
+- ff_v4l2_buffer_enqueue(avbuf);
+- }
++/* Non-zero only when the buffer's field is exactly V4L2_FIELD_INTERLACED_TB. */
++static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
++{
++    const int top_first = (buf->buf.field == V4L2_FIELD_INTERLACED_TB);
++
++    return top_first;
++}
+
+- av_buffer_unref(&avbuf->context_ref);
+- }
++/* Set buf's field: NONE when not interlaced, otherwise INTERLACED_TB or
++ * INTERLACED_BT according to is_tff. */
++static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff)
++{
++    if (!is_interlaced)
++        buf->buf.field = V4L2_FIELD_NONE;
++    else if (is_tff)
++        buf->buf.field = V4L2_FIELD_INTERLACED_TB;
++    else
++        buf->buf.field = V4L2_FIELD_INTERLACED_BT;
+ }
+
+-static int v4l2_buf_increase_ref(V4L2Buffer *in)
++/* Fill in avbuf->drm_frame (object count, a single layer and its planes)
++ * from the buffer's plane info and the context's pixel format, and return a
++ * pointer to that descriptor cast to uint8_t *.
++ *
++ * For NV12/NV21 and YUV420P held in one V4L2 plane, the extra layer planes
++ * are derived from plane 0's bytesperline and the format height. An
++ * unsupported pixel format leaves nb_layers at 0. */
++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
+ {
+- V4L2m2mContext *s = buf_to_m2mctx(in);
++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
++ AVDRMLayerDescriptor *layer;
+
+- if (in->context_ref)
+- atomic_fetch_add(&in->context_refcount, 1);
+- else {
+- in->context_ref = av_buffer_ref(s->self_ref);
+- if (!in->context_ref)
+- return AVERROR(ENOMEM);
++ /* fill the DRM frame descriptor */
++ drm_desc->nb_objects = avbuf->num_planes;
++ drm_desc->nb_layers = 1;
+
+- in->context_refcount = 1;
++ layer = &drm_desc->layers[0];
++ layer->nb_planes = avbuf->num_planes;
++
++ /* default: one layer plane per V4L2 plane, each in its own object */
++ for (int i = 0; i < avbuf->num_planes; i++) {
++ layer->planes[i].object_index = i;
++ layer->planes[i].offset = 0;
++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
+ }
+
+- in->status = V4L2BUF_RET_USER;
+- atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
++ switch (avbuf->context->av_pix_fmt) {
++ case AV_PIX_FMT_YUYV422:
++
++ layer->format = DRM_FORMAT_YUYV;
++ layer->nb_planes = 1;
+
+- return 0;
++ break;
++
++ case AV_PIX_FMT_NV12:
++ case AV_PIX_FMT_NV21:
++
++ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ?
++ DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
++
++ if (avbuf->num_planes > 1)
++ break;
++
++ /* single V4L2 plane: second layer plane follows the first in object 0 */
++ layer->nb_planes = 2;
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ avbuf->context->format.fmt.pix.height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
++ break;
++
++ case AV_PIX_FMT_YUV420P:
++
++ layer->format = DRM_FORMAT_YUV420;
++
++ if (avbuf->num_planes > 1)
++ break;
++
++ /* single V4L2 plane: both half-pitch planes follow the first in object 0 */
++ layer->nb_planes = 3;
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ avbuf->context->format.fmt.pix.height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
++
++ layer->planes[2].object_index = 0;
++ layer->planes[2].offset = layer->planes[1].offset +
++ ((avbuf->plane_info[0].bytesperline *
++ avbuf->context->format.fmt.pix.height) >> 2);
++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
++ break;
++
++ default:
++ drm_desc->nb_layers = 0;
++ break;
++ }
++
++ return (uint8_t *) drm_desc;
+ }
+
+-static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
++/* AVBuffer free callback for the wrappers made by wrap_avbuf(). data is the
++ * AVBufferRef that wrap_avbuf() passed to av_buffer_create(), and that ref's
++ * data is the V4L2Buffer.
++ *
++ * If the buffer's context can still be locked through its weak link, the
++ * buffer is marked available and, unless the output side is draining or the
++ * context is not streaming, re-queued to the driver with a zeroed
++ * timestamp. The wrapped ref is dropped in every case. opaque is unused. */
++static void v4l2_free_bufref(void *opaque, uint8_t *data)
+ {
+- int ret;
++ AVBufferRef * bufref = (AVBufferRef *)data;
++ V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data;
++ struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl);
+
+- if (plane >= in->num_planes)
+- return AVERROR(EINVAL);
++ if (ctx != NULL) {
++ // Buffer still attached to context
++ V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+
+- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
+- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
+- in->plane_info[plane].length, v4l2_free_buffer, in, 0);
+- if (!*buf)
+- return AVERROR(ENOMEM);
++ ff_mutex_lock(&ctx->lock);
+
+- ret = v4l2_buf_increase_ref(in);
+- if (ret)
+- av_buffer_unref(buf);
++ ff_v4l2_buffer_set_avail(avbuf);
+
+- return ret;
++ if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
++ /* no need to queue more buffers to the driver */
++ }
++ else if (ctx->streamon) {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name);
++ avbuf->buf.timestamp.tv_sec = 0;
++ avbuf->buf.timestamp.tv_usec = 0;
++ ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER
++ }
++ else {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name);
++ }
++
++ ff_mutex_unlock(&ctx->lock);
++ }
++
++ // Weak link is unlocked whether or not a context was obtained
++ ff_weak_link_unlock(avbuf->context_wl);
++ av_buffer_unref(&bufref);
+ }
+
+-static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref)
++/* Export every plane of avbuf with VIDIOC_EXPBUF and record the returned fd,
++ * the size and a linear format modifier in avbuf->drm_frame.objects[].
++ * Multiplanar buffer types fill objects[plane] with that plane's length;
++ * other types fill objects[0] with the buffer length.
++ * Returns 0, or AVERROR(errno) as soon as an export fails. */
++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
++{
++    int plane;
++
++    for (plane = 0; plane < avbuf->num_planes; plane++) {
++        const int multiplanar = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type);
++        const int obj = multiplanar ? plane : 0;
++        struct v4l2_exportbuffer expbuf;
++
++        memset(&expbuf, 0, sizeof(expbuf));
++        expbuf.index = avbuf->buf.index;
++        expbuf.type = avbuf->buf.type;
++        expbuf.plane = plane;
++
++        if (ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf) < 0)
++            return AVERROR(errno);
++
++        /* drm frame */
++        avbuf->drm_frame.objects[obj].size = multiplanar ?
++            avbuf->buf.m.planes[plane].length : avbuf->buf.length;
++        avbuf->drm_frame.objects[obj].fd = expbuf.fd;
++        avbuf->drm_frame.objects[obj].format_modifier = DRM_FORMAT_MOD_LINEAR;
++    }
++
++    return 0;
++}
++
++/* Copy size bytes from data into plane `plane` of out at byte offset
++ * `offset`, then record bytesused/length for the plane. If the data does
++ * not fit, the copy is truncated to the space left and AVERROR(ENOMEM) is
++ * returned; AVERROR(EINVAL) is returned for a plane index out of range. */
++static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
+ {
+ unsigned int bytesused, length;
++ int rv = 0;
+
+ if (plane >= out->num_planes)
+ return AVERROR(EINVAL);
+@@ -284,32 +521,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer
+ length = out->plane_info[plane].length;
+ bytesused = FFMIN(size+offset, length);
+
+- memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
+-
+- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
+- out->planes[plane].bytesused = bytesused;
+- out->planes[plane].length = length;
+- } else {
+- out->buf.bytesused = bytesused;
+- out->buf.length = length;
++ // NOTE(review): length - offset is unsigned, so this assumes offset <= length - confirm callers
++ if (size > length - offset) {
++ size = length - offset;
++ rv = AVERROR(ENOMEM);
+ }
+
+- return 0;
++ memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size);
++
++ set_buf_length(out, plane, bytesused, length);
++
++ return rv;
++}
++
++/* Create the AVBufferRef that is handed to the user for avbuf.
++ *
++ * A new reference to the context's bufrefs[] entry for this buffer index is
++ * taken and wrapped in a second AVBuffer whose free callback is
++ * v4l2_free_bufref(). The buffer is marked V4L2BUF_RET_USER only once the
++ * wrapper exists, so a failed wrap does not leave it flagged as being with
++ * the user.
++ *
++ * Returns the wrapper, or NULL if either reference could not be created. */
++static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
++{
++    AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]);
++    AVBufferRef * newbuf;
++
++    if (!bufref)
++        return NULL;
++
++    newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0);
++    if (newbuf == NULL) {
++        // Nothing was handed out: drop our ref and leave the status untouched
++        av_buffer_unref(&bufref);
++        return NULL;
++    }
++
++    avbuf->status = V4L2BUF_RET_USER;
++    return newbuf;
+ }
+
++/* Attach avbuf to frame. frame->buf[0] always gets the wrapper from
++ * wrap_avbuf(). With output_drm set, the frame becomes an
++ * AV_PIX_FMT_DRM_PRIME frame whose data[0] is the DRM descriptor;
++ * otherwise data[]/linesize[] point into the mapped planes, with extra
++ * planes derived for single-plane NV12/NV21 and YUV420P.
++ * Returns 0, or AVERROR(ENOMEM) if the buffer could not be wrapped. */
+ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
+ {
+- int i, ret;
++ int i;
+
+ frame->format = avbuf->context->av_pix_fmt;
+
+- for (i = 0; i < avbuf->num_planes; i++) {
+- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
+- if (ret)
+- return ret;
++ frame->buf[0] = wrap_avbuf(avbuf);
++ if (frame->buf[0] == NULL)
++ return AVERROR(ENOMEM);
++
++ if (buf_to_m2mctx(avbuf)->output_drm) {
++ /* 1. get references to the actual data */
++ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
++ frame->format = AV_PIX_FMT_DRM_PRIME;
++ // NOTE(review): the av_buffer_ref() result is not checked here - confirm callers cope with a NULL hw_frames_ctx
++ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
++ return 0;
++ }
+
++
++ /* 1. get references to the actual data */
++ for (i = 0; i < avbuf->num_planes; i++) {
++ frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset;
+ frame->linesize[i] = avbuf->plane_info[i].bytesperline;
+- frame->data[i] = frame->buf[i]->data;
+ }
+
+ /* fixup special cases */
+@@ -318,17 +580,17 @@ static int v4l2_buffer_buf_to_swframe(AV
+ case AV_PIX_FMT_NV21:
+ if (avbuf->num_planes > 1)
+ break;
+- frame->linesize[1] = avbuf->plane_info[0].bytesperline;
+- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
++ frame->linesize[1] = frame->linesize[0];
++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
+ break;
+
+ case AV_PIX_FMT_YUV420P:
+ if (avbuf->num_planes > 1)
+ break;
+- frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1;
+- frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1;
+- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
+- frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2);
++ frame->linesize[1] = frame->linesize[0] / 2;
++ frame->linesize[2] = frame->linesize[1];
++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
++ frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2;
+ break;
+
+ default:
+@@ -338,68 +600,127 @@ static int v4l2_buffer_buf_to_swframe(AV
+ return 0;
+ }
+ }
+
++// Copy h rows of w bytes; a single memcpy is used when the strides match and rows have at most 32 bytes of padding
++static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h)
++{
++    if (dst_stride == src_stride && w + 32 >= dst_stride) {
++        memcpy(dst, src, dst_stride * h);
++        return;
++    }
++    for (; h > 0; --h) {
++        memcpy(dst, src, w);
++        dst += dst_stride;
++        src += src_stride;
++    }
++}
++
++static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
++{
++    return i != 0 && (i != num_planes - 1 || (desc->flags & AV_PIX_FMT_FLAG_ALPHA) == 0); // not plane 0, and not the last plane of an alpha format
++}
++
++// Point a DMABUF V4L2 buffer at the single dmabuf fd of a DRM_PRIME frame. Returns 0 or a negative AVERROR.
++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
++{
++    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++    // buf[0] is referenced below: av_buffer_ref() must not be handed NULL
++    if (frame->format != AV_PIX_FMT_DRM_PRIME || !src || !frame->buf[0])
++        return AVERROR(EINVAL);
++
++    av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
++        // Only currently cope with single buffer types
++        if (out->buf.length != 1)
++            return AVERROR_PATCHWELCOME;
++        if (src->nb_objects != 1)
++            return AVERROR(EINVAL);
++        out->planes[0].m.fd = src->objects[0].fd;
++    }
++    else {
++        if (src->nb_objects != 1)
++            return AVERROR(EINVAL);
++        out->buf.m.fd = src->objects[0].fd;
++    }
++
++    // No need to copy src AVDescriptor (doing so may confuse fd close on free); hold a ref on the source instead
++    out->ref_buf = av_buffer_ref(frame->buf[0]);
++    if (out->ref_buf == NULL)   // without the ref the source could be freed while still in use
++        return AVERROR(ENOMEM);
++    return 0;
++}
++
+ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+ {
+- int i, ret;
+- struct v4l2_format fmt = out->context->format;
+- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
+- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat;
+- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
+- fmt.fmt.pix_mp.height : fmt.fmt.pix.height;
+- int is_planar_format = 0;
+-
+- switch (pixel_format) {
+- case V4L2_PIX_FMT_YUV420M:
+- case V4L2_PIX_FMT_YVU420M:
+-#ifdef V4L2_PIX_FMT_YUV422M
+- case V4L2_PIX_FMT_YUV422M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YVU422M
+- case V4L2_PIX_FMT_YVU422M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YUV444M
+- case V4L2_PIX_FMT_YUV444M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YVU444M
+- case V4L2_PIX_FMT_YVU444M:
+-#endif
+- case V4L2_PIX_FMT_NV12M:
+- case V4L2_PIX_FMT_NV21M:
+- case V4L2_PIX_FMT_NV12MT_16X16:
+- case V4L2_PIX_FMT_NV12MT:
+- case V4L2_PIX_FMT_NV16M:
+- case V4L2_PIX_FMT_NV61M:
+- is_planar_format = 1;
+- }
+-
+- if (!is_planar_format) {
+- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+- int planes_nb = 0;
+- int offset = 0;
+-
+- for (i = 0; i < desc->nb_components; i++)
+- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
+-
+- for (i = 0; i < planes_nb; i++) {
+- int size, h = height;
+- if (i == 1 || i == 2) {
++ int i;
++ int num_planes = 0;
++ int pel_strides[4] = {0};
++
++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
++
++ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) {
++ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__);
++ return -1;
++ }
++
++ for (i = 0; i != desc->nb_components; ++i) {
++ if (desc->comp[i].plane >= num_planes)
++ num_planes = desc->comp[i].plane + 1;
++ pel_strides[desc->comp[i].plane] = desc->comp[i].step;
++ }
++
++ if (out->num_planes > 1) {
++ if (num_planes != out->num_planes) {
++ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes);
++ return -1;
++ }
++ for (i = 0; i != num_planes; ++i) {
++ int w = frame->width;
++ int h = frame->height;
++ if (is_chroma(desc, i, num_planes)) {
++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
+ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
+ }
+- size = frame->linesize[i] * h;
+- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset, frame->buf[i]);
+- if (ret)
+- return ret;
+- offset += size;
++
++ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline,
++ frame->data[i], frame->linesize[i],
++ w * pel_strides[i], h);
++ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length);
+ }
+- return 0;
+ }
++ else
++ {
++ unsigned int offset = 0;
++
++ for (i = 0; i != num_planes; ++i) {
++ int w = frame->width;
++ int h = frame->height;
++ int dst_stride = out->plane_info[0].bytesperline;
++ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset;
++
++ if (is_chroma(desc, i, num_planes)) {
++ // Is chroma
++ dst_stride >>= desc->log2_chroma_w;
++ offset += dst_stride * (out->context->height >> desc->log2_chroma_h);
++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
++ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
++ }
++ else {
++ // Is luma or alpha
++ offset += dst_stride * out->context->height;
++ }
++ if (offset > out->plane_info[0].length) {
++ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length);
++ return -1;
++ }
+
+- for (i = 0; i < out->num_planes; i++) {
+- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0, frame->buf[i]);
+- if (ret)
+- return ret;
++ cpy_2d(dst, dst_stride,
++ frame->data[i], frame->linesize[i],
++ w * pel_strides[i], h);
++ }
++ set_buf_length(out, 0, offset, out->plane_info[0].length);
+ }
+-
+ return 0;
+ }
+
+@@ -409,16 +730,31 @@ static int v4l2_buffer_swframe_to_buf(co
+ *
+ ******************************************************************************/
+
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
+ {
+- v4l2_set_pts(out, frame->pts);
+-
+- return v4l2_buffer_swframe_to_buf(frame, out);
++ out->buf.flags = frame->key_frame ?
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
++ // Beware that colour info is held in format rather than the actual
++ // v4l2 buffer struct so this may not be as useful as you might hope
++ v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
++ v4l2_set_color_range(out, frame->color_range);
++ // PTS & interlace are buffer vars
++ if (track_ts)
++ out->buf.timestamp = tv_from_int(track_ts);
++ else
++ v4l2_set_pts(out, frame->pts);
++ v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
++
++ return frame->format == AV_PIX_FMT_DRM_PRIME ?
++ v4l2_buffer_primeframe_to_buf(frame, out) :
++ v4l2_buffer_swframe_to_buf(frame, out);
+ }
+
+ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+ {
+ int ret;
++ V4L2Context * const ctx = avbuf->context;
+
+ av_frame_unref(frame);
+
+@@ -429,17 +765,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
+
+ /* 2. get frame information */
+ frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
++ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
++ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
++ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
++ AV_PICTURE_TYPE_NONE;
+ frame->color_primaries = v4l2_get_color_primaries(avbuf);
+ frame->colorspace = v4l2_get_color_space(avbuf);
+ frame->color_range = v4l2_get_color_range(avbuf);
+ frame->color_trc = v4l2_get_color_trc(avbuf);
+ frame->pts = v4l2_get_pts(avbuf);
+ frame->pkt_dts = AV_NOPTS_VALUE;
++ frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
++ frame->top_field_first = v4l2_buf_is_top_first(avbuf);
+
+ /* these values are updated also during re-init in v4l2_process_driver_event */
+- frame->height = avbuf->context->height;
+- frame->width = avbuf->context->width;
+- frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio;
++ frame->height = ctx->height;
++ frame->width = ctx->width;
++ frame->sample_aspect_ratio = ctx->sample_aspect_ratio;
++
++ if (ctx->selection.height && ctx->selection.width) {
++ frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0;
++ frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0;
++ frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ?
++ frame->width - (ctx->selection.left + ctx->selection.width) : 0;
++ frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ?
++ frame->height - (ctx->selection.top + ctx->selection.height) : 0;
++ }
+
+ /* 3. report errors upstream */
+ if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
+@@ -452,15 +803,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
+
+ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
+ {
+- int ret;
+-
+ av_packet_unref(pkt);
+- ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
+- if (ret)
+- return ret;
++
++ pkt->buf = wrap_avbuf(avbuf);
++ if (pkt->buf == NULL)
++ return AVERROR(ENOMEM);
+
+ pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
+- pkt->data = pkt->buf->data;
++ pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
++ pkt->flags = 0;
+
+ if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
+ pkt->flags |= AV_PKT_FLAG_KEY;
+@@ -475,31 +826,91 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
+ return 0;
+ }
+
+-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++ const void *extdata, size_t extlen,
++ const int64_t timestamp)
+ {
+ int ret;
+
+- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0, pkt->buf);
+- if (ret)
++ if (extlen) {
++ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0);
++ if (ret)
++ return ret;
++ }
++
++ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
++ if (ret && ret != AVERROR(ENOMEM))
+ return ret;
+
+- v4l2_set_pts(out, pkt->pts);
++ if (timestamp)
++ out->buf.timestamp = tv_from_int(timestamp);
++ else
++ v4l2_set_pts(out, pkt->pts);
++
++ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
+
+- if (pkt->flags & AV_PKT_FLAG_KEY)
+- out->flags = V4L2_BUF_FLAG_KEYFRAME;
++ return ret;
++}
+
+- return 0;
++int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
++{
++ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); // no extra data; timestamp 0 makes _ext take the buffer timestamp from pkt->pts
++}
++
++
++static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) // AVBuffer free callback: data is the V4L2Buffer to tear down, opaque is not used
++{
++ V4L2Buffer * const avbuf = (V4L2Buffer *)data;
++ int i;
++
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) { // unmap every plane that has a mapping
++ struct V4L2Plane_info *p = avbuf->plane_info + i;
++ if (p->mm_addr != NULL) // NULL means this plane was not mapped
++ munmap(p->mm_addr, p->length);
++ }
++
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { // close the fds held in the DRM descriptor
++ if (avbuf->drm_frame.objects[i].fd != -1) // -1 marks an unused slot (set in ff_v4l2_buffer_initialize)
++ close(avbuf->drm_frame.objects[i].fd);
++ }
++
++ av_buffer_unref(&avbuf->ref_buf); // release any source-frame ref still held
++
++ ff_weak_link_unref(&avbuf->context_wl); // drop this buffer's weak link to its context
++
++ av_free(avbuf);
+ }
+
+-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
++
++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
+ {
+- V4L2Context *ctx = avbuf->context;
+ int ret, i;
++ V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
++ AVBufferRef * bufref;
+
+- avbuf->buf.memory = V4L2_MEMORY_MMAP;
++ *pbufref = NULL;
++ if (avbuf == NULL)
++ return AVERROR(ENOMEM);
++
++ bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
++ if (bufref == NULL) {
++ av_free(avbuf);
++ return AVERROR(ENOMEM);
++ }
++
++ avbuf->context = ctx;
++ avbuf->buf.memory = mem;
+ avbuf->buf.type = ctx->type;
+ avbuf->buf.index = index;
+
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
++ avbuf->drm_frame.objects[i].fd = -1;
++ }
++
++ avbuf->context_wl = ff_weak_link_ref(ctx->wl_master);
++
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->buf.length = VIDEO_MAX_PLANES;
+ avbuf->buf.m.planes = avbuf->planes;
+@@ -507,7 +918,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+
+ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
+ if (ret < 0)
+- return AVERROR(errno);
++ goto fail;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->num_planes = 0;
+@@ -520,6 +931,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+ avbuf->num_planes = 1;
+
+ for (i = 0; i < avbuf->num_planes; i++) {
++ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
++ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
+
+ avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+ ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
+@@ -527,25 +940,29 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
+- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
+- PROT_READ | PROT_WRITE, MAP_SHARED,
+- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
++
++ if (want_mmap)
++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
++ PROT_READ | PROT_WRITE, MAP_SHARED,
++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
+ } else {
+ avbuf->plane_info[i].length = avbuf->buf.length;
+- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
+- PROT_READ | PROT_WRITE, MAP_SHARED,
+- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
++
++ if (want_mmap)
++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
++ PROT_READ | PROT_WRITE, MAP_SHARED,
++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
+ }
+
+- if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
+- return AVERROR(ENOMEM);
++ if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
++ avbuf->plane_info[i].mm_addr = NULL;
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
+ }
+
+ avbuf->status = V4L2BUF_AVAILABLE;
+
+- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+- return 0;
+-
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->buf.m.planes = avbuf->planes;
+ avbuf->buf.length = avbuf->num_planes;
+@@ -555,20 +972,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+ avbuf->buf.length = avbuf->planes[0].length;
+ }
+
+- return ff_v4l2_buffer_enqueue(avbuf);
++ if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
++ if (buf_to_m2mctx(avbuf)->output_drm) {
++ ret = v4l2_buffer_export_drm(avbuf);
++ if (ret)
++ goto fail;
++ }
++ }
++
++ *pbufref = bufref;
++ return 0;
++
++fail:
++ av_buffer_unref(&bufref);
++ return ret;
+ }
+
+ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
+ {
+ int ret;
++ int qc; // q_count value after this buffer has been queued, for the log line below
+
+- avbuf->buf.flags = avbuf->flags;
++ if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { // only log buffers that carry a timestamp
++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%lld.%06ld count=%d\n",
++ avbuf->context->name, avbuf->buf.index,
++ (long long)avbuf->buf.timestamp.tv_sec, (long)avbuf->buf.timestamp.tv_usec, // casts: timeval field widths vary by ABI (64-bit time_t on 32-bit targets)
++ avbuf->context->q_count);
++ }
+
+ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
+- if (ret < 0)
+- return AVERROR(errno);
++ if (ret < 0) {
++ int err = errno; // capture errno before the logging calls can change it
++ av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n",
++ avbuf->context->name, avbuf->buf.index,
++ err, strerror(err));
++ return AVERROR(err);
++ }
+
++ // Lock not wanted - if called from buffer free then lock already obtained
++ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
+ avbuf->status = V4L2BUF_IN_DRIVER;
++ pthread_cond_broadcast(&avbuf->context->cond); // wake anything waiting on the context condition variable
++
++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%lld.%06ld count=%d\n",
++ avbuf->context->name, avbuf->buf.index,
++ (long long)avbuf->buf.timestamp.tv_sec, (long)avbuf->buf.timestamp.tv_usec, qc); // same casts as the pre-QBUF log
+
+ return 0;
+ }
+--- a/libavcodec/v4l2_buffers.h
++++ b/libavcodec/v4l2_buffers.h
+@@ -27,25 +27,38 @@
+ #include <stdatomic.h>
+ #include <linux/videodev2.h>
+
++#include "libavutil/hwcontext_drm.h"
+ #include "avcodec.h"
+
+ enum V4L2Buffer_status {
+ V4L2BUF_AVAILABLE,
+ V4L2BUF_IN_DRIVER,
++ V4L2BUF_IN_USE,
+ V4L2BUF_RET_USER,
+ };
+
+ /**
+ * V4L2Buffer (wrapper for v4l2_buffer management)
+ */
++struct V4L2Context;
++struct ff_weak_link_client;
++
+ typedef struct V4L2Buffer {
+- /* each buffer needs to have a reference to its context */
++ /* each buffer needs to have a reference to its context
++ * The pointer is good enough for most operations but once the buffer has
++ * been passed to the user the buffer may become orphaned so for free ops
++ * the weak link must be used to ensure that the context is actually
++ * there
++ */
+ struct V4L2Context *context;
++ struct ff_weak_link_client *context_wl;
+
+- /* This object is refcounted per-plane, so we need to keep track
+- * of how many context-refs we are holding. */
+- AVBufferRef *context_ref;
+- atomic_uint context_refcount;
++ /* DRM descriptor */
++ AVDRMFrameDescriptor drm_frame;
++ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
++ * are done
++ */
++ AVBufferRef * ref_buf;
+
+ /* keep track of the mmap address and mmap length */
+ struct V4L2Plane_info {
+@@ -60,7 +73,6 @@ typedef struct V4L2Buffer {
+ struct v4l2_buffer buf;
+ struct v4l2_plane planes[VIDEO_MAX_PLANES];
+
+- int flags;
+ enum V4L2Buffer_status status;
+
+ } V4L2Buffer;
+@@ -98,6 +110,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
+ */
+ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
+
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++ const void *extdata, size_t extlen,
++ const int64_t timestamp);
++
+ /**
+ * Extracts the data from an AVFrame to a V4L2Buffer
+ *
+@@ -106,7 +122,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AV
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
+
+ /**
+ * Initializes a V4L2Buffer
+@@ -116,7 +132,7 @@ int ff_v4l2_buffer_avframe_to_buf(const
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
+
+ /**
+ * Enqueues a V4L2Buffer
+@@ -127,5 +143,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+ */
+ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
+
++static inline void
++ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) // mark avbuf as available and release the buffer ref it holds
++{
++ avbuf->status = V4L2BUF_AVAILABLE;
++ av_buffer_unref(&avbuf->ref_buf); // ref_buf is the source-buffer ref kept for DRM_PRIME encode
++}
++
+
+ #endif // AVCODEC_V4L2_BUFFERS_H
+--- a/libavcodec/v4l2_context.c
++++ b/libavcodec/v4l2_context.c
+@@ -27,11 +27,13 @@
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <poll.h>
++#include "libavutil/avassert.h"
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
+ #include "v4l2_buffers.h"
+ #include "v4l2_fmt.h"
+ #include "v4l2_m2m.h"
++#include "weak_link.h"
+
+ struct v4l2_format_update {
+ uint32_t v4l2_fmt;
+@@ -41,26 +43,168 @@ struct v4l2_format_update {
+ int update_avfmt;
+ };
+
+-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
++
++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
+ {
+- return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
+- container_of(ctx, V4L2m2mContext, output) :
+- container_of(ctx, V4L2m2mContext, capture);
++ return (int64_t)n;
+ }
+
+-static inline AVCodecContext *logger(V4L2Context *ctx)
++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
+ {
+- return ctx_to_m2mctx(ctx)->avctx;
++ return (unsigned int)pts;
+ }
+
+-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
++// FFmpeg requires us to propagate a number of vars from the coded pkt into
++// the decoded frame. The only thing that tracks like that in V4L2 stateful
++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
++// guarantees about PTS being unique or specified for every frame so replace
++// the supplied PTS with a simple incrementing number and keep a circular
++// buffer of all the things we want preserved (including the original PTS)
++// indexed by the tracking no.
++static int64_t
++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
+ {
+- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++ int64_t track_pts;
++
++ // Avoid 0
++ if (++x->track_no == 0)
++ x->track_no = 1;
++
++ track_pts = track_to_pts(avctx, x->track_no);
++
++ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++ .discard = 0,
++ .pending = 1,
++ .pkt_size = avpkt->size,
++ .pts = avpkt->pts,
++ .dts = avpkt->dts,
++ .reordered_opaque = avctx->reordered_opaque,
++ .pkt_pos = avpkt->pos,
++ .pkt_duration = avpkt->duration,
++ .track_pts = track_pts
++ };
++ return track_pts;
+ }
+
+-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
++static int64_t
++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
+ {
+- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++ int64_t track_pts;
++
++ // Avoid 0
++ if (++x->track_no == 0)
++ x->track_no = 1;
++
++ track_pts = track_to_pts(avctx, x->track_no);
++
++ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++ .discard = 0,
++ .pending = 1,
++ .pkt_size = 0,
++ .pts = frame->pts,
++ .dts = AV_NOPTS_VALUE,
++ .reordered_opaque = frame->reordered_opaque,
++ .pkt_pos = frame->pkt_pos,
++ .pkt_duration = frame->pkt_duration,
++ .track_pts = track_pts
++ };
++ return track_pts;
++}
++
++
++// Restores the properties saved on input for this frame's tracking pts; returns -1 if we should discard the frame, else 0
++static int
++xlat_pts_frame_out(AVCodecContext *const avctx,
++ xlat_track_t * const x,
++ AVFrame *const frame)
++{
++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; // slot in the circular tracking buffer
++ V4L2m2mTrackEl *const t = x->track_els + n;
++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) // no pts, or the slot holds a different tracking value
++ {
++ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
++ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++ frame->pts = AV_NOPTS_VALUE;
++ frame->pkt_dts = AV_NOPTS_VALUE;
++ frame->reordered_opaque = x->last_opaque; // fall back to the opaque of the last successfully tracked output
++ frame->pkt_pos = -1;
++ frame->pkt_duration = 0;
++ frame->pkt_size = -1;
++ }
++ else if (!t->discard)
++ {
++ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; // the saved pts is handed out once only: pending is cleared below
++ frame->pkt_dts = t->dts;
++ frame->reordered_opaque = t->reordered_opaque;
++ frame->pkt_pos = t->pkt_pos;
++ frame->pkt_duration = t->pkt_duration;
++ frame->pkt_size = t->pkt_size;
++
++ x->last_opaque = x->track_els[n].reordered_opaque; // remembered for the tracking-failure path above
++ if (frame->pts != AV_NOPTS_VALUE)
++ x->last_pts = frame->pts;
++ t->pending = 0;
++ }
++ else
++ {
++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++ return -1;
++ }
++
++ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
++ return 0;
++}
++
++// Restores the pts saved on input for this packet's tracking pts; returns -1 if we should discard the packet, else 0
++static int
++xlat_pts_pkt_out(AVCodecContext *const avctx,
++ xlat_track_t * const x,
++ AVPacket *const pkt)
++{
++ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; // slot in the circular tracking buffer
++ V4L2m2mTrackEl *const t = x->track_els + n;
++ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) // no pts, or the slot holds a different tracking value
++ {
++ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
++ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
++ pkt->pts = AV_NOPTS_VALUE;
++ }
++ else if (!t->discard)
++ {
++ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; // the saved pts is handed out once only: pending is cleared below
++
++ x->last_opaque = x->track_els[n].reordered_opaque;
++ if (pkt->pts != AV_NOPTS_VALUE)
++ x->last_pts = pkt->pts;
++ t->pending = 0;
++ }
++ else
++ {
++ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
++ return -1;
++ }
++
++ // * Would like something much better than this...xlat(offset + out_count)?
++ pkt->dts = pkt->pts; // dts simply mirrors the (possibly unset) pts
++ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++ pkt->pts, t->track_pts, n);
++ return 0;
++}
++
++
++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
++{
++    if (V4L2_TYPE_IS_OUTPUT(ctx->type))   // ctx is the `output` member of its V4L2m2mContext...
++        return container_of(ctx, V4L2m2mContext, output);
++    return container_of(ctx, V4L2m2mContext, capture);   // ...otherwise it is the `capture` member
++}
++
++static inline AVCodecContext *logger(const V4L2Context *ctx)
++{
++ return ctx_to_m2mctx(ctx)->avctx; // codec context of the owning m2m context, passed to av_log() by callers
+ }
+
+ static AVRational v4l2_get_sar(V4L2Context *ctx)
+@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Conte
+ return sar;
+ }
+
+-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
++static inline int ctx_buffers_alloced(const V4L2Context * const ctx)
++{
++ return ctx->bufrefs != NULL; // 1 if the context has a bufrefs array, 0 otherwise
++}
++
++// Width/Height changed or we don't have an alloc in the first place?
++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
+ {
+- struct v4l2_format *fmt1 = &ctx->format;
+- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
+- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
+- :
+- fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
+- fmt1->fmt.pix.height != fmt2->fmt.pix.height;
++ const struct v4l2_format *fmt1 = &ctx->format;
++ int ret = !ctx_buffers_alloced(ctx) ||
++ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
++ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
++ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
++ :
++ fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
++ fmt1->fmt.pix.height != fmt2->fmt.pix.height);
+
+ if (ret)
+- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
++ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
+ ctx->name,
+- v4l2_get_width(fmt1), v4l2_get_height(fmt1),
+- v4l2_get_width(fmt2), v4l2_get_height(fmt2));
++ ctx_buffers_alloced(ctx),
++ ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
++ ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
+
+ return ret;
+ }
+@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(
+ }
+ }
+
+-/**
+- * handle resolution change event and end of stream event
+- * returns 1 if reinit was successful, negative if it failed
+- * returns 0 if reinit was not executed
+- */
+-static int v4l2_handle_event(V4L2Context *ctx)
++static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r)
+ {
+- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+- struct v4l2_format cap_fmt = s->capture.format;
+- struct v4l2_format out_fmt = s->output.format;
+- struct v4l2_event evt = { 0 };
+- int full_reinit, reinit, ret;
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
++ struct v4l2_selection selection = {
++ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
++ .target = V4L2_SEL_TGT_COMPOSE
++ };
+
+- ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
+- if (ret < 0) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
+- return 0;
+- }
++ memset(r, 0, sizeof(*r));
++ if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection))
++ return AVERROR(errno);
+
+- if (evt.type == V4L2_EVENT_EOS) {
+- ctx->done = 1;
+- return 0;
+- }
++ *r = selection.r;
++ return 0;
++}
+
+- if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
+- return 0;
++static int do_source_change(V4L2m2mContext * const s)
++{
++ AVCodecContext *const avctx = s->avctx;
+
+- ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt);
+- if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name);
+- return 0;
+- }
++ int ret;
++ int reinit;
++ struct v4l2_format cap_fmt = s->capture.format;
++
++ s->capture.done = 0;
+
+ ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
+ if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
++ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name);
+ return 0;
+ }
+
+- full_reinit = v4l2_resolution_changed(&s->output, &out_fmt);
+- if (full_reinit) {
+- s->output.height = v4l2_get_height(&out_fmt);
+- s->output.width = v4l2_get_width(&out_fmt);
+- s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
+- }
++ get_default_selection(&s->capture, &s->capture.selection);
++
++ reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
++ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
++ reinit = 1;
+
+- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
++ s->capture.format = cap_fmt;
+ if (reinit) {
+- s->capture.height = v4l2_get_height(&cap_fmt);
+- s->capture.width = v4l2_get_width(&cap_fmt);
+- s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
++ s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
++ s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
+ }
+
+- if (full_reinit || reinit)
+- s->reinit = 1;
+-
+- if (full_reinit) {
+- ret = ff_v4l2_m2m_codec_full_reinit(s);
+- if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n");
+- return AVERROR(EINVAL);
+- }
+- goto reinit_run;
++ // If we don't support selection (or it is bust) and we obviously have HD then kludge
++ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
++ (s->capture.height == 1088 && s->capture.width == 1920)) {
++ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
+ }
+
++ s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
++
++ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
++ s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
++ s->capture.width, s->capture.height,
++ s->capture.selection.width, s->capture.selection.height,
++ s->capture.selection.left, s->capture.selection.top, reinit);
++
+ if (reinit) {
+- if (s->avctx)
+- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
++ if (avctx)
++ ret = ff_set_dimensions(s->avctx,
++ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
++ s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
+ if (ret < 0)
+- av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
++ av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
+
+ ret = ff_v4l2_m2m_codec_reinit(s);
+ if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
++ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
+ return AVERROR(EINVAL);
+ }
++
++ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
++ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
++ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
++ s->capture.width, s->capture.height,
++ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
++ return AVERROR(EINVAL);
++ }
++
++ // Update pixel format - should only actually do something on initial change
++ s->capture.av_pix_fmt =
++ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
++ if (s->output_drm) {
++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
++ avctx->sw_pix_fmt = s->capture.av_pix_fmt;
++ }
++ else
++ avctx->pix_fmt = s->capture.av_pix_fmt;
++
+ goto reinit_run;
+ }
+
+- /* dummy event received */
+- return 0;
++ /* Buffers are OK so just stream off to ack */
++ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
++
++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++ if (ret)
++ av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n");
++ s->draining = 0;
+
+ /* reinit executed */
+ reinit_run:
++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON);
+ return 1;
+ }
+
+@@ -280,171 +452,277 @@ static int v4l2_stop_encode(V4L2Context
+ return 0;
+ }
+
+-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
+-{
+- struct v4l2_plane planes[VIDEO_MAX_PLANES];
+- struct v4l2_buffer buf = { 0 };
+- V4L2Buffer *avbuf;
+- struct pollfd pfd = {
+- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
+- .fd = ctx_to_m2mctx(ctx)->fd,
++// DQ a buffer
++// Amalgamates all the various ways there are of signalling EOS/Event to
++// generate a consistent EPIPE.
++//
++// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
++//
++// Returns:
++// 0 Success
++// AVERROR(EPIPE) Nothing more to read
++// AVERROR(ENOSPC) No buffers in Q to put result in
++// * AVERROR(..)
++
++ static int
++dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
++{
++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++ AVCodecContext * const avctx = m->avctx;
++ V4L2Buffer * avbuf;
++ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
++
++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++
++ struct v4l2_buffer buf = {
++ .type = ctx->type,
++ .memory = V4L2_MEMORY_MMAP,
+ };
+- int i, ret;
+
+- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
+- for (i = 0; i < ctx->num_buffers; i++) {
+- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+- break;
+- }
+- if (i == ctx->num_buffers)
+- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
+- "userspace. Increase num_capture_buffers "
+- "to prevent device deadlock or dropped "
+- "packets/frames.\n");
+- }
+-
+- /* if we are draining and there are no more capture buffers queued in the driver we are done */
+- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
+- for (i = 0; i < ctx->num_buffers; i++) {
+- /* capture buffer initialization happens during decode hence
+- * detection happens at runtime
+- */
+- if (!ctx->buffers)
+- break;
+-
+- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+- goto start;
+- }
+- ctx->done = 1;
+- return NULL;
+- }
+-
+-start:
+- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+- pfd.events = POLLOUT | POLLWRNORM;
+- else {
+- /* no need to listen to requests for more input while draining */
+- if (ctx_to_m2mctx(ctx)->draining)
+- pfd.events = POLLIN | POLLRDNORM | POLLPRI;
++ *ppavbuf = NULL;
++
++ if (ctx->flag_last)
++ return AVERROR(EPIPE);
++
++ if (is_mp) {
++ buf.length = VIDEO_MAX_PLANES;
++ buf.m.planes = planes;
+ }
+
+- for (;;) {
+- ret = poll(&pfd, 1, timeout);
+- if (ret > 0)
+- break;
+- if (errno == EINTR)
+- continue;
+- return NULL;
++ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
++ const int err = errno;
++ av_assert0(AVERROR(err) < 0);
++ if (err != EINTR) {
++ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
++ ctx->name, av_err2str(AVERROR(err)));
++
++ if (err == EPIPE)
++ ctx->flag_last = 1;
++
++ return AVERROR(err);
++ }
+ }
++ atomic_fetch_sub(&ctx->q_count, 1);
+
+- /* 0. handle errors */
+- if (pfd.revents & POLLERR) {
+- /* if we are trying to get free buffers but none have been queued yet
+- no need to raise a warning */
+- if (timeout == 0) {
+- for (i = 0; i < ctx->num_buffers; i++) {
+- if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
+- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+- }
++ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
++ ff_v4l2_buffer_set_avail(avbuf);
++ avbuf->buf = buf;
++ if (is_mp) {
++ memcpy(avbuf->planes, planes, sizeof(planes));
++ avbuf->buf.m.planes = avbuf->planes;
++ }
++ // Done with any attached buffer
++ av_buffer_unref(&avbuf->ref_buf);
++
++ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
++ // Zero length cap buffer return == EOS
++ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
++
++ // Must reQ so we don't leak
++ // May not matter if the next thing we do is release all the
++ // buffers but better to be tidy.
++ ff_v4l2_buffer_enqueue(avbuf);
++
++ ctx->flag_last = 1;
++ return AVERROR(EPIPE);
+ }
+- else
+- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+
+- return NULL;
++#ifdef V4L2_BUF_FLAG_LAST
++ // If flag_last set then this contains data but is the last frame
++ // so remember that but return OK
++ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
++ ctx->flag_last = 1;
++#endif
+ }
+
+- /* 1. handle resolution changes */
+- if (pfd.revents & POLLPRI) {
+- ret = v4l2_handle_event(ctx);
+- if (ret < 0) {
+- /* if re-init failed, abort */
+- ctx->done = 1;
+- return NULL;
+- }
+- if (ret) {
+- /* if re-init was successful drop the buffer (if there was one)
+- * since we had to reconfigure capture (unmap all buffers)
+- */
+- return NULL;
++ *ppavbuf = avbuf;
++ return 0;
++}
++
++/**
++ * handle resolution change event and end of stream event
++ * Expects to be called after the stream has stopped
++ *
++ * returns 1 if reinit was successful, negative if it failed
++ * returns 0 if reinit was not executed
++ */
++static int
++get_event(V4L2m2mContext * const m)
++{
++ AVCodecContext * const avctx = m->avctx;
++ struct v4l2_event evt = { 0 };
++
++ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
++ const int rv = AVERROR(errno);
++ if (rv == AVERROR(EINTR))
++ continue;
++ if (rv == AVERROR(EAGAIN)) {
++ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
++ return AVERROR_EOF;
+ }
++ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
++ return rv;
+ }
+
+- /* 2. dequeue the buffer */
+- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
++ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
+
+- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+- /* there is a capture buffer ready */
+- if (pfd.revents & (POLLIN | POLLRDNORM))
+- goto dequeue;
++ if (evt.type == V4L2_EVENT_EOS) {
++ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n");
++ return AVERROR_EOF;
++ }
++
++ if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
++ return do_source_change(m);
++
++ return 0;
++}
++
++
++// Get a buffer
++// If output then just gets the buffer in the expected way
++// If capture then runs the capture state m/c to deal with res change etc.
++// If return value == 0 then *ppavbuf != NULL
++
++static int
++get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
++{
++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++ AVCodecContext * const avctx = m->avctx;
++ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
++
++ const unsigned int poll_cap = (POLLIN | POLLRDNORM);
++ const unsigned int poll_out = (POLLOUT | POLLWRNORM);
++ const unsigned int poll_event = POLLPRI;
++
++ *ppavbuf = NULL;
+
+- /* the driver is ready to accept more input; instead of waiting for the capture
+- * buffer to complete we return NULL so input can proceed (we are single threaded)
+- */
+- if (pfd.revents & (POLLOUT | POLLWRNORM))
+- return NULL;
++ for (;;) {
++ struct pollfd pfd = {
++ .fd = m->fd,
++ // If capture && stream not started then assume we are waiting for the initial event
++ .events = !is_cap ? poll_out :
++ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
++ poll_event,
++ };
++ int ret;
++
++ if (ctx->done) {
++ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
++ return AVERROR_EOF;
+ }
+
+-dequeue:
+- memset(&buf, 0, sizeof(buf));
+- buf.memory = V4L2_MEMORY_MMAP;
+- buf.type = ctx->type;
+- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+- memset(planes, 0, sizeof(planes));
+- buf.length = VIDEO_MAX_PLANES;
+- buf.m.planes = planes;
++ // If capture && timeout == -1 then also wait for rx buffer free
++ if (is_cap && timeout == -1 && m->output.streamon && !m->draining)
++ pfd.events |= poll_out;
++
++ // If nothing Qed all we will get is POLLERR - avoid that
++ if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) ||
++ (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
++ (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
++ return AVERROR(ENOSPC);
+ }
+
+- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
+- if (ret) {
+- if (errno != EAGAIN) {
+- ctx->done = 1;
+- if (errno != EPIPE)
+- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
+- ctx->name, av_err2str(AVERROR(errno)));
++ // Timeout kludged s.t. "forever" eventually gives up & produces logging
++ // If waiting for an event when we have seen a last_frame then we expect
++ // it to be ready already so force a short timeout
++ ret = poll(&pfd, 1,
++ ff_v4l2_ctx_eos(ctx) ? 10 :
++ timeout == -1 ? 3000 : timeout);
++ if (ret < 0) {
++ ret = AVERROR(errno); // Remember errno before logging etc.
++ av_assert0(ret < 0);
++ }
++
++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
++ ctx->name, ret, timeout, pfd.events, pfd.revents);
++
++ if (ret < 0) {
++ if (ret == AVERROR(EINTR))
++ continue;
++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
++ return ret;
++ }
++
++ if (ret == 0) {
++ if (timeout == -1)
++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
++ if (ff_v4l2_ctx_eos(ctx)) {
++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
++ ret = get_event(m);
++ if (ret < 0) {
++ ctx->done = 1;
++ return ret;
++ }
+ }
+- return NULL;
++ return AVERROR(EAGAIN);
+ }
+
+- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
+- buf.m.planes[0].bytesused : buf.bytesused;
+- if (bytesused == 0) {
++ if ((pfd.revents & POLLERR) != 0) {
++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
++ return AVERROR_UNKNOWN;
++ }
++
++ if ((pfd.revents & poll_event) != 0) {
++ ret = get_event(m);
++ if (ret < 0) {
+ ctx->done = 1;
+- return NULL;
++ return ret;
+ }
+-#ifdef V4L2_BUF_FLAG_LAST
+- if (buf.flags & V4L2_BUF_FLAG_LAST)
+- ctx->done = 1;
+-#endif
++ continue;
++ }
++
++ if ((pfd.revents & poll_cap) != 0) {
++ ret = dq_buf(ctx, ppavbuf);
++ if (ret == AVERROR(EPIPE))
++ continue;
++ return ret;
+ }
+
+- avbuf = &ctx->buffers[buf.index];
+- avbuf->status = V4L2BUF_AVAILABLE;
+- avbuf->buf = buf;
+- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+- memcpy(avbuf->planes, planes, sizeof(planes));
+- avbuf->buf.m.planes = avbuf->planes;
++ if ((pfd.revents & poll_out) != 0) {
++ if (is_cap)
++ return AVERROR(EAGAIN);
++ return dq_buf(ctx, ppavbuf);
+ }
+- return avbuf;
++
++ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
++ return AVERROR_UNKNOWN;
+ }
++}
+
+- return NULL;
++// Clear out flags and timestamps that should be set by the user
++// Returns the passed avbuf
++static V4L2Buffer *
++clean_v4l2_buffer(V4L2Buffer * const avbuf)
++{
++ struct v4l2_buffer *const buf = &avbuf->buf;
++
++ buf->flags = 0;
++ buf->field = V4L2_FIELD_ANY;
++ buf->timestamp = (struct timeval){0};
++ buf->timecode = (struct v4l2_timecode){0};
++ buf->sequence = 0;
++
++ return avbuf;
+ }
+
+ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
+ {
+- int timeout = 0; /* return when no more buffers to dequeue */
+ int i;
+
+ /* get back as many output buffers as possible */
+ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+- do {
+- } while (v4l2_dequeue_v4l2buf(ctx, timeout));
++ V4L2Buffer * avbuf;
++ do {
++ get_qbuf(ctx, &avbuf, 0);
++ } while (avbuf);
+ }
+
+ for (i = 0; i < ctx->num_buffers; i++) {
+- if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
+- return &ctx->buffers[i];
++ V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
++ if (avbuf->status == V4L2BUF_AVAILABLE)
++ return clean_v4l2_buffer(avbuf);
+ }
+
+ return NULL;
+@@ -452,25 +730,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(
+
+ static int v4l2_release_buffers(V4L2Context* ctx)
+ {
+- struct v4l2_requestbuffers req = {
+- .memory = V4L2_MEMORY_MMAP,
+- .type = ctx->type,
+- .count = 0, /* 0 -> unmaps buffers from the driver */
+- };
+- int i, j;
++ int i;
++ int ret = 0;
++ const int fd = ctx_to_m2mctx(ctx)->fd;
+
+- for (i = 0; i < ctx->num_buffers; i++) {
+- V4L2Buffer *buffer = &ctx->buffers[i];
++ // Orphan any buffers in the wild
++ ff_weak_link_break(&ctx->wl_master);
+
+- for (j = 0; j < buffer->num_planes; j++) {
+- struct V4L2Plane_info *p = &buffer->plane_info[j];
+- if (p->mm_addr && p->length)
+- if (munmap(p->mm_addr, p->length) < 0)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
++ if (ctx->bufrefs) {
++ for (i = 0; i < ctx->num_buffers; i++)
++ av_buffer_unref(ctx->bufrefs + i);
++ }
++
++ if (fd != -1) {
++ struct v4l2_requestbuffers req = {
++ .memory = V4L2_MEMORY_MMAP,
++ .type = ctx->type,
++ .count = 0, /* 0 -> unmap all buffers from the driver */
++ };
++
++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) {
++ if (errno == EINTR)
++ continue;
++
++ ret = AVERROR(errno);
++ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n",
++ ctx->name, av_err2str(ret));
++
++ if (ctx_to_m2mctx(ctx)->output_drm)
++ av_log(logger(ctx), AV_LOG_ERROR,
++ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n"
++ "for all buffers: \n"
++ " 1. drmModeRmFB(..)\n"
++ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
++ break; // Not EINTR: return the error in ret instead of retrying the ioctl forever
+ }
+ }
++ atomic_store(&ctx->q_count, 0);
+
+- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
++ return ret;
+ }
+
+ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
+@@ -499,6 +797,8 @@ static inline int v4l2_try_raw_format(V4
+
+ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
+ {
++ V4L2m2mContext* s = ctx_to_m2mctx(ctx);
++ V4L2m2mPriv *priv = s->avctx->priv_data;
+ enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
+ struct v4l2_fmtdesc fdesc;
+ int ret;
+@@ -517,6 +817,13 @@ static int v4l2_get_raw_format(V4L2Conte
+ if (ret)
+ return AVERROR(EINVAL);
+
++ if (priv->pix_fmt != AV_PIX_FMT_NONE) {
++ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) {
++ fdesc.index++;
++ continue;
++ }
++ }
++
+ pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
+ ret = v4l2_try_raw_format(ctx, pixfmt);
+ if (ret){
+@@ -569,30 +876,99 @@ static int v4l2_get_coded_format(V4L2Con
+ *
+ *****************************************************************************/
+
++
++static void flush_all_buffers_status(V4L2Context* const ctx)
++{
++ int i;
++
++ if (!ctx->bufrefs)
++ return;
++
++ for (i = 0; i < ctx->num_buffers; ++i) {
++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
++ if (buf->status == V4L2BUF_IN_DRIVER)
++ ff_v4l2_buffer_set_avail(buf);
++ }
++ atomic_store(&ctx->q_count, 0);
++}
++
++static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
++{
++ int i;
++ int rv;
++
++ if (!ctx->bufrefs) {
++ rv = ff_v4l2_context_init(ctx);
++ if (rv) {
++ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
++ return rv;
++ }
++ }
++
++ for (i = 0; i < ctx->num_buffers; ++i) {
++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
++ if (buf->status == V4L2BUF_AVAILABLE) {
++ rv = ff_v4l2_buffer_enqueue(buf);
++ if (rv < 0)
++ return rv;
++ }
++ }
++ return 0;
++}
++
+ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
+ {
+ int type = ctx->type;
+- int ret;
++ int ret = 0;
++ AVCodecContext * const avctx = logger(ctx);
+
+- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
+- if (ret < 0)
+- return AVERROR(errno);
++ // Avoid doing anything if there is nothing we can do
++ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
++ return 0;
+
+- ctx->streamon = (cmd == VIDIOC_STREAMON);
++ ff_mutex_lock(&ctx->lock);
+
+- return 0;
++ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
++ stuff_all_buffers(avctx, ctx);
++
++ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
++ const int err = errno;
++ av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
++ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
++ ret = AVERROR(err);
++ }
++ else
++ {
++ if (cmd == VIDIOC_STREAMOFF)
++ flush_all_buffers_status(ctx);
++ else
++ ctx->first_buf = 1;
++
++ ctx->streamon = (cmd == VIDIOC_STREAMON);
++ av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
++ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
++ }
++
++ // Both stream off & on effectively clear flag_last
++ ctx->flag_last = 0;
++
++ ff_mutex_unlock(&ctx->lock);
++
++ return ret;
+ }
+
+ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
+ {
+- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
++ int64_t track_ts;
+ V4L2Buffer* avbuf;
+ int ret;
+
+ if (!frame) {
+ ret = v4l2_stop_encode(ctx);
+ if (ret)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
++ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
+ s->draining= 1;
+ return 0;
+ }
+@@ -601,23 +977,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Co
+ if (!avbuf)
+ return AVERROR(ENOMEM);
+
+- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
++ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
++
++ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
+ if (ret)
+ return ret;
+
+ return ff_v4l2_buffer_enqueue(avbuf);
+ }
+
+-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
++ const void * extdata, size_t extlen)
+ {
+ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
+ V4L2Buffer* avbuf;
+ int ret;
++ int64_t track_ts;
+
+ if (!pkt->size) {
+ ret = v4l2_stop_decode(ctx);
++ // Log but otherwise ignore stop failure
+ if (ret)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
++ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
+ s->draining = 1;
+ return 0;
+ }
+@@ -626,8 +1008,13 @@ int ff_v4l2_context_enqueue_packet(V4L2C
+ if (!avbuf)
+ return AVERROR(EAGAIN);
+
+- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
+- if (ret)
++ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
++
++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
++ if (ret == AVERROR(ENOMEM))
++ av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
++ __func__, pkt->size, avbuf->planes[0].length);
++ else if (ret)
+ return ret;
+
+ return ff_v4l2_buffer_enqueue(avbuf);
+@@ -635,42 +1022,36 @@ int ff_v4l2_context_enqueue_packet(V4L2C
+
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
+ {
++ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
+ V4L2Buffer *avbuf;
++ int rv;
+
+- /*
+- * timeout=-1 blocks until:
+- * 1. decoded frame available
+- * 2. an input buffer is ready to be dequeued
+- */
+- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
+- if (!avbuf) {
+- if (ctx->done)
+- return AVERROR_EOF;
+-
+- return AVERROR(EAGAIN);
+- }
++ do {
++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
++ return rv;
++ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
++ return rv;
++ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
+
+- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
++ return 0;
+ }
+
+ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
+ {
++ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
+ V4L2Buffer *avbuf;
++ int rv;
+
+- /*
+- * blocks until:
+- * 1. encoded packet available
+- * 2. an input buffer ready to be dequeued
+- */
+- avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
+- if (!avbuf) {
+- if (ctx->done)
+- return AVERROR_EOF;
++ do {
++ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
++ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC
++ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
++ return rv;
++ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
+
+- return AVERROR(EAGAIN);
+- }
+-
+- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
++ return 0;
+ }
+
+ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
+@@ -702,78 +1083,160 @@ int ff_v4l2_context_get_format(V4L2Conte
+
+ int ff_v4l2_context_set_format(V4L2Context* ctx)
+ {
+- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++ int ret;
++
++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++ if (ret != 0)
++ return ret;
++
++ // Check returned size against min size and if smaller have another go
++ // Only worry about plane[0] as this is meant to enforce limits for
++ // encoded streams where we might know a bit more about the shape
++ // than the driver
++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
++ if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage)
++ return 0;
++ ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size;
++ }
++ else {
++ if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage)
++ return 0;
++ ctx->format.fmt.pix.sizeimage = ctx->min_buf_size;
++ }
++
++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++ return ret;
+ }
+
+ void ff_v4l2_context_release(V4L2Context* ctx)
+ {
+ int ret;
+
+- if (!ctx->buffers)
++ if (!ctx->bufrefs)
+ return;
+
+ ret = v4l2_release_buffers(ctx);
+ if (ret)
+ av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
+
+- av_freep(&ctx->buffers);
++ av_freep(&ctx->bufrefs);
++ av_buffer_unref(&ctx->frames_ref);
++
++ ff_mutex_destroy(&ctx->lock);
++ pthread_cond_destroy(&ctx->cond);
+ }
+
+-int ff_v4l2_context_init(V4L2Context* ctx)
++
++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
+ {
+- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+ struct v4l2_requestbuffers req;
+- int ret, i;
+-
+- if (!v4l2_type_supported(ctx)) {
+- av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
+- return AVERROR_PATCHWELCOME;
+- }
++ int ret;
++ int i;
+
+- ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
+- if (ret)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
++ av_assert0(ctx->bufrefs == NULL);
+
+ memset(&req, 0, sizeof(req));
+- req.count = ctx->num_buffers;
+- req.memory = V4L2_MEMORY_MMAP;
++ req.count = req_buffers;
++ req.memory = mem;
+ req.type = ctx->type;
+- ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
+- if (ret < 0) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno));
+- return AVERROR(errno);
++ while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
++ if (errno != EINTR) {
++ ret = AVERROR(errno);
++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret));
++ return ret;
++ }
+ }
+
+ ctx->num_buffers = req.count;
+- ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
+- if (!ctx->buffers) {
++ ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs));
++ if (!ctx->bufrefs) {
+ av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
+- return AVERROR(ENOMEM);
++ ret = AVERROR(ENOMEM); // ret is still 0 from the successful REQBUFS; without this the failure returns success
++ goto fail_release;
+ }
+
+- for (i = 0; i < req.count; i++) {
+- ctx->buffers[i].context = ctx;
+- ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
+- if (ret < 0) {
++ ctx->wl_master = ff_weak_link_new(ctx);
++ if (!ctx->wl_master) {
++ ret = AVERROR(ENOMEM);
++ goto fail_release;
++ }
++ for (i = 0; i < ctx->num_buffers; i++) {
++ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
++ if (ret) {
+ av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
+- goto error;
++ goto fail_release;
+ }
+ }
+
+ av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
+ V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
+ req.count,
+- v4l2_get_width(&ctx->format),
+- v4l2_get_height(&ctx->format),
++ ff_v4l2_get_format_width(&ctx->format),
++ ff_v4l2_get_format_height(&ctx->format),
+ V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
+ V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
+
+ return 0;
+
+-error:
++fail_release:
+ v4l2_release_buffers(ctx);
++ av_freep(&ctx->bufrefs);
++ return ret;
++}
++
++int ff_v4l2_context_init(V4L2Context* ctx)
++{
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
++ int ret;
++
++ // It is not valid to reinit a context without a previous release
++ av_assert0(ctx->bufrefs == NULL);
+
+- av_freep(&ctx->buffers);
++ if (!v4l2_type_supported(ctx)) {
++ av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
++ return AVERROR_PATCHWELCOME;
++ }
++
++ ff_mutex_init(&ctx->lock, NULL);
++ pthread_cond_init(&ctx->cond, NULL);
++ atomic_init(&ctx->q_count, 0);
++
++ if (s->output_drm) {
++ AVHWFramesContext *hwframes;
++
++ ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref);
++ if (!ctx->frames_ref) {
++ ret = AVERROR(ENOMEM);
++ goto fail_unlock;
++ }
++
++ hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
++ hwframes->format = AV_PIX_FMT_DRM_PRIME;
++ hwframes->sw_format = ctx->av_pix_fmt;
++ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
++ hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
++ ret = av_hwframe_ctx_init(ctx->frames_ref);
++ if (ret < 0)
++ goto fail_unref_hwframes;
++ }
++
++ ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
++ if (ret) {
++ ret = AVERROR(errno);
++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret));
++ goto fail_unref_hwframes;
++ }
++
++ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
++ if (ret < 0)
++ goto fail_unref_hwframes;
+ return 0;
+
++fail_unref_hwframes:
++ av_buffer_unref(&ctx->frames_ref);
++fail_unlock:
++ ff_mutex_destroy(&ctx->lock);
++ pthread_cond_destroy(&ctx->cond); // Undo pthread_cond_init above, as ff_v4l2_context_release does
+ return ret;
+ }
+--- a/libavcodec/v4l2_context.h
++++ b/libavcodec/v4l2_context.h
+@@ -31,6 +31,7 @@
+ #include "libavutil/pixfmt.h"
+ #include "libavutil/frame.h"
+ #include "libavutil/buffer.h"
++#include "libavutil/thread.h"
+ #include "v4l2_buffers.h"
+
+ typedef struct V4L2Context {
+@@ -70,11 +71,18 @@ typedef struct V4L2Context {
+ */
+ int width, height;
+ AVRational sample_aspect_ratio;
++ struct v4l2_rect selection;
+
+ /**
+- * Indexed array of V4L2Buffers
++ * If the default size of buffer is less than this then try to
++ * set to this.
+ */
+- V4L2Buffer *buffers;
++ uint32_t min_buf_size;
++
++ /**
++ * Indexed array of pointers to V4L2Buffers
++ */
++ AVBufferRef **bufrefs;
+
+ /**
+ * Readonly after init.
+@@ -82,16 +90,38 @@ typedef struct V4L2Context {
+ int num_buffers;
+
+ /**
++ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
++ */
++ enum v4l2_memory buf_mem;
++
++ /**
+ * Whether the stream has been started (VIDIOC_STREAMON has been sent).
+ */
+ int streamon;
+
++ /* 1st buffer after stream on */
++ int first_buf;
++
+ /**
+ * Either no more buffers available or an unrecoverable error was notified
+ * by the V4L2 kernel driver: once set the context has to be exited.
+ */
+ int done;
+
++ int flag_last;
++
++ /**
++ * If NZ then when Qing frame/pkt use this rather than the
++ * "real" PTS
++ */
++ uint64_t track_ts;
++
++ AVBufferRef *frames_ref;
++ atomic_int q_count;
++ struct ff_weak_link_master *wl_master;
++
++ AVMutex lock;
++ pthread_cond_t cond;
+ } V4L2Context;
+
+ /**
+@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2C
+ * @param[in] ctx The V4L2Context to dequeue from.
+ * @param[inout] f The AVFrame to dequeue to.
+ * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
++ *
+ * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
++ * AVERROR(ENOSPC) if no buffer available to put
++ * the frame in
+ */
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
+
+@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co
+ * @param[in] pkt A pointer to an AVPacket.
+ * @return 0 in case of success, a negative error otherwise.
+ */
+-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
+
+ /**
+ * Enqueues a buffer to a V4L2Context from an AVFrame
+--- a/libavcodec/v4l2_m2m.c
++++ b/libavcodec/v4l2_m2m.c
+@@ -36,6 +36,14 @@
+ #include "v4l2_fmt.h"
+ #include "v4l2_m2m.h"
+
++static void
++xlat_init(xlat_track_t * const x)
++{
++ memset(x, 0, sizeof(*x));
++ x->last_pts = AV_NOPTS_VALUE;
++}
++
++
+ static inline int v4l2_splane_video(struct v4l2_capability *cap)
+ {
+ if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) &&
+@@ -68,7 +76,9 @@ static int v4l2_prepare_contexts(V4L2m2m
+
+ s->capture.done = s->output.done = 0;
+ s->capture.name = "capture";
++ s->capture.buf_mem = V4L2_MEMORY_MMAP;
+ s->output.name = "output";
++ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
+ atomic_init(&s->refcount, 0);
+ sem_init(&s->refsync, 0, 0);
+
+@@ -215,13 +225,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
+ av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
+
+ /* 2. unmap the capture buffers (v4l2 and ffmpeg):
+- * we must wait for all references to be released before being allowed
+- * to queue new buffers.
+ */
+- av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
+- if (atomic_load(&s->refcount))
+- while(sem_wait(&s->refsync) == -1 && errno == EINTR);
+-
+ ff_v4l2_context_release(&s->capture);
+
+ /* 3. get the new capture format */
+@@ -240,7 +244,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
+
+ /* 5. complete reinit */
+ s->draining = 0;
+- s->reinit = 0;
+
+ return 0;
+ }
+@@ -274,7 +277,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2
+
+ /* start again now that we know the stream dimensions */
+ s->draining = 0;
+- s->reinit = 0;
+
+ ret = ff_v4l2_context_get_format(&s->output, 0);
+ if (ret) {
+@@ -328,7 +330,13 @@ static void v4l2_m2m_destroy_context(voi
+ ff_v4l2_context_release(&s->capture);
+ sem_destroy(&s->refsync);
+
+- close(s->fd);
++ if (s->fd != -1)
++ close(s->fd);
++
++ av_packet_unref(&s->buf_pkt);
++ av_freep(&s->extdata_data);
++
++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
+
+ av_free(s);
+ }
+@@ -338,17 +346,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p
+ V4L2m2mContext *s = priv->context;
+ int ret;
+
+- ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
+- if (ret)
+- av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
++ if (!s)
++ return 0;
+
+- ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
+- if (ret)
+- av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name);
++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
++
++ if (av_codec_is_decoder(s->avctx->codec))
++ av_packet_unref(&s->buf_pkt);
++
++ if (s->fd >= 0) {
++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
++ if (ret)
++ av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
++
++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++ if (ret)
++ av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name);
++ }
+
+ ff_v4l2_context_release(&s->output);
+
++ close(s->fd);
++ s->fd = -1;
++
+ s->self_ref = NULL;
++ // This is only called on avctx close so after this point we don't have that
++ // Crash sooner if we find we are using it (can still log with avctx = NULL)
++ s->avctx = NULL;
++ priv->context = NULL;
+ av_buffer_unref(&priv->context_ref);
+
+ return 0;
+@@ -392,28 +417,33 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *
+ return v4l2_configure_contexts(s);
+ }
+
+-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
+ {
+- *s = av_mallocz(sizeof(V4L2m2mContext));
+- if (!*s)
++ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
++
++ *pps = NULL;
++ if (!s)
+ return AVERROR(ENOMEM);
+
+- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
++ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
+ &v4l2_m2m_destroy_context, NULL, 0);
+ if (!priv->context_ref) {
+- av_freep(s);
++ av_free(s);
+ return AVERROR(ENOMEM);
+ }
+
+ /* assign the context */
+- priv->context = *s;
+- (*s)->priv = priv;
++ priv->context = s;
++ s->priv = priv;
+
+ /* populate it */
+- priv->context->capture.num_buffers = priv->num_capture_buffers;
+- priv->context->output.num_buffers = priv->num_output_buffers;
+- priv->context->self_ref = priv->context_ref;
+- priv->context->fd = -1;
++ s->capture.num_buffers = priv->num_capture_buffers;
++ s->output.num_buffers = priv->num_output_buffers;
++ s->self_ref = priv->context_ref;
++ s->fd = -1;
++
++ xlat_init(&s->xlat);
+
++ *pps = s;
+ return 0;
+ }
+--- a/libavcodec/v4l2_m2m.h
++++ b/libavcodec/v4l2_m2m.h
+@@ -30,6 +30,7 @@
+ #include <linux/videodev2.h>
+
+ #include "libavcodec/avcodec.h"
++#include "libavutil/pixfmt.h"
+ #include "v4l2_context.h"
+
+ #define container_of(ptr, type, member) ({ \
+@@ -38,7 +39,37 @@
+
+ #define V4L_M2M_DEFAULT_OPTS \
+ { "num_output_buffers", "Number of buffers in the output context",\
+- OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS }
++ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS }
++
++#define FF_V4L2_M2M_TRACK_SIZE 128
++typedef struct V4L2m2mTrackEl {
++ int discard; // If we see this buffer its been flushed, so discard
++ int pending;
++ int pkt_size;
++ int64_t pts;
++ int64_t dts;
++ int64_t reordered_opaque;
++ int64_t pkt_pos;
++ int64_t pkt_duration;
++ int64_t track_pts;
++} V4L2m2mTrackEl;
++
++typedef struct pts_stats_s
++{
++ void * logctx;
++ const char * name; // For debug
++ unsigned int last_count;
++ unsigned int last_interval;
++ int64_t last_pts;
++ int64_t guess;
++} pts_stats_t;
++
++typedef struct xlat_track_s {
++ unsigned int track_no;
++ int64_t last_pts;
++ int64_t last_opaque;
++ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
++} xlat_track_t;
+
+ typedef struct V4L2m2mContext {
+ char devname[PATH_MAX];
+@@ -52,7 +83,6 @@ typedef struct V4L2m2mContext {
+ AVCodecContext *avctx;
+ sem_t refsync;
+ atomic_uint refcount;
+- int reinit;
+
+ /* null frame/packet received */
+ int draining;
+@@ -63,6 +93,36 @@ typedef struct V4L2m2mContext {
+
+ /* reference back to V4L2m2mPriv */
+ void *priv;
++
++ AVBufferRef *device_ref;
++
++ /* generate DRM frames */
++ int output_drm;
++
++ /* input frames are drmprime */
++ int input_drm;
++
++ /* Frame tracking */
++ xlat_track_t xlat;
++ int pending_hw;
++ int pending_n;
++
++ pts_stats_t pts_stat;
++
++ /* req pkt */
++ int req_pkt;
++
++ /* Ext data sent */
++ int extdata_sent;
++ /* Ext data sent in packet - overrides ctx */
++ uint8_t * extdata_data;
++ size_t extdata_size;
++
++#define FF_V4L2_QUIRK_REINIT_ALWAYS 1
++#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2
++ /* Quirks */
++ unsigned int quirks;
++
+ } V4L2m2mContext;
+
+ typedef struct V4L2m2mPriv {
+@@ -73,6 +133,7 @@ typedef struct V4L2m2mPriv {
+
+ int num_output_buffers;
+ int num_capture_buffers;
++ enum AVPixelFormat pix_fmt;
+ } V4L2m2mPriv;
+
+ /**
+@@ -126,4 +187,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
+ */
+ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
+
++
++static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++}
++
++static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++}
++
++static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
++}
++
++static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
++{
++ return ctx->flag_last;
++}
++
++
+ #endif /* AVCODEC_V4L2_M2M_H */
+--- a/libavcodec/v4l2_m2m_dec.c
++++ b/libavcodec/v4l2_m2m_dec.c
+@@ -23,6 +23,10 @@
+
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
++
++#include "libavutil/avassert.h"
++#include "libavutil/hwcontext.h"
++#include "libavutil/hwcontext_drm.h"
+ #include "libavutil/pixfmt.h"
+ #include "libavutil/pixdesc.h"
+ #include "libavutil/opt.h"
+@@ -30,75 +34,111 @@
+ #include "libavcodec/decode.h"
+ #include "libavcodec/internal.h"
+
++#include "libavcodec/hwaccels.h"
++#include "libavcodec/internal.h"
++#include "libavcodec/hwconfig.h"
++
+ #include "v4l2_context.h"
+ #include "v4l2_m2m.h"
+ #include "v4l2_fmt.h"
+
+-static int v4l2_try_start(AVCodecContext *avctx)
++// Pick 64 for max last count - that is >1sec at 60fps
++#define STATS_LAST_COUNT_MAX 64
++#define STATS_INTERVAL_MAX (1 << 30)
++
++#ifndef FF_API_BUFFER_SIZE_T
++#define FF_API_BUFFER_SIZE_T 1
++#endif
++
++static int64_t pts_stats_guess(const pts_stats_t * const stats)
+ {
+- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+- V4L2Context *const capture = &s->capture;
+- V4L2Context *const output = &s->output;
+- struct v4l2_selection selection = { 0 };
+- int ret;
++ if (stats->last_pts == AV_NOPTS_VALUE ||
++ stats->last_interval == 0 ||
++ stats->last_count >= STATS_LAST_COUNT_MAX)
++ return AV_NOPTS_VALUE;
++ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
++}
+
+- /* 1. start the output process */
+- if (!output->streamon) {
+- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
+- if (ret < 0) {
+- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
+- return ret;
++static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
++{
++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
++ if (stats->last_count < STATS_LAST_COUNT_MAX)
++ ++stats->last_count;
++ return;
++ }
++
++ if (stats->last_pts != AV_NOPTS_VALUE) {
++ const int64_t interval = pts - stats->last_pts;
++
++ if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
++ stats->last_count >= STATS_LAST_COUNT_MAX) {
++ if (stats->last_interval != 0)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
++ __func__, stats->name, interval, stats->last_count);
++ stats->last_interval = 0;
++ }
++ else {
++ const int64_t frame_time = interval / (int64_t)stats->last_count;
++
++ if (frame_time != stats->last_interval)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
++ stats->last_interval = frame_time;
+ }
+ }
+
+- if (capture->streamon)
++ stats->last_pts = pts;
++ stats->last_count = 1;
++}
++
++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
++{
++ *stats = (pts_stats_t){
++ .logctx = logctx,
++ .name = name,
++ .last_count = 1,
++ .last_interval = 0,
++ .last_pts = AV_NOPTS_VALUE
++ };
++}
++
++static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
++{
++ int ret;
++ struct v4l2_decoder_cmd cmd = {
++ .cmd = V4L2_DEC_CMD_START,
++ .flags = 0,
++ };
++
++ if (s->output.streamon)
+ return 0;
+
+- /* 2. get the capture format */
+- capture->format.type = capture->type;
+- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
+- if (ret) {
+- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
++ if (ret != 0) {
++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
+ return ret;
+ }
+
+- /* 2.1 update the AVCodecContext */
+- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
+- capture->av_pix_fmt = avctx->pix_fmt;
+-
+- /* 3. set the crop parameters */
+- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+- selection.r.height = avctx->coded_height;
+- selection.r.width = avctx->coded_width;
+- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
+- if (!ret) {
+- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
+- if (ret) {
+- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
+- } else {
+- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
+- /* update the size of the resulting frame */
+- capture->height = selection.r.height;
+- capture->width = selection.r.width;
+- }
++ // STREAMON should do implicit START so this is just for those that don't.
++ // It is optional so don't worry if it fails
++ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
++ ret = AVERROR(errno);
++ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
+ }
+-
+- /* 4. init the capture context now that we have the capture format */
+- if (!capture->buffers) {
+- ret = ff_v4l2_context_init(capture);
+- if (ret) {
+- av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
+- return AVERROR(ENOMEM);
+- }
++ else {
++ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
+ }
++ return 0;
++}
+
+- /* 5. start the capture process */
+- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
+- if (ret) {
+- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
+- return ret;
+- }
++static int v4l2_try_start(AVCodecContext *avctx)
++{
++ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
++ int ret;
+
++ /* 1. start the output process */
++ if ((ret = check_output_streamon(avctx, s)) != 0)
++ return ret;
+ return 0;
+ }
+
+@@ -133,52 +173,525 @@ static int v4l2_prepare_decoder(V4L2m2mC
+ return 0;
+ }
+
+-static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++static void
++set_best_effort_pts(AVCodecContext *const avctx,
++ pts_stats_t * const ps,
++ AVFrame *const frame)
++{
++ pts_stats_add(ps, frame->pts);
++
++#if FF_API_PKT_PTS
++FF_DISABLE_DEPRECATION_WARNINGS
++ frame->pkt_pts = frame->pts;
++FF_ENABLE_DEPRECATION_WARNINGS
++#endif
++ frame->best_effort_timestamp = pts_stats_guess(ps);
++ // If we can't guess from just PTS - try DTS
++ if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
++ frame->best_effort_timestamp = frame->pkt_dts;
++
++ // We can't emulate what s/w does in a useful manner and using the
++ // "correct" answer seems to just confuse things.
++ frame->pkt_dts = frame->pts;
++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
++}
++
++static void
++xlat_flush(xlat_track_t * const x)
++{
++ unsigned int i;
++ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
++ x->track_els[i].pending = 0;
++ x->track_els[i].discard = 1;
++ }
++ x->last_pts = AV_NOPTS_VALUE;
++}
++
++static int
++xlat_pending(const xlat_track_t * const x)
++{
++ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
++ unsigned int i;
++ int r = 0;
++ int64_t now = AV_NOPTS_VALUE;
++
++ for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) {
++ const V4L2m2mTrackEl * const t = x->track_els + n;
++
++ if (!t->pending)
++ continue;
++
++ if (now == AV_NOPTS_VALUE)
++ now = t->dts;
++
++ if (t->pts == AV_NOPTS_VALUE ||
++ ((now == AV_NOPTS_VALUE || t->pts <= now) &&
++ (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts)))
++ ++r;
++ }
++
++ // If we never get any ideas about PTS vs DTS allow a lot more buffer
++ if (now == AV_NOPTS_VALUE)
++ r -= 16;
++
++ return r;
++}
++
++static inline int stream_started(const V4L2m2mContext * const s) {
++ return s->output.streamon;
++}
++
++#define NQ_OK 0
++#define NQ_Q_FULL 1
++#define NQ_SRC_EMPTY 2
++#define NQ_NONE 3
++#define NQ_DRAINING 4
++#define NQ_DEAD 5
++
++#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
++#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
++
++// do_not_get If true then no new packet will be got but status will
++// be set appropriately
++
++// AVERROR_EOF Flushing an already flushed stream
++// -ve Error (all errors except EOF are unexpected)
++// NQ_OK (0) OK
++// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now)
++// NQ_SRC_EMPTY Src empty (do not retry)
++// NQ_NONE Enqueue not attempted
++// NQ_DRAINING At EOS, dQ dest until EOS there too
++// NQ_DEAD Not running (do not retry, do not attempt capture dQ)
++
++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get)
+ {
+- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+- V4L2Context *const capture = &s->capture;
+- V4L2Context *const output = &s->output;
+- AVPacket avpkt = {0};
+ int ret;
+
+- if (s->buf_pkt.size) {
+- avpkt = s->buf_pkt;
+- memset(&s->buf_pkt, 0, sizeof(AVPacket));
+- } else {
+- ret = ff_decode_get_packet(avctx, &avpkt);
+- if (ret < 0 && ret != AVERROR_EOF)
++ // If we don't already have a coded packet - get a new one
++ // We will already have a coded pkt if the output Q was full last time we
++ // tried to Q it
++ if (!s->buf_pkt.size && !do_not_get) {
++ unsigned int i;
++
++ for (i = 0; i < 256; ++i) {
++ uint8_t * side_data;
++#if FF_API_BUFFER_SIZE_T
++ int side_size;
++#else
++ size_t side_size;
++#endif
++ ret = ff_decode_get_packet(avctx, &s->buf_pkt);
++ if (ret != 0)
++ break;
++
++ // New extradata is the only side-data we understand
++ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
++ if (side_data) {
++ av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
++ av_freep(&s->extdata_data);
++ if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d bytes of extra data\n", (int)side_size);
++ return AVERROR(ENOMEM);
++ }
++ memcpy(s->extdata_data, side_data, side_size);
++ s->extdata_size = side_size;
++ s->extdata_sent = 0;
++ }
++
++ if (s->buf_pkt.size != 0)
++ break;
++
++ if (s->buf_pkt.side_data_elems == 0) {
++ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
++ ret = AVERROR_EOF;
++ break;
++ }
++
++ // Retry a side-data only pkt
++ }
++ // If i >= 256 something has gone wrong
++ if (i >= 256) {
++ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
++ return AVERROR(EIO);
++ }
++
++ if (ret == AVERROR(EAGAIN)) {
++ if (!stream_started(s)) {
++ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__);
++ return NQ_DEAD;
++ }
++ return NQ_SRC_EMPTY;
++ }
++
++ if (ret == AVERROR_EOF) {
++ // EOF - enter drain mode
++ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n",
++ ret, s->buf_pkt.size, stream_started(s), s->draining);
++ if (!stream_started(s)) {
++ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n");
++ s->draining = 1;
++ s->capture.done = 1;
++ return AVERROR_EOF;
++ }
++
++ if (!s->draining) {
++ // Calling enqueue with an empty pkt starts drain
++ av_assert0(s->buf_pkt.size == 0);
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
++ if (ret) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
++ return ret;
++ }
++ }
++ return NQ_DRAINING;
++ }
++
++ if (ret < 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
+ return ret;
++ }
+ }
+
+- if (s->draining)
+- goto dequeue;
++ if (s->draining) {
++ if (s->buf_pkt.size) {
++ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
++ av_packet_unref(&s->buf_pkt);
++ }
++ return NQ_DRAINING;
++ }
+
+- ret = ff_v4l2_context_enqueue_packet(output, &avpkt);
+- if (ret < 0) {
+- if (ret != AVERROR(EAGAIN))
+- return ret;
++ if (!s->buf_pkt.size)
++ return NQ_NONE;
+
+- s->buf_pkt = avpkt;
+- /* no input buffers available, continue dequeing */
+- }
++ if ((ret = check_output_streamon(avctx, s)) != 0)
++ return ret;
++
++ if (s->extdata_sent)
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
++ else if (s->extdata_data)
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
++ else
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size);
++
++ if (ret == AVERROR(EAGAIN)) {
++ // Out of input buffers - keep packet
++ ret = NQ_Q_FULL;
++ }
++ else {
++ // In all other cases we are done with this packet
++ av_packet_unref(&s->buf_pkt);
++ s->extdata_sent = 1;
+
+- if (avpkt.size) {
+- ret = v4l2_try_start(avctx);
+ if (ret) {
+- av_packet_unref(&avpkt);
++ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
++ return ret;
++ }
++ }
+
+- /* cant recover */
+- if (ret == AVERROR(ENOMEM))
+- return ret;
++ // Start if we haven't
++ {
++ const int ret2 = v4l2_try_start(avctx);
++ if (ret2) {
++ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2);
++ ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD;
++ }
++ }
++
++ return ret;
++}
++
++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
++{
++ int rv = 0;
+
+- return 0;
++ ff_mutex_lock(&ctx->lock);
++
++ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) {
++ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) {
++ rv = AVERROR(errno);
++ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
++ break;
+ }
+ }
+
+-dequeue:
+- if (!s->buf_pkt.size)
+- av_packet_unref(&avpkt);
+- return ff_v4l2_context_dequeue_frame(capture, frame, -1);
++ ff_mutex_unlock(&ctx->lock);
++ return rv;
++}
++
++// Number of frames over what xlat_pending returns that we keep *16
++// This is a min value - if it appears to be too small the threshold should
++// adjust dynamically.
++#define PENDING_HW_MIN (3 * 16)
++// Offset to use when setting dynamically
++// Set to %16 == 15 to avoid the threshold changing immediately as we relax
++#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1)
++// Number of consecutive times we've failed to get a frame when we prefer it
++// before we increase the prefer threshold (5ms * N = max expected decode
++// time)
++#define PENDING_N_THRESHOLD 6
++
++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++{
++ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
++ int src_rv = NQ_OK;
++ int dst_rv = 1; // Non-zero (done), non-negative (error) number
++ unsigned int i = 0;
++
++ do {
++ const int pending = xlat_pending(&s->xlat);
++ const int prefer_dq = (pending > s->pending_hw / 16);
++ const int last_src_rv = src_rv;
++
++ // Enqueue another pkt for decode if
++ // (a) We don't have a lot of stuff in the buffer already OR
++ // (b) ... we (think we) do but we've failed to get a frame already OR
++ // (c) We've dequeued a lot of frames without asking for input
++ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
++
++ // If we got a frame last time or we've already tried to get a frame and
++ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
++ // indicating that we want more input.
++ // This should mean that once decode starts we enter a stable state where
++ // we alternately ask for input and produce output
++ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
++ break;
++
++ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
++ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
++ break;
++ }
++
++ // Try to get a new frame if
++ // (a) we haven't already got one AND
++ // (b) enqueue returned a status indicating that decode should be attempted
++ if (dst_rv != 0 && TRY_DQ(src_rv)) {
++ // Pick a timeout depending on state
++ const int t =
++ src_rv == NQ_DRAINING ? 300 :
++ prefer_dq ? 5 :
++ src_rv == NQ_Q_FULL ? -1 : 0;
++
++ // Dequeue frame will unref any previous contents of frame
++ // if it returns success so we don't need an explicit unref
++ // when discarding
++ // This returns AVERROR(EAGAIN) on timeout or if
++ // there is room in the input Q and timeout == -1
++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
++
++ // Failure due to no buffer in Q?
++ if (dst_rv == AVERROR(ENOSPC)) {
++ // Wait & retry
++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
++ }
++ }
++
++ // Adjust dynamic pending threshold
++ if (dst_rv == 0) {
++ if (--s->pending_hw < PENDING_HW_MIN)
++ s->pending_hw = PENDING_HW_MIN;
++ s->pending_n = 0;
++
++ set_best_effort_pts(avctx, &s->pts_stat, frame);
++ }
++ else if (dst_rv == AVERROR(EAGAIN)) {
++ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
++ s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
++ s->pending_n = 0;
++ }
++ }
++
++ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
++ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
++ dst_rv = AVERROR_EOF;
++ s->capture.done = 1;
++ }
++ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
++ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
++ s->draining, s->capture.done);
++ else if (dst_rv && dst_rv != AVERROR(EAGAIN))
++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
++ s->draining, s->capture.done, dst_rv);
++ }
++
++ ++i;
++ if (i >= 256) {
++ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i);
++ src_rv = AVERROR(EIO);
++ }
++
++ // Continue trying to enqueue packets if either
++ // (a) we succeeded last time OR
++ // (b) we didn't ret a frame and we can retry the input
++ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
++
++ // Ensure that the frame contains nothing if we aren't returning a frame
++ // (might happen when discarding)
++ if (dst_rv)
++ av_frame_unref(frame);
++
++ // If we got a frame this time ask for a pkt next time
++ s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
++
++#if 0
++ if (dst_rv == 0)
++ {
++ static int z = 0;
++ if (++z > 50) {
++ av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
++ ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++ return -1;
++ }
++ }
++#endif
++
++ return dst_rv == 0 ? 0 :
++ src_rv < 0 ? src_rv :
++ dst_rv < 0 ? dst_rv :
++ AVERROR(EAGAIN);
++}
++
++#if 0
++#include <time.h>
++static int64_t us_time(void)
++{
++ struct timespec ts;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
++}
++
++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++{
++ int ret;
++ const int64_t now = us_time();
++ int64_t done;
++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++ ret = v4l2_receive_frame2(avctx, frame);
++ done = us_time();
++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret);
++ return ret;
++}
++#endif
++
++static int
++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++ unsigned int i;
++ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
++ const uint32_t w = avctx->coded_width;
++ const uint32_t h = avctx->coded_height;
++
++ if (w == 0 || h == 0 || fcc == 0) {
++ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
++ return 0;
++ }
++ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) {
++ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
++ return 0;
++ }
++
++ for (i = 0;; ++i) {
++ struct v4l2_frmsizeenum fs = {
++ .index = i,
++ .pixel_format = fcc,
++ };
++
++ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
++ const int err = AVERROR(errno);
++ if (err == AVERROR(EINTR))
++ continue;
++ if (i == 0 && err == AVERROR(ENOTTY)) {
++ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
++ return 0;
++ }
++ if (err != AVERROR(EINVAL)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err));
++ return err;
++ }
++ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
++ w, h, av_fourcc2str(fcc), i);
++ return err;
++ }
++
++ switch (fs.type) {
++ case V4L2_FRMSIZE_TYPE_DISCRETE:
++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
++ fs.discrete.width,fs.discrete.height);
++ if (w == fs.discrete.width && h == fs.discrete.height)
++ return 0;
++ break;
++ case V4L2_FRMSIZE_TYPE_STEPWISE:
++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++ fs.stepwise.min_width, fs.stepwise.min_height,
++ fs.stepwise.max_width, fs.stepwise.max_height,
++ fs.stepwise.step_width,fs.stepwise.step_height);
++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
++ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 &&
++ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0)
++ return 0;
++ break;
++ case V4L2_FRMSIZE_TYPE_CONTINUOUS:
++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++ fs.stepwise.min_width, fs.stepwise.min_height,
++ fs.stepwise.max_width, fs.stepwise.max_height,
++ fs.stepwise.step_width,fs.stepwise.step_height);
++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
++ return 0;
++ break;
++ default:
++ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type);
++ return AVERROR(EINVAL);
++ }
++ }
++}
++
++static int
++get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++ struct v4l2_capability cap;
++
++ memset(&cap, 0, sizeof(cap));
++ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
++ int err = errno;
++ if (err == EINTR)
++ continue;
++ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err));
++ return AVERROR(err);
++ }
++
++ // Could be made table driven if we have a few more but right now there
++ // seems no point
++
++ // Meson (amlogic) always gives a resolution changed event after output
++ // streamon and userspace must (re)allocate capture buffers and streamon
++ // capture to clear the event even if the capture buffers were the right
++ // size in the first place.
++ if (strcmp(cap.driver, "meson-vdec") == 0)
++ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN;
++
++ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
++ return 0;
++}
++
++// This heuristic is for H264 but use for everything
++static uint32_t max_coded_size(const AVCodecContext * const avctx)
++{
++ uint32_t wxh = avctx->coded_width * avctx->coded_height;
++ uint32_t size;
++
++ size = wxh * 3 / 2;
++ // H.264 Annex A table A-1 gives minCR which is either 2 or 4
++ // unfortunately that doesn't yield an actually useful limit
++ // and it should be noted that frame 0 is special cased to allow
++ // a bigger number which really isn't helpful for us. So just pick
++ // frame_size / 2
++ size /= 2;
++ // Add 64k to allow for any overheads and/or encoder hopefulness
++ // with small WxH
++ return size + (1 << 16);
+ }
+
+ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+@@ -186,12 +699,29 @@ static av_cold int v4l2_decode_init(AVCo
+ V4L2Context *capture, *output;
+ V4L2m2mContext *s;
+ V4L2m2mPriv *priv = avctx->priv_data;
++ int gf_pix_fmt;
+ int ret;
+
++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++ if (avctx->codec_id == AV_CODEC_ID_H264) {
++ if (avctx->ticks_per_frame == 1) {
++ if(avctx->time_base.den < INT_MAX/2) {
++ avctx->time_base.den *= 2;
++ } else
++ avctx->time_base.num /= 2;
++ }
++ avctx->ticks_per_frame = 2;
++ }
++
++ av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
+ ret = ff_v4l2_m2m_create_context(priv, &s);
+ if (ret < 0)
+ return ret;
+
++ pts_stats_init(&s->pts_stat, avctx, "decoder");
++ s->pending_hw = PENDING_HW_MIN;
++
+ capture = &s->capture;
+ output = &s->output;
+
+@@ -199,34 +729,127 @@ static av_cold int v4l2_decode_init(AVCo
+ * by the v4l2 driver; this event will trigger a full pipeline reconfig and
+ * the proper values will be retrieved from the kernel driver.
+ */
+- output->height = capture->height = avctx->coded_height;
+- output->width = capture->width = avctx->coded_width;
++// output->height = capture->height = avctx->coded_height;
++// output->width = capture->width = avctx->coded_width;
++ output->height = capture->height = 0;
++ output->width = capture->width = 0;
+
+ output->av_codec_id = avctx->codec_id;
+ output->av_pix_fmt = AV_PIX_FMT_NONE;
++ output->min_buf_size = max_coded_size(avctx);
+
+ capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+ capture->av_pix_fmt = avctx->pix_fmt;
++ capture->min_buf_size = 0;
++
++ /* the client requests the codec to generate DRM frames:
++ * - data[0] will therefore point to the returned AVDRMFrameDescriptor
++ * check the ff_v4l2_buffer_to_avframe conversion function.
++ * - the DRM frame format is passed in the DRM frame descriptor layer.
++ * check the v4l2_get_drm_frame function.
++ */
++
++ avctx->sw_pix_fmt = avctx->pix_fmt;
++ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
++ avctx->coded_width, avctx->coded_height,
++ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
++
++ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
++ s->output_drm = 1;
++ }
++ else {
++ capture->av_pix_fmt = gf_pix_fmt;
++ s->output_drm = 0;
++ }
++
++ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
++ if (!s->device_ref) {
++ ret = AVERROR(ENOMEM);
++ return ret;
++ }
++
++ ret = av_hwdevice_ctx_init(s->device_ref);
++ if (ret < 0)
++ return ret;
+
+ s->avctx = avctx;
+ ret = ff_v4l2_m2m_codec_init(priv);
+ if (ret) {
+ av_log(avctx, AV_LOG_ERROR, "can't configure decoder\n");
+- s->self_ref = NULL;
+- av_buffer_unref(&priv->context_ref);
+-
+ return ret;
+ }
+
+- return v4l2_prepare_decoder(s);
++ if ((ret = v4l2_prepare_decoder(s)) < 0)
++ return ret;
++
++ if ((ret = get_quirks(avctx, s)) != 0)
++ return ret;
++
++ if ((ret = check_size(avctx, s)) != 0)
++ return ret;
++
++ return 0;
+ }
+
+ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
+ {
+- V4L2m2mPriv *priv = avctx->priv_data;
+- V4L2m2mContext *s = priv->context;
++ int rv;
++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++ rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv);
++ return rv;
++}
++
++static void v4l2_decode_flush(AVCodecContext *avctx)
++{
++ // An alternative and more drastic form of flush is to simply do this:
++ // v4l2_decode_close(avctx);
++ // v4l2_decode_init(avctx);
++ // The downside is that this keeps a decoder open until all the frames
++ // associated with it have been returned. This is a bit wasteful on
++ // possibly limited h/w resources and fails on a Pi for this reason unless
++ // more GPU mem is allocated than is the default.
++
++ V4L2m2mPriv * const priv = avctx->priv_data;
++ V4L2m2mContext * const s = priv->context;
++ V4L2Context * const output = &s->output;
++ V4L2Context * const capture = &s->capture;
++
++ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
++
++ // Reflushing everything is benign, quick and avoids having to worry about
++ // states like EOS processing so don't try to optimize out (having got it
++ // wrong once)
++
++ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
++
++ // Clear any buffered input packet
+ av_packet_unref(&s->buf_pkt);
+- return ff_v4l2_m2m_codec_end(priv);
++
++ // Clear a pending EOS
++ if (ff_v4l2_ctx_eos(capture)) {
++ // Arguably we could delay this but this is easy and doesn't require
++ // thought or extra vars
++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
++ }
++
++ // V4L2 makes no guarantees about whether decoded frames are flushed or not
++ // so mark all frames we are tracking to be discarded if they appear
++ xlat_flush(&s->xlat);
++
++ // resend extradata
++ s->extdata_sent = 0;
++ // clear EOS status vars
++ s->draining = 0;
++ output->done = 0;
++ capture->done = 0;
++
++ // Stream on will occur when we actually submit a new frame
++ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
+ }
+
+ #define OFFSET(x) offsetof(V4L2m2mPriv, x)
+@@ -235,10 +858,16 @@ static av_cold int v4l2_decode_close(AVC
+ static const AVOption options[] = {
+ V4L_M2M_DEFAULT_OPTS,
+ { "num_capture_buffers", "Number of buffers in the capture context",
+- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS },
++ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
++ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
+ { NULL},
+ };
+
++static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
++ HW_CONFIG_INTERNAL(DRM_PRIME),
++ NULL
++};
++
+ #define M2MDEC_CLASS(NAME) \
+ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
+ .class_name = #NAME "_v4l2m2m_decoder", \
+@@ -259,9 +888,15 @@ static const AVOption options[] = {
+ .init = v4l2_decode_init, \
+ .receive_frame = v4l2_receive_frame, \
+ .close = v4l2_decode_close, \
++ .flush = v4l2_decode_flush, \
+ .bsfs = bsf_name, \
+ .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
+- .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS, \
++ .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \
++ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
++ AV_PIX_FMT_NV12, \
++ AV_PIX_FMT_YUV420P, \
++ AV_PIX_FMT_NONE}, \
++ .hw_configs = v4l2_m2m_hw_configs, \
+ .wrapper_name = "v4l2m2m", \
+ }
+
+--- a/libavcodec/v4l2_m2m_enc.c
++++ b/libavcodec/v4l2_m2m_enc.c
+@@ -24,6 +24,8 @@
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
+ #include <search.h>
++#include <drm_fourcc.h>
++
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
+ #include "libavutil/pixdesc.h"
+@@ -37,6 +39,34 @@
+ #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
+ #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
+
++// P030 should be defined in drm_fourcc.h and hopefully will be sometime
++// in the future but until then...
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
++#endif
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++#ifndef V4L2_CID_CODEC_BASE
++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in videodev2.h and hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
+ static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
+ {
+ struct v4l2_streamparm parm = { 0 };
+@@ -147,15 +177,14 @@ static inline int v4l2_mpeg4_profile_fro
+ static int v4l2_check_b_frame_support(V4L2m2mContext *s)
+ {
+ if (s->avctx->max_b_frames)
+- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
++ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
+
+- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
++ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);
+ v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
+ if (s->avctx->max_b_frames == 0)
+ return 0;
+
+ avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
+-
+ return AVERROR_PATCHWELCOME;
+ }
+
+@@ -270,13 +299,184 @@ static int v4l2_prepare_encoder(V4L2m2mC
+ return 0;
+ }
+
++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame)
++{   // Fill format (size, pixelformat, stride) from the DRM descriptor in frame->data[0]; 0 or AVERROR(EINVAL)
++    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++    const uint32_t drm_fmt = src->layers[0].format;
++    // Treat INVALID as LINEAR
++    const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
++        DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
++    uint32_t pix_fmt = 0;
++    uint32_t w = 0;
++    uint32_t h = 0;
++    uint32_t bpl = src->layers[0].planes[0].pitch;
++
++    // We really don't expect multiple layers
++    // All formats that we currently cope with are single object
++    // A zero pitch is rejected as well: the linear cases below divide by it
++    if (src->nb_layers != 1 || src->nb_objects != 1 || bpl == 0)
++        return AVERROR(EINVAL);
++
++    switch (drm_fmt) {
++    case DRM_FORMAT_YUV420:
++        if (mod == DRM_FORMAT_MOD_LINEAR) {
++            if (src->layers[0].nb_planes != 3)
++                break;
++            pix_fmt = V4L2_PIX_FMT_YUV420;
++            h = src->layers[0].planes[1].offset / bpl;  // rows of plane 0 = offset of plane 1 / pitch
++            w = bpl;
++        }
++        break;
++
++    case DRM_FORMAT_NV12:
++        if (mod == DRM_FORMAT_MOD_LINEAR) {
++            if (src->layers[0].nb_planes != 2)
++                break;
++            pix_fmt = V4L2_PIX_FMT_NV12;
++            h = src->layers[0].planes[1].offset / bpl;
++            w = bpl;
++        }
++        else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++            if (src->layers[0].nb_planes != 2)
++                break;
++            pix_fmt = V4L2_PIX_FMT_NV12_COL128;
++            w = bpl;
++            h = src->layers[0].planes[1].offset / 128;
++            bpl = fourcc_mod_broadcom_param(mod);   // stride is taken from the modifier parameter
++        }
++        break;
++
++    case DRM_FORMAT_P030:
++        if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++            if (src->layers[0].nb_planes != 2)
++                break;
++            pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
++            w = bpl / 2;  // Matching lie to how we construct this
++            h = src->layers[0].planes[1].offset / 128;
++            bpl = fourcc_mod_broadcom_param(mod);
++        }
++        break;
++
++    default:
++        break;
++    }
++
++    if (!pix_fmt)   // no case above recognised the format/modifier/plane-count combination
++        return AVERROR(EINVAL);
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
++        struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
++
++        pix->width = w;
++        pix->height = h;
++        pix->pixelformat = pix_fmt;
++        pix->plane_fmt[0].bytesperline = bpl;
++        pix->num_planes = 1;
++    }
++    else {
++        struct v4l2_pix_format *const pix = &format->fmt.pix;
++
++        pix->width = w;
++        pix->height = h;
++        pix->pixelformat = pix_fmt;
++        pix->bytesperline = bpl;
++    }
++
++    return 0;
++}
++
++// Compare two V4L2 formats on the fields checked here: buffer type, pixel
++// format and bytes-per-line (of every plane when multiplanar).
++// Returns 1 when they agree, 0 otherwise.
++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b)
++{
++    if (a->type != b->type)
++        return 0;
++
++    if (!V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
++        // Single-planar: one pixelformat / stride pair to check
++        const struct v4l2_pix_format *const fa = &a->fmt.pix;
++        const struct v4l2_pix_format *const fb = &b->fmt.pix;
++        return fa->pixelformat == fb->pixelformat &&
++               fa->bytesperline == fb->bytesperline;
++    }
++    else {
++        const struct v4l2_pix_format_mplane *const ma = &a->fmt.pix_mp;
++        const struct v4l2_pix_format_mplane *const mb = &b->fmt.pix_mp;
++        unsigned int plane = 0;
++
++        if (ma->pixelformat != mb->pixelformat || ma->num_planes != mb->num_planes)
++            return 0;
++        while (plane != ma->num_planes && ma->plane_fmt[plane].bytesperline == mb->plane_fmt[plane].bytesperline)
++            ++plane;
++        return plane == ma->num_planes;  // reached the end only if every stride matched
++    }
++}
++
++
+ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
+ {
+ V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+ V4L2Context *const output = &s->output;
+
++ // Signal EOF if needed
++ if (!frame) {
++ return ff_v4l2_context_enqueue_frame(output, frame);
++ }
++
++ if (s->input_drm && !output->streamon) {
++ int rv;
++ struct v4l2_format req_format = {.type = output->format.type};
++
++ // Set format when we first get a buffer
++ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
++ return rv;
++ }
++
++ ff_v4l2_context_release(output);
++
++ output->format = req_format;
++
++ if ((rv = ff_v4l2_context_set_format(output)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
++ return rv;
++ }
++
++ if (!fmt_eq(&req_format, &output->format)) {
++ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
++ return AVERROR(EINVAL);
++ }
++
++ output->selection.top = frame->crop_top;
++ output->selection.left = frame->crop_left;
++ output->selection.width = av_frame_cropped_width(frame);
++ output->selection.height = av_frame_cropped_height(frame);
++
++ if ((rv = ff_v4l2_context_init(output)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
++ return rv;
++ }
++
++        {   // Pass the frame's crop rectangle to the driver; a failure is only a warning
++            struct v4l2_selection selection = {
++                .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
++                .target = V4L2_SEL_TGT_CROP,
++                .r = output->selection
++            };
++            if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) {
++                av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
++                       selection.r.width, selection.r.height, selection.r.left, selection.r.top,
++                       av_err2str(AVERROR(errno)));
++            } else {   // report success only when the ioctl actually succeeded
++                av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n", selection.r.width, selection.r.height, selection.r.left, selection.r.top);
++            }
++        }
++ }
++
+ #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
+- if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
++ if (frame->pict_type == AV_PICTURE_TYPE_I)
+ v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
+ #endif
+
+@@ -310,7 +510,70 @@ static int v4l2_receive_packet(AVCodecCo
+ }
+
+ dequeue:
+- return ff_v4l2_context_dequeue_packet(capture, avpkt);
++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
++ return ret;
++
++ if (capture->first_buf == 1) {
++ uint8_t * data;
++ const int len = avpkt->size;
++
++ // 1st buffer after streamon should be SPS/PPS
++ capture->first_buf = 2;
++
++ // Clear both possible stores so there is no chance of confusion
++ av_freep(&s->extdata_data);
++ s->extdata_size = 0;
++ av_freep(&avctx->extradata);
++ avctx->extradata_size = 0;
++
++ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL)
++ memcpy(data, avpkt->data, len);
++
++ av_packet_unref(avpkt);
++
++ if (data == NULL)
++ return AVERROR(ENOMEM);
++
++ // We need to copy the header, but keep local if not global
++ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
++ avctx->extradata = data;
++ avctx->extradata_size = len;
++ }
++ else {
++ s->extdata_data = data;
++ s->extdata_size = len;
++ }
++
++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
++ return ret;
++ }
++
++ // First frame must be key so mark as such even if encoder forgot
++ if (capture->first_buf == 2)
++ avpkt->flags |= AV_PKT_FLAG_KEY;
++
++ // Add SPS/PPS to the start of every key frame if non-global headers
++ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
++ const size_t newlen = s->extdata_size + avpkt->size;
++ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
++
++ if (buf == NULL) {
++ av_packet_unref(avpkt);
++ return AVERROR(ENOMEM);
++ }
++
++ memcpy(buf->data, s->extdata_data, s->extdata_size);
++ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
++
++ av_buffer_unref(&avpkt->buf);
++ avpkt->buf = buf;
++ avpkt->data = buf->data;
++ avpkt->size = newlen;
++ }
++
++// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
++ capture->first_buf = 0;
++ return 0;
+ }
+
+ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+@@ -322,6 +585,8 @@ static av_cold int v4l2_encode_init(AVCo
+ uint32_t v4l2_fmt_output;
+ int ret;
+
++ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
++
+ ret = ff_v4l2_m2m_create_context(priv, &s);
+ if (ret < 0)
+ return ret;
+@@ -329,13 +594,17 @@ static av_cold int v4l2_encode_init(AVCo
+ capture = &s->capture;
+ output = &s->output;
+
++ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
++
+ /* common settings output/capture */
+ output->height = capture->height = avctx->height;
+ output->width = capture->width = avctx->width;
+
+ /* output context */
+ output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+- output->av_pix_fmt = avctx->pix_fmt;
++ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
++ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
++ AV_PIX_FMT_YUV420P;
+
+ /* capture context */
+ capture->av_codec_id = avctx->codec_id;
+@@ -354,7 +623,7 @@ static av_cold int v4l2_encode_init(AVCo
+ v4l2_fmt_output = output->format.fmt.pix.pixelformat;
+
+ pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
+- if (pix_fmt_output != avctx->pix_fmt) {
++ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
+ av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
+ return AVERROR(EINVAL);
+--- /dev/null
++++ b/libavcodec/v4l2_req_decode_q.c
+@@ -0,0 +1,84 @@
++#include <memory.h>
++#include <semaphore.h>
++#include <pthread.h>
++
++#include "v4l2_req_decode_q.h"
++
++int decode_q_in_q(const req_decode_ent * const d)
++{
++ return d->in_q;
++}
++
++void decode_q_add(req_decode_q * const q, req_decode_ent * const d)
++{
++    // Append d at the tail of q and flag it as queued, all under q_lock
++    pthread_mutex_lock(&q->q_lock);
++    if (q->head) {
++        q->tail->next = d;
++        d->prev = q->tail;
++    }
++    else {
++        // Empty queue: d becomes the head as well as the tail
++        q->head = d;
++        d->prev = NULL;
++    }
++    q->tail = d;
++    d->next = NULL;
++    d->in_q = 1;
++    pthread_mutex_unlock(&q->q_lock);
++}
++
++// Remove entry from Q - if head wake-up anything that was waiting
++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d)
++{
++ int try_signal = 0;
++
++ if (!d->in_q) // NOTE(review): in_q is read before q_lock is taken - confirm callers serialise add/remove per entry
++ return;
++
++ pthread_mutex_lock(&q->q_lock);
++ if (d->prev) // unlink from the predecessor, or advance head when d was first
++ d->prev->next = d->next;
++ else {
++ try_signal = 1; // Only need to signal if we were head
++ q->head = d->next;
++ }
++
++ if (d->next) // unlink from the successor, or pull tail back when d was last
++ d->next->prev = d->prev;
++ else
++ q->tail = d->prev;
++
++ // Not strictly needed but makes debug easier
++ d->next = NULL;
++ d->prev = NULL;
++ d->in_q = 0;
++ pthread_mutex_unlock(&q->q_lock);
++
++ if (try_signal) // broadcast after unlocking; decode_q_wait() re-checks q->head under the lock
++ pthread_cond_broadcast(&q->q_cond);
++}
++
++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d)
++{
++ pthread_mutex_lock(&q->q_lock);
++
++ while (q->head != d)
++ pthread_cond_wait(&q->q_cond, &q->q_lock);
++
++ pthread_mutex_unlock(&q->q_lock);
++}
++
++void decode_q_uninit(req_decode_q * const q)
++{
++ pthread_mutex_destroy(&q->q_lock);
++ pthread_cond_destroy(&q->q_cond);
++}
++
++void decode_q_init(req_decode_q * const q)
++{
++ memset(q, 0, sizeof(*q));
++ pthread_mutex_init(&q->q_lock, NULL);
++ pthread_cond_init(&q->q_cond, NULL);
++}
++
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_decode_q.h
+@@ -0,0 +1,25 @@
++#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H
++#define AVCODEC_V4L2_REQ_DECODE_Q_H
++
++typedef struct req_decode_ent {
++ struct req_decode_ent * next;
++ struct req_decode_ent * prev;
++ int in_q;
++} req_decode_ent;
++
++typedef struct req_decode_q {
++ pthread_mutex_t q_lock;
++ pthread_cond_t q_cond;
++ req_decode_ent * head;
++ req_decode_ent * tail;
++} req_decode_q;
++
++int decode_q_in_q(const req_decode_ent * const d);
++void decode_q_add(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_uninit(req_decode_q * const q);
++void decode_q_init(req_decode_q * const q);
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_devscan.c
+@@ -0,0 +1,449 @@
++#include <errno.h>
++#include <fcntl.h>
++#include <libudev.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++
++#include <sys/ioctl.h>
++#include <sys/sysmacros.h>
++
++#include <linux/media.h>
++#include <linux/videodev2.h>
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_utils.h"
++
++struct decdev {
++ enum v4l2_buf_type src_type;
++ uint32_t src_fmt_v4l2;
++ const char * vname;
++ const char * mname;
++};
++
++struct devscan {
++ struct decdev env;
++ unsigned int dev_size;
++ unsigned int dev_count;
++ struct decdev *devs;
++};
++
++static int video_src_pixfmt_supported(uint32_t fmt)
++{
++ return 1;
++}
++
++static void v4l2_setup_format(struct v4l2_format *format, unsigned int type,
++ unsigned int width, unsigned int height,
++ unsigned int pixelformat)
++{
++ unsigned int sizeimage;
++
++ memset(format, 0, sizeof(*format));
++ format->type = type;
++
++ sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
++ format->fmt.pix_mp.width = width;
++ format->fmt.pix_mp.height = height;
++ format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage;
++ format->fmt.pix_mp.pixelformat = pixelformat;
++ } else {
++ format->fmt.pix.width = width;
++ format->fmt.pix.height = height;
++ format->fmt.pix.sizeimage = sizeimage;
++ format->fmt.pix.pixelformat = pixelformat;
++ }
++}
++
++static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
++ unsigned int width, unsigned int height)
++{
++ struct v4l2_format format;
++
++ v4l2_setup_format(&format, type, width, height, pixelformat);
++
++ return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0;
++}
++
++static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities)
++{
++    // Run VIDIOC_QUERYCAP and hand back device_caps when the driver sets
++    // V4L2_CAP_DEVICE_CAPS, otherwise the plain capabilities field.
++    // Returns 0 on success or -errno if the ioctl fails.
++    struct v4l2_capability cap = { 0 };
++
++    if (ioctl(video_fd, VIDIOC_QUERYCAP, &cap) < 0)
++        return -errno;
++
++    if (!capabilities)
++        return 0;   // nothing to store; the query itself succeeded
++
++    *capabilities = (cap.capabilities & V4L2_CAP_DEVICE_CAPS) ?
++        cap.device_caps : cap.capabilities;
++
++    return 0;
++}
++
++static int devscan_add(struct devscan *const scan, // appends one device entry; returns 0 or -ENOMEM
++ enum v4l2_buf_type src_type,
++ uint32_t src_fmt_v4l2,
++ const char * vname,
++ const char * mname)
++{
++ struct decdev *d;
++
++ if (scan->dev_size <= scan->dev_count) { // array full (or not yet allocated): grow it
++ unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2; // start at 4 entries, then double
++ d = realloc(scan->devs, n * sizeof(*d));
++ if (!d)
++ return -ENOMEM; // scan->devs is left untouched on failure
++ scan->devs = d;
++ scan->dev_size = n;
++ }
++
++ d = scan->devs + scan->dev_count; // next free slot; only counted once fully filled in
++ d->src_type = src_type;
++ d->src_fmt_v4l2 = src_fmt_v4l2;
++ d->vname = strdup(vname); // the entry keeps its own copies of both path strings
++ if (!d->vname)
++ return -ENOMEM;
++ d->mname = strdup(mname);
++ if (!d->mname) {
++ free((char *)d->vname); // undo the first copy so the uncounted slot holds no allocation
++ return -ENOMEM;
++ }
++ ++scan->dev_count;
++ return 0;
++}
++
++void devscan_delete(struct devscan **const pScan)
++{
++    struct devscan * const doomed = *pScan;
++    unsigned int left;
++
++    if (doomed == NULL)
++        return;
++    *pScan = NULL;  // the caller's handle is cleared before anything is freed
++
++    for (left = doomed->dev_count; left != 0; --left) {  // free each entry's two path strings
++        free((char*)doomed->devs[left - 1].mname);
++        free((char*)doomed->devs[left - 1].vname);
++    }
++    free(doomed->devs);
++    free(doomed);
++}
++
++#define REQ_BUF_CAPS (\
++ V4L2_BUF_CAP_SUPPORTS_DMABUF |\
++ V4L2_BUF_CAP_SUPPORTS_REQUESTS |\
++ V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF)
++
++static void probe_formats(void * const dc,
++                          struct devscan *const scan,
++                          const int fd,
++                          const unsigned int type_v4l2,
++                          const char *const mpath,
++                          const char *const vpath)
++{   // Add to scan every format of type_v4l2 that fd can set and whose buffers have REQ_BUF_CAPS
++    unsigned int i;
++    for (i = 0;; ++i) {
++        struct v4l2_fmtdesc fmtdesc = {
++            .index = i,
++            .type = type_v4l2
++        };
++        struct v4l2_requestbuffers rbufs = {
++            .count = 0,
++            .type = type_v4l2,
++            .memory = V4L2_MEMORY_MMAP
++        };
++        int rv;
++        while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++            if (errno == EINTR)
++                continue;
++            if (errno != EINVAL)   // EINVAL ends the enumeration quietly; anything else is logged
++                request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
++            return;
++        }
++        if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
++            continue;
++
++        if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) {
++            request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
++            continue;
++        }
++
++        while ((rv = ioctl(fd, VIDIOC_REQBUFS, &rbufs)) != 0 && errno == EINTR)  // retry on EINTR only
++            /* retry */;
++        if (rv != 0) {   // a real failure: skip this format instead of spinning on the ioctl
++            request_debug(dc, "%s: Reqbufs failed\n", vpath);
++            continue;
++        }
++        if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) {
++            request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
++            continue;
++        }
++
++        request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n", mpath, vpath, fmtdesc.pixelformat, type_v4l2);
++        if (devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath) != 0)
++            return;   // devscan_add only fails with -ENOMEM: stop probing
++    }
++}
++
++
++static int probe_video_device(void * const dc,
++ struct udev_device *const device,
++ struct devscan *const scan,
++ const char *const mpath)
++{
++ int ret;
++ unsigned int capabilities = 0;
++ int video_fd = -1;
++
++ const char *path = udev_device_get_devnode(device);
++ if (!path) {
++ request_err(dc, "%s: get video device devnode failed\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ video_fd = open(path, O_RDWR, 0);
++ if (video_fd == -1) {
++ ret = -errno;
++ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
++ goto fail;
++ }
++
++ ret = v4l2_query_capabilities(video_fd, &capabilities);
++ if (ret < 0) {
++ request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities);
++
++ if (!(capabilities & V4L2_CAP_STREAMING)) {
++ request_debug(dc, "%s: missing required streaming capability\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) {
++ request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ /* Should check capture formats too... */
++ if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0)
++ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
++ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
++ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
++
++ close(video_fd);
++ return 0;
++
++fail:
++ if (video_fd >= 0)
++ close(video_fd);
++ return ret;
++}
++
++static int probe_media_device(void * const dc,
++                              struct udev_device *const device,
++                              struct devscan *const scan)
++{   // Open the media node, read its topology and probe each V4L video interface into scan
++    int ret = 0;   // stays 0 when the device has no V4L video interface at all
++    int rv;
++    struct media_device_info device_info = { 0 };
++    struct media_v2_topology topology = { 0 };
++    struct media_v2_interface *interfaces = NULL;
++    struct udev *udev = udev_device_get_udev(device);
++    struct udev_device *video_device;
++    dev_t devnum;
++    int media_fd = -1;
++
++    const char *path = udev_device_get_devnode(device);
++    if (!path) {
++        request_err(dc, "%s: get media device devnode failed\n", __func__);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    media_fd = open(path, O_RDWR, 0);
++    if (media_fd < 0) {
++        ret = -errno;
++        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
++    if (rv < 0) {
++        ret = -errno;
++        request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    // First topology call has no array pointers set: it only fills in the counts
++    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
++    if (rv < 0) {
++        ret = -errno;
++        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    if (topology.num_interfaces <= 0) {
++        request_err(dc, "%s: media device has no interfaces\n", __func__);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    interfaces = calloc(topology.num_interfaces, sizeof(*interfaces));
++    if (!interfaces) {
++        request_err(dc, "%s: allocating media interface struct failed\n", __func__);
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
++    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);   // second call fills the interface array
++    if (rv < 0) {
++        ret = -errno;
++        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    for (int i = 0; i < topology.num_interfaces; i++) {
++        if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO)
++            continue;
++
++        devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
++        video_device = udev_device_new_from_devnum(udev, 'c', devnum);
++        if (!video_device) {
++            ret = -errno;
++            request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device);
++            continue;
++        }
++
++        ret = probe_video_device(dc, video_device, scan, path);
++        udev_device_unref(video_device);
++
++        if (ret != 0)   // first failing video node ends the scan of this media device
++            goto fail;
++    }
++
++fail:   // shared exit for success and failure
++    free(interfaces);
++    if (media_fd != -1)
++        close(media_fd);
++    return ret;
++}
++
++const char *decdev_media_path(const struct decdev *const dev)
++{
++ return !dev ? NULL : dev->mname;
++}
++
++const char *decdev_video_path(const struct decdev *const dev)
++{
++ return !dev ? NULL : dev->vname;
++}
++
++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev)
++{
++ return !dev ? 0 : dev->src_type;
++}
++
++uint32_t decdev_src_pixelformat(const struct decdev *const dev)
++{
++ return !dev ? 0 : dev->src_fmt_v4l2;
++}
++
++
++const struct decdev *devscan_find(struct devscan *const scan,
++                                  const uint32_t src_fmt_v4l2)
++{
++    const struct decdev *dev = scan->devs;
++    unsigned int remaining = scan->dev_count;
++    // Environment-supplied media/video paths take precedence over scan results
++    if (scan->env.mname != NULL && scan->env.vname != NULL)
++        return &scan->env;
++    // Format 0 selects the first scanned device, if there is one
++    if (src_fmt_v4l2 == 0)
++        return remaining != 0 ? dev : NULL;
++
++    for (; remaining != 0; --remaining, ++dev)
++        if (dev->src_fmt_v4l2 == src_fmt_v4l2)
++            return dev;
++    return NULL;
++}
++
++int devscan_build(void * const dc, struct devscan **pscan)
++{   // Build a device list in *pscan (env override or udev "media" scan); returns 0 or -ENOMEM
++    int ret;
++    struct udev *udev = NULL;   // NULL so the fail path can unref before udev_new() has run
++    struct udev_enumerate *enumerate;
++    struct udev_list_entry *devices;
++    struct udev_list_entry *entry;
++    struct udev_device *device;
++    struct devscan * scan;
++
++    *pscan = NULL;
++
++    scan = calloc(1, sizeof(*scan));
++    if (!scan) {
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
++    scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
++    if (scan->env.mname && scan->env.vname) {   // both overrides set: skip the udev scan entirely
++        request_info(dc, "Media/video device env overrides found: %s,%s\n",
++                     scan->env.mname, scan->env.vname);
++        *pscan = scan;
++        return 0;
++    }
++
++    udev = udev_new();
++    if (!udev) {
++        request_err(dc, "%s: allocating udev context failed\n", __func__);
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    enumerate = udev_enumerate_new(udev);
++    if (!enumerate) {
++        request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    udev_enumerate_add_match_subsystem(enumerate, "media");
++    udev_enumerate_scan_devices(enumerate);
++
++    devices = udev_enumerate_get_list_entry(enumerate);
++    udev_list_entry_foreach(entry, devices) {
++        const char *path = udev_list_entry_get_name(entry);
++        if (!path)
++            continue;
++
++        device = udev_device_new_from_syspath(udev, path);
++        if (!device)
++            continue;
++
++        probe_media_device(dc, device, scan);   // per-device failures do not abort the scan
++        udev_device_unref(device);
++    }
++
++    udev_enumerate_unref(enumerate);
++    udev_unref(udev);   // scan holds only strdup'd strings, so the udev context can go now
++    *pscan = scan;
++    return 0;
++
++fail:
++    udev_unref(udev);
++    devscan_delete(&scan);
++    return ret;
++}
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_devscan.h
+@@ -0,0 +1,21 @@
++#ifndef _DEVSCAN_H_
++#define _DEVSCAN_H_
++
++struct devscan;
++struct decdev;
++enum v4l2_buf_type;
++
++/* These return pointers to data in the devscan structure and so are valid
++ * for the lifetime of that
++ */
++const char *decdev_media_path(const struct decdev *const dev);
++const char *decdev_video_path(const struct decdev *const dev);
++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
++uint32_t decdev_src_pixelformat(const struct decdev *const dev);
++
++const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
++
++int devscan_build(void * const dc, struct devscan **pscan);
++void devscan_delete(struct devscan **const pScan);
++
++#endif
+--- /dev/null
++++ b/libavcodec/v4l2_req_dmabufs.c
+@@ -0,0 +1,266 @@
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <inttypes.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <string.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++#include <linux/mman.h>
++#include <linux/dma-buf.h>
++#include <linux/dma-heap.h>
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_utils.h"
++
++#define DMABUF_NAME1 "/dev/dma_heap/linux,cma"
++#define DMABUF_NAME2 "/dev/dma_heap/reserved"
++
++#define TRACE_ALLOC 0
++
++struct dmabufs_ctl {
++ int fd;
++ size_t page_size;
++};
++
++struct dmabuf_h {
++ int fd;
++ size_t size;
++ size_t len;
++ void * mapptr;
++};
++
++#if TRACE_ALLOC
++static unsigned int total_bufs = 0;
++static size_t total_size = 0;
++#endif
++
++struct dmabuf_h * dmabuf_import(int fd, size_t size)
++{   // Wrap a dup() of fd in a new handle of the given size; NULL on size 0, dup or malloc failure
++    struct dmabuf_h *dh;
++    // Reject empty buffers before dup() so the duplicate fd cannot leak
++    if (size == 0)
++        return NULL;
++    if ((fd = dup(fd)) < 0)
++        return NULL;
++    dh = malloc(sizeof(*dh));
++    if (!dh) {
++        close(fd);
++        return NULL;
++    }
++
++    *dh = (struct dmabuf_h) {
++        .fd = fd,
++        .size = size,
++        .mapptr = MAP_FAILED    // not mapped yet; dmabuf_map() maps on first use
++    };
++
++#if TRACE_ALLOC
++    ++total_bufs;
++    total_size += dh->size;
++    request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++    return dh;
++}
++
++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
++{
++ struct dmabuf_h * dh;
++ struct dma_heap_allocation_data data = {
++ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
++ .fd = 0,
++ .fd_flags = O_RDWR,
++ .heap_flags = 0
++ };
++
++ if (old != NULL) {
++ if (old->size == data.len) {
++ return old;
++ }
++ dmabuf_free(old);
++ }
++
++ if (size == 0 ||
++ (dh = malloc(sizeof(*dh))) == NULL)
++ return NULL;
++
++ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
++ int err = errno;
++ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
++ (uint64_t)data.len,
++ dbsc->fd,
++ err,
++ strerror(err));
++ if (err == EINTR)
++ continue;
++ goto fail;
++ }
++
++ *dh = (struct dmabuf_h){
++ .fd = data.fd,
++ .size = (size_t)data.len,
++ .mapptr = MAP_FAILED
++ };
++
++#if TRACE_ALLOC
++ ++total_bufs;
++ total_size += dh->size;
++ request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++ return dh;
++
++fail:
++ free(dh);
++ return NULL;
++}
++
++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
++{
++    // Issue DMA_BUF_IOCTL_SYNC with flags, retrying on EINTR; returns 0 or -errno
++    struct dma_buf_sync req = { .flags = flags };
++    for (;;) {
++        int saved;
++        if (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &req) != -1)
++            return 0;
++        if ((saved = errno) == EINTR)
++            continue;
++        request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
++        return -saved;
++    }
++}
++
++int dmabuf_write_start(struct dmabuf_h * const dh)
++{
++ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE); /* start of a write access; 0 or -errno from dmabuf_sync() */
++}
++
++int dmabuf_write_end(struct dmabuf_h * const dh)
++{
++ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE); /* end of a write access; 0 or -errno from dmabuf_sync() */
++}
++
++int dmabuf_read_start(struct dmabuf_h * const dh)
++{
++ if (!dmabuf_map(dh)) /* unlike the write variants, this maps the buffer first */
++ return -1; /* map failed or dh is NULL - note this is a plain -1, not an -errno value */
++ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
++}
++
++int dmabuf_read_end(struct dmabuf_h * const dh)
++{
++ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ); /* end of a read access; 0 or -errno from dmabuf_sync() */
++}
++
++
++/* Map the buffer read/write, creating the mapping on first use and caching it. NULL on failure */
++void * dmabuf_map(struct dmabuf_h * const dh)
++{
++ void * va;
++ if (dh == NULL)
++ return NULL;
++ if ((va = dh->mapptr) == MAP_FAILED) {
++ va = mmap(NULL, dh->size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, dh->fd, 0);
++ dh->mapptr = va;
++ if (va == MAP_FAILED) {
++ request_log("%s: Map failed\n", __func__);
++ return NULL;
++ }
++ }
++ return va;
++}
++
++/* File descriptor behind dh */
++int dmabuf_fd(const struct dmabuf_h * const dh)
++{
++ /* A NULL handle reports the invalid fd -1 */
++ return dh ? dh->fd : -1;
++}
++
++/* Allocated size of dh in bytes */
++size_t dmabuf_size(const struct dmabuf_h * const dh)
++{
++ /* A NULL handle reports size 0 */
++ return dh ? dh->size : 0;
++}
++
++/* Bytes in use in dh, as last stored by dmabuf_len_set() */
++size_t dmabuf_len(const struct dmabuf_h * const dh)
++{
++ /* A NULL handle reports length 0 */
++ return dh ? dh->len : 0;
++}
++
++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
++{
++ if (dh) dh->len = len; /* record bytes in use; a NULL handle is ignored, matching the NULL-tolerant getters */
++}
++
++
++
++/* Unmap (if mapped), close and free dh. A NULL dh is a no-op */
++void dmabuf_free(struct dmabuf_h * dh)
++{
++ if (!dh)
++ return;
++
++#if TRACE_ALLOC
++ --total_bufs;
++ total_size -= dh->size;
++ request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++ if (dh->mapptr != MAP_FAILED)
++ munmap(dh->mapptr, dh->size);
++ /* Close exactly once: Linux releases the fd even when close() reports EINTR, so a retry could close a reused fd */
++ close(dh->fd);
++ free(dh);
++}
++
++struct dmabufs_ctl * dmabufs_ctl_new(void) /* open a dma-heap device and record the page size; NULL on failure */
++{
++ struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc));
++
++ if (!dbsc)
++ return NULL;
++
++ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 &&
++ errno == EINTR)
++ /* Retry while interrupted */;
++
++ if (dbsc->fd == -1) { /* first name could not be opened: fall back to the second */
++ while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 &&
++ errno == EINTR)
++ /* Retry while interrupted */;
++ if (dbsc->fd == -1) {
++ request_log("Unable to open either %s or %s\n",
++ DMABUF_NAME1, DMABUF_NAME2);
++ goto fail;
++ }
++ }
++
++ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); /* dmabuf_realloc() rounds sizes up to this; the sysconf result is not checked */
++
++ return dbsc;
++
++fail:
++ free(dbsc);
++ return NULL;
++}
++
++/* Close and free *pDbsc and set the caller's pointer to NULL. No-op if *pDbsc is already NULL */
++void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
++{
++ struct dmabufs_ctl * const dbsc = *pDbsc;
++
++ if (!dbsc)
++ return;
++ *pDbsc = NULL;
++
++ /* Close exactly once: Linux releases the fd even when close() reports EINTR, so a retry could close a reused fd */
++ close(dbsc->fd);
++ free(dbsc);
++}
++
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_dmabufs.h
+@@ -0,0 +1,38 @@
++#ifndef DMABUFS_H
++#define DMABUFS_H
++
++struct dmabufs_ctl;
++struct dmabuf_h;
++
++struct dmabufs_ctl * dmabufs_ctl_new(void);
++void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc);
++
++// Need not preserve old contents
++// On NULL return old buffer is freed
++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size);
++
++static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) {
++ return dmabuf_realloc(dbsc, NULL, size); // plain allocation: a realloc with no old buffer
++}
++/* Create from existing fd - dups(fd) */
++struct dmabuf_h * dmabuf_import(int fd, size_t size);
++void * dmabuf_map(struct dmabuf_h * const dh);
++
++/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */
++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags);
++
++int dmabuf_write_start(struct dmabuf_h * const dh);
++int dmabuf_write_end(struct dmabuf_h * const dh);
++int dmabuf_read_start(struct dmabuf_h * const dh);
++int dmabuf_read_end(struct dmabuf_h * const dh);
++
++int dmabuf_fd(const struct dmabuf_h * const dh);
++/* Allocated size */
++size_t dmabuf_size(const struct dmabuf_h * const dh);
++/* Bytes in use */
++size_t dmabuf_len(const struct dmabuf_h * const dh);
++/* Set bytes in use */
++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len);
++void dmabuf_free(struct dmabuf_h * dh);
++
++#endif
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v1.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 1
++#include "v4l2_req_hevc_vx.c"
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v2.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 2
++#include "v4l2_req_hevc_vx.c"
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v3.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 3
++#include "v4l2_req_hevc_vx.c"
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v4.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 4
++#include "v4l2_req_hevc_vx.c"
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_vx.c
+@@ -0,0 +1,1365 @@
++// File included by v4l2_req_hevc_v* - not compiled on its own
++
++#include "decode.h"
++#include "hevcdec.h"
++#include "hwconfig.h"
++
++#if HEVC_CTRLS_VERSION == 1
++#include "hevc-ctrls-v1.h"
++
++// Fixup renamed entries
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT
++
++#elif HEVC_CTRLS_VERSION == 2
++#include "hevc-ctrls-v2.h"
++#elif HEVC_CTRLS_VERSION == 3
++#include "hevc-ctrls-v3.h"
++#elif HEVC_CTRLS_VERSION == 4
++#include <linux/v4l2-controls.h>
++#if !defined(V4L2_CID_STATELESS_HEVC_SPS)
++#include "hevc-ctrls-v4.h"
++#endif
++#else
++#error Unknown HEVC_CTRLS_VERSION
++#endif
++
++#ifndef V4L2_CID_STATELESS_HEVC_SPS
++#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS
++#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS
++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS
++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX
++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS
++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE
++#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE
++
++#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED
++#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED
++#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE
++#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
++#endif
++
++// Should be in videodev2 but we might not have a good enough one
++#ifndef V4L2_PIX_FMT_HEVC_SLICE
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++#endif
++
++#include "v4l2_request_hevc.h"
++
++#include "libavutil/hwcontext_drm.h"
++
++#include <semaphore.h>
++#include <pthread.h>
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_utils.h"
++
++// Attached to buf[0] in frame
++// Pooled in hwcontext so generally create once - 1/frame
++typedef struct V4L2MediaReqDescriptor {
++ AVDRMFrameDescriptor drm; // NOTE(review): presumably first so frame->data[0] is also a valid AVDRMFrameDescriptor - confirm
++
++ // Media
++ uint64_t timestamp; // DPB timestamp of this frame; set in start_frame, read via frame_capture_dpb()
++ struct qent_dst * qe_dst; // allocated on first start_frame if NULL; waited on in frame_post_process()
++
++ // Decode only - should be NULL by the time we emit the frame
++ struct req_decode_ent decode_ent; // added to ctx->decode_q in start_frame
++
++ struct media_request *req;
++ struct qent_src *qe_src;
++
++#if HEVC_CTRLS_VERSION >= 2
++ struct v4l2_ctrl_hevc_decode_params dec;
++#endif
++
++ size_t num_slices; // entries in use; reset in start_frame, incremented by slice_add()
++ size_t alloced_slices; // capacity of both slice_params[] and slices[]
++ struct v4l2_ctrl_hevc_slice_params * slice_params;
++ struct slice_info * slices;
++
++ size_t num_offsets; // entries in use in offsets[]; advanced by offsets_add()
++ size_t alloced_offsets; // capacity of offsets[]
++ uint32_t *offsets; // values appended by offsets_add(), each stored minus 1
++
++} V4L2MediaReqDescriptor;
++
++struct slice_info { // one per slice; array kept in V4L2MediaReqDescriptor.slices, grown by slice_add()
++ const uint8_t * ptr;
++ size_t len; // bytes
++ size_t n_offsets; // NOTE(review): presumably the number of entry point offsets belonging to this slice - confirm
++};
++
++// Handy container for accumulating controls before setting
++struct req_controls {
++ int has_scaling; // non-zero: set_req_ctls() also sends scaling_matrix
++ struct timeval tv;
++ struct v4l2_ctrl_hevc_sps sps; // always sent by set_req_ctls()
++ struct v4l2_ctrl_hevc_pps pps; // always sent by set_req_ctls()
++ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix;
++};
++
++//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
++
++
++// Get an FFmpeg format from the v4l2 format
++static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format)
++{
++ // The fourcc sits in a different union member for multiplanar buffer types
++ const uint32_t fourcc = V4L2_TYPE_IS_MULTIPLANAR(format->type) ?
++ format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat;
++
++ if (fourcc == V4L2_PIX_FMT_YUV420)
++ return AV_PIX_FMT_YUV420P;
++ if (fourcc == V4L2_PIX_FMT_NV12)
++ return AV_PIX_FMT_NV12;
++#if CONFIG_SAND
++ if (fourcc == V4L2_PIX_FMT_NV12_COL128)
++ return AV_PIX_FMT_RPI4_8;
++ if (fourcc == V4L2_PIX_FMT_NV12_10_COL128)
++ return AV_PIX_FMT_RPI4_10;
++#endif
++ // No mapping known for any other fourcc
++ return AV_PIX_FMT_NONE;
++}
++
++static inline uint64_t frame_capture_dpb(const AVFrame * const frame)
++{
++ // data[0] is treated as the frame's V4L2MediaReqDescriptor; return its DPB timestamp
++ return ((const V4L2MediaReqDescriptor *)frame->data[0])->timestamp;
++}
++
++static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp)
++{
++ // data[0] is treated as the frame's V4L2MediaReqDescriptor; store its DPB timestamp
++ ((V4L2MediaReqDescriptor *)frame->data[0])->timestamp = dpb_stamp;
++}
++
++static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) // fill the V4L2 weighted prediction table from the current slice header
++{
++ int32_t luma_weight_denom, chroma_weight_denom;
++ const SliceHeader *sh = &h->sh;
++
++ if (sh->slice_type == HEVC_SLICE_I ||
++ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
++ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
++ return; // weighted prediction not used by this slice: table is left as the caller set it
++
++ table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
++
++ if (h->ps.sps->chroma_format_idc) // chroma delta is only written when chroma_format_idc is non-zero
++ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
++
++ luma_weight_denom = (1 << sh->luma_log2_weight_denom); // weights below are stored as deltas from these values
++ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
++
++ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) { // at most 15 entries are written per list
++ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
++ table->luma_offset_l0[i] = sh->luma_offset_l0[i];
++ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
++ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
++ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
++ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
++ }
++
++ if (sh->slice_type != HEVC_SLICE_B)
++ return; // L1 values are only filled in for B slices
++
++ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
++ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom;
++ table->luma_offset_l1[i] = sh->luma_offset_l1[i];
++ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom;
++ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom;
++ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0];
++ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1];
++ }
++}
++
++#if HEVC_CTRLS_VERSION <= 2
++static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
++{
++ // RPS lists in the order they are searched, each with the V4L2 value reported on a hit
++ static const struct {
++ int list;
++ int rps;
++ } lists[] = {
++ { ST_CURR_BEF, V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE },
++ { ST_CURR_AFT, V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER },
++ { LT_CURR, V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR },
++ };
++
++ for (unsigned int k = 0; k != FF_ARRAY_ELEMS(lists); k++) {
++ const int list = lists[k].list;
++
++ for (int i = 0; i < h->rps[list].nb_refs; i++) {
++ const HEVCFrame * const ref = h->rps[list].ref[i];
++ if (ref && timestamp == frame_capture_dpb(ref->frame))
++ return lists[k].rps;
++ }
++ }
++
++ // timestamp is in none of the three lists
++ return 0;
++}
++#endif
++
++static unsigned int
++get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, // index in entries[] of the entry carrying frame's DPB timestamp
++ const struct v4l2_hevc_dpb_entry * const entries,
++ const unsigned int num_entries)
++{
++ uint64_t timestamp;
++
++ if (!frame)
++ return 0; // no frame: falls back to index 0
++
++ timestamp = frame_capture_dpb(frame->frame);
++
++ for (unsigned int i = 0; i < num_entries; i++) {
++ if (entries[i].timestamp == timestamp)
++ return i;
++ }
++
++ return 0; // not found: also 0, so callers cannot tell this apart from a real match at index 0
++}
++
++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx) // advance b by idx counted bytes, also stepping over every 0x03 that follows two or more 0x00 bytes
++{
++ unsigned int z = 0; // length of the current run of zero bytes
++ while (idx--) {
++ if (*b++ == 0) {
++ ++z;
++ if (z >= 2 && *b == 3) { // 00 00 03: presumably an emulation prevention byte - skipped without consuming idx
++ ++b;
++ z = 0;
++ }
++ }
++ else {
++ z = 0;
++ }
++ }
++ return b;
++}
++
++static int slice_add(V4L2MediaReqDescriptor * const rd) // reserve one more slot in slice_params[]/slices[]; 0 or AVERROR(ENOMEM)
++{
++ if (rd->num_slices >= rd->alloced_slices) {
++ struct v4l2_ctrl_hevc_slice_params * p2;
++ struct slice_info * s2;
++ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2; // start at 8 slots, then double
++
++ p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
++ if (p2 == NULL)
++ return AVERROR(ENOMEM);
++ rd->slice_params = p2;
++
++ s2 = av_realloc_array(rd->slices, n2, sizeof(*s2));
++ if (s2 == NULL)
++ return AVERROR(ENOMEM); // slice_params stays enlarged but alloced_slices is unchanged, so both are regrown next time
++ rd->slices = s2;
++
++ rd->alloced_slices = n2;
++ }
++ ++rd->num_slices; // the new slot is index num_slices - 1; its contents are not initialised here
++ return 0;
++}
++
++// Append n entry point offsets (each stored minus 1) to rd->offsets, growing it as needed; 0 or AVERROR(ENOMEM)
++static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets)
++{
++ if (rd->num_offsets + n > rd->alloced_offsets) {
++ // Seed growth from the offsets capacity (this used alloced_slices, the capacity of a different array)
++ size_t n2 = rd->alloced_offsets == 0 ? 128 : rd->alloced_offsets * 2;
++ void * p2;
++ while (rd->num_offsets + n > n2) n2 *= 2; // keep doubling until the request fits
++ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL)
++ return AVERROR(ENOMEM);
++ rd->offsets = p2;
++ rd->alloced_offsets = n2;
++ }
++ for (size_t i = 0; i != n; ++i) rd->offsets[rd->num_offsets++] = offsets[i] - 1;
++ return 0;
++}
++
++static unsigned int
++fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) // returns the number of entries written
++{
++ unsigned int i;
++ unsigned int n = 0;
++ const HEVCFrame * const pic = h->ref;
++
++ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
++ const HEVCFrame * const frame = &h->DPB[i];
++ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { // every reference frame except the current picture
++ struct v4l2_hevc_dpb_entry * const entry = entries + n++; // NOTE(review): n is not bounded against the size of entries[] - confirm the DPB cannot hold more references than it has room for
++
++ entry->timestamp = frame_capture_dpb(frame->frame);
++#if HEVC_CTRLS_VERSION <= 2
++ entry->rps = find_frame_rps_type(h, entry->timestamp);
++#else
++ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
++ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
++#endif
++ entry->field_pic = frame->frame->interlaced_frame;
++
++#if HEVC_CTRLS_VERSION <= 3
++ /* TODO: Interleaved: Get the POC for each field. */
++ entry->pic_order_cnt[0] = frame->poc;
++ entry->pic_order_cnt[1] = frame->poc;
++#else
++ entry->pic_order_cnt_val = frame->poc;
++#endif
++ }
++ }
++ return n;
++}
++
++static void fill_slice_params(const HEVCContext * const h, // build one V4L2 slice_params control from the current slice header
++#if HEVC_CTRLS_VERSION >= 2
++ const struct v4l2_ctrl_hevc_decode_params * const dec,
++#endif
++ struct v4l2_ctrl_hevc_slice_params *slice_params,
++ uint32_t bit_size, uint32_t bit_offset)
++{
++ const SliceHeader * const sh = &h->sh;
++#if HEVC_CTRLS_VERSION >= 2
++ const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb; // v2+: DPB comes from the already filled decode params
++ const unsigned int dpb_n = dec->num_active_dpb_entries;
++#else
++ struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb; // v1: DPB is part of the slice params and is filled below
++ unsigned int dpb_n;
++#endif
++ unsigned int i;
++ RefPicList *rpl;
++
++ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { // fields not named here start out zeroed
++ .bit_size = bit_size,
++#if HEVC_CTRLS_VERSION <= 3
++ .data_bit_offset = bit_offset,
++#else
++ .data_byte_offset = bit_offset / 8 + 1,
++#endif
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ .slice_segment_addr = sh->slice_segment_addr,
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ .nal_unit_type = h->nal_unit_type,
++ .nuh_temporal_id_plus1 = h->temporal_id + 1,
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ .slice_type = sh->slice_type,
++ .colour_plane_id = sh->colour_plane_id,
++ .slice_pic_order_cnt = h->ref->poc,
++ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, // clamp so an empty list gives 0 rather than -1
++ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0,
++ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0,
++ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand,
++ .slice_qp_delta = sh->slice_qp_delta,
++ .slice_cb_qp_offset = sh->slice_cb_qp_offset,
++ .slice_cr_qp_offset = sh->slice_cr_qp_offset,
++ .slice_act_y_qp_offset = 0,
++ .slice_act_cb_qp_offset = 0,
++ .slice_act_cr_qp_offset = 0,
++ .slice_beta_offset_div2 = sh->beta_offset / 2,
++ .slice_tc_offset_div2 = sh->tc_offset / 2,
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ .pic_struct = h->sei.picture_timing.picture_struct,
++
++#if HEVC_CTRLS_VERSION < 2
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
++ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
++ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs,
++#endif
++ };
++
++ if (sh->slice_sample_adaptive_offset_flag[0])
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA;
++
++ if (sh->slice_sample_adaptive_offset_flag[1])
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA;
++
++ if (sh->slice_temporal_mvp_enabled_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED;
++
++ if (sh->mvd_l1_zero_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO;
++
++ if (sh->cabac_init_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT;
++
++ if (sh->collocated_list == L0) // tested for every slice type, not only when temporal MVP is enabled
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0;
++
++ if (sh->disable_deblocking_filter_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED;
++
++ if (sh->slice_loop_filter_across_slices_enabled_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED;
++
++ if (sh->dependent_slice_segment_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT;
++
++#if HEVC_CTRLS_VERSION < 2
++ dpb_n = fill_dpb_entries(h, dpb);
++ slice_params->num_active_dpb_entries = dpb_n;
++#endif
++
++ if (sh->slice_type != HEVC_SLICE_I) {
++ rpl = &h->ref->refPicList[0];
++ for (i = 0; i < rpl->nb_refs; i++)
++ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); // index into dpb[]; 0 also when the ref is not found
++ }
++
++ if (sh->slice_type == HEVC_SLICE_B) {
++ rpl = &h->ref->refPicList[1];
++ for (i = 0; i < rpl->nb_refs; i++)
++ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
++ }
++
++ fill_pred_table(h, &slice_params->pred_weight_table);
++
++ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
++#if HEVC_CTRLS_VERSION <= 3
++ if (slice_params->num_entry_point_offsets > 256) { // v1-v3 carry the offsets inline, capped here at 256
++ slice_params->num_entry_point_offsets = 256;
++ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
++ }
++
++ for (i = 0; i < slice_params->num_entry_point_offsets; i++)
++ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
++#endif
++}
++
++#if HEVC_CTRLS_VERSION >= 2
++static void
++fill_decode_params(const HEVCContext * const h, // build the per-picture V4L2 decode_params control (ctrls v2+)
++ struct v4l2_ctrl_hevc_decode_params * const dec)
++{
++ unsigned int i;
++
++ *dec = (struct v4l2_ctrl_hevc_decode_params){ // fields not named here start out zeroed
++ .pic_order_cnt_val = h->poc,
++ .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
++ .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
++ .num_poc_lt_curr = h->rps[LT_CURR].nb_refs,
++ };
++
++ dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb);
++
++ // The documentation does seem to ask that we fit our 32 bit signed POC into
++ // a U8 so... (To be fair 16 bits would be enough)
++ // Luckily we (Pi) don't use these fields
++ for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i) // NOTE(review): ref[i] is dereferenced without the NULL check find_frame_rps_type() makes - confirm it cannot be NULL here
++ dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc;
++ for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i)
++ dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc;
++ for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i)
++ dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc;
++
++ if (IS_IRAP(h))
++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC;
++ if (IS_IDR(h))
++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC;
++ if (h->sh.no_output_of_prior_pics_flag)
++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR;
++
++}
++#endif
++
++// Build the V4L2 SPS control from the parsed SPS; *ctrl is fully overwritten (unnamed fields zeroed), then flags are ORed in
++static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
++{
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ *ctrl = (struct v4l2_ctrl_hevc_sps) {
++ .chroma_format_idc = sps->chroma_format_idc,
++ .pic_width_in_luma_samples = sps->width,
++ .pic_height_in_luma_samples = sps->height,
++ .bit_depth_luma_minus8 = sps->bit_depth - 8,
++ .bit_depth_chroma_minus8 = sps->bit_depth - 8,
++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
++ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1,
++ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics,
++ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1,
++ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3,
++ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size,
++ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2,
++ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size,
++ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter,
++ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra,
++ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1,
++ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1,
++ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3,
++ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
++ .num_short_term_ref_pic_sets = sps->nb_st_rps,
++ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps,
++ .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1,
++ };
++
++ if (sps->separate_colour_plane_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE;
++
++ if (sps->scaling_list_enable_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED;
++
++ if (sps->amp_enabled_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED;
++
++ if (sps->sao_enabled)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET;
++
++ if (sps->pcm_enabled_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED;
++
++ if (sps->pcm.loop_filter_disable_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED;
++
++ if (sps->long_term_ref_pics_present_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT;
++
++ if (sps->sps_temporal_mvp_enabled_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED;
++
++ if (sps->sps_strong_intra_smoothing_enable_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED;
++}
++
++static void fill_scaling_matrix(const ScalingList * const sl, // copy the scaling lists in sl into the V4L2 scaling matrix control
++ struct v4l2_ctrl_hevc_scaling_matrix * const sm)
++{
++ unsigned int i;
++
++ for (i = 0; i < 6; i++) {
++ unsigned int j;
++
++ for (j = 0; j < 16; j++)
++ sm->scaling_list_4x4[i][j] = sl->sl[0][i][j];
++ for (j = 0; j < 64; j++) {
++ sm->scaling_list_8x8[i][j] = sl->sl[1][i][j];
++ sm->scaling_list_16x16[i][j] = sl->sl[2][i][j];
++ if (i < 2) // only two 32x32 lists are written, taken from source matrices 0 and 3
++ sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j];
++ }
++ sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i];
++ if (i < 2)
++ sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3];
++ }
++}
++
++static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps) // build the V4L2 PPS control from the parsed PPS; *ctrl is fully overwritten
++{
++ uint64_t flags = 0; // accumulated first because the compound literal below assigns the whole struct at once
++
++ if (pps->dependent_slice_segments_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED;
++
++ if (pps->output_flag_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT;
++
++ if (pps->sign_data_hiding_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED;
++
++ if (pps->cabac_init_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT;
++
++ if (pps->constrained_intra_pred_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED;
++
++ if (pps->transform_skip_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED;
++
++ if (pps->cu_qp_delta_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED;
++
++ if (pps->pic_slice_level_chroma_qp_offsets_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT;
++
++ if (pps->weighted_pred_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED;
++
++ if (pps->weighted_bipred_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED;
++
++ if (pps->transquant_bypass_enable_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED;
++
++ if (pps->tiles_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED;
++
++ if (pps->entropy_coding_sync_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED;
++
++ if (pps->loop_filter_across_tiles_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED;
++
++ if (pps->seq_loop_filter_across_slices_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED;
++
++ if (pps->deblocking_filter_override_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED;
++
++ if (pps->disable_dbf)
++ flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER;
++
++ if (pps->lists_modification_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT;
++
++ if (pps->slice_header_extension_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ *ctrl = (struct v4l2_ctrl_hevc_pps) {
++ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits,
++ .init_qp_minus26 = pps->pic_init_qp_minus26,
++ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth,
++ .pps_cb_qp_offset = pps->cb_qp_offset,
++ .pps_cr_qp_offset = pps->cr_qp_offset,
++ .pps_beta_offset_div2 = pps->beta_offset / 2,
++ .pps_tc_offset_div2 = pps->tc_offset / 2,
++ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2,
++ .flags = flags
++ };
++
++
++ if (pps->tiles_enabled_flag) { // tile geometry is only written when tiles are enabled; otherwise it stays zero
++ ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1;
++ ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1;
++
++ for (int i = 0; i < pps->num_tile_columns; i++) // NOTE(review): not bounded against the size of column_width_minus1[] - confirm the parser limits the count
++ ctrl->column_width_minus1[i] = pps->column_width[i] - 1;
++
++ for (int i = 0; i < pps->num_tile_rows; i++)
++ ctrl->row_height_minus1[i] = pps->row_height[i] - 1;
++ }
++}
++
++// Called before finally returning the frame to the user
++// Set corrupt flag here as this is actually the frame structure that
++// is going to the user (in MT land each thread has its own pool)
++static int frame_post_process(void *logctx, AVFrame *frame) // always returns 0; a failed decode is reported only through the corrupt flag
++{
++ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0];
++
++// av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
++ frame->flags &= ~AV_FRAME_FLAG_CORRUPT; // start clean; set again below if the wait reports a failure
++ if (rd->qe_dst) {
++ MediaBufsStatus stat = qent_dst_wait(rd->qe_dst); // NOTE(review): presumably blocks until the destination buffer is done - confirm
++ if (stat != MEDIABUFS_STATUS_SUCCESS) {
++ av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__);
++ frame->flags |= AV_FRAME_FLAG_CORRUPT;
++ }
++ }
++
++ return 0;
++}
++
++static inline struct timeval cvt_dpb_to_tv(uint64_t t)
++{
++ const uint64_t us = t / 1000; // a DPB stamp is 1000 times a microsecond count
++ struct timeval tv = { 0 };
++ tv.tv_sec = us / 1000000;
++ tv.tv_usec = us % 1000000;
++ return tv;
++}
++
++static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t)
++{
++ return 1000 * (uint64_t)t; // widen to 64 bits before scaling by 1000
++}
++
++static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, // per-frame setup: queue the decode entry, stamp the frame, make sure it has a dst buffer
++ av_unused const uint8_t *buffer,
++ av_unused uint32_t size)
++{
++ const HEVCContext *h = avctx->priv_data;
++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++// av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
++ decode_q_add(&ctx->decode_q, &rd->decode_ent);
++
++ rd->num_slices = 0; // NOTE(review): num_offsets is not reset here - confirm it is reset elsewhere
++ ctx->timestamp++; // each frame gets the next counter value as its DPB identity
++ rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp);
++
++ {
++ FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data;
++ fdd->post_process = frame_post_process; // frame_post_process() waits on qe_dst and sets the corrupt flag
++ }
++
++ // qe_dst needs to be bound to the data buffer and only returned when that is
++ if (!rd->qe_dst)
++ {
++ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
++ return AVERROR(ENOMEM); // NOTE(review): decode_ent was already queued above and is not removed on this path - confirm the caller cleans up
++ }
++ }
++
++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
++
++ return 0;
++}
++
++// Object fd & size will be zapped by this & need setting later
++static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format) // 0 on success, -1 for a pixel format not handled below
++{
++ AVDRMLayerDescriptor *layer = &desc->layers[0];
++ unsigned int width;
++ unsigned int height;
++ unsigned int bpl;
++ uint32_t pixelformat;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { // geometry lives in a different union member for multiplanar types
++ width = format->fmt.pix_mp.width;
++ height = format->fmt.pix_mp.height;
++ pixelformat = format->fmt.pix_mp.pixelformat;
++ bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline;
++ }
++ else {
++ width = format->fmt.pix.width;
++ height = format->fmt.pix.height;
++ pixelformat = format->fmt.pix.pixelformat;
++ bpl = format->fmt.pix.bytesperline;
++ }
++
++ switch (pixelformat) {
++ case V4L2_PIX_FMT_NV12:
++ layer->format = DRM_FORMAT_NV12;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#if CONFIG_SAND
++ case V4L2_PIX_FMT_NV12_COL128:
++ layer->format = DRM_FORMAT_NV12;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); // bytesperline is passed as the modifier's column height
++ break;
++ case V4L2_PIX_FMT_NV12_10_COL128:
++ layer->format = DRM_FORMAT_P030;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
++ break;
++#endif
++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED
++ case V4L2_PIX_FMT_SUNXI_TILED_NV12:
++ layer->format = DRM_FORMAT_NV12;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED;
++ break;
++#endif
++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15)
++ case V4L2_PIX_FMT_NV15:
++ layer->format = DRM_FORMAT_NV15;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#endif
++ case V4L2_PIX_FMT_NV16:
++ layer->format = DRM_FORMAT_NV16;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20)
++ case V4L2_PIX_FMT_NV20:
++ layer->format = DRM_FORMAT_NV20;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#endif
++ default:
++ return -1; // unsupported format: desc is left unmodified
++ }
++
++ desc->nb_objects = 1;
++ desc->objects[0].fd = -1; // fd and size are placeholders the caller must fill in later
++ desc->objects[0].size = 0;
++
++ desc->nb_layers = 1;
++ layer->nb_planes = 2; // every format handled above is described as two planes in one object
++
++ layer->planes[0].object_index = 0;
++ layer->planes[0].offset = 0;
++ layer->planes[0].pitch = bpl;
++#if CONFIG_SAND
++ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) {
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = height * 128;
++ layer->planes[0].pitch = width; // overrides the bpl pitch set above
++ layer->planes[1].pitch = width;
++ }
++ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = height * 128;
++ layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy
++ layer->planes[1].pitch = width * 2;
++ }
++ else
++#endif
++ {
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = layer->planes[0].pitch * height; // second plane directly follows the first
++ layer->planes[1].pitch = layer->planes[0].pitch;
++ }
++
++ return 0;
++}
++
++static int
++set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, // assemble the control array for one request and pass it to mediabufs_ctl_set_ext_ctrls()
++ struct req_controls *const controls,
++#if HEVC_CTRLS_VERSION >= 2
++ struct v4l2_ctrl_hevc_decode_params * const dec,
++#endif
++ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count,
++ void * const offsets, const size_t offset_count)
++{
++ int rv;
++#if HEVC_CTRLS_VERSION >= 2
++ unsigned int n = 3; // SPS, PPS and decode params are always present
++#else
++ unsigned int n = 2; // SPS and PPS are always present
++#endif
++
++ struct v4l2_ext_control control[6] = { // 6 = the fixed entries plus optional slices, scaling matrix and entry point offsets
++ {
++ .id = V4L2_CID_STATELESS_HEVC_SPS,
++ .ptr = &controls->sps,
++ .size = sizeof(controls->sps),
++ },
++ {
++ .id = V4L2_CID_STATELESS_HEVC_PPS,
++ .ptr = &controls->pps,
++ .size = sizeof(controls->pps),
++ },
++#if HEVC_CTRLS_VERSION >= 2
++ {
++ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
++ .ptr = dec,
++ .size = sizeof(*dec),
++ },
++#endif
++ };
++
++ if (slices) // optional: omitted when the caller passes no slice array
++ control[n++] = (struct v4l2_ext_control) {
++ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
++ .ptr = slices,
++ .size = sizeof(*slices) * slice_count,
++ };
++
++ if (controls->has_scaling)
++ control[n++] = (struct v4l2_ext_control) {
++ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
++ .ptr = &controls->scaling_matrix,
++ .size = sizeof(controls->scaling_matrix),
++ };
++
++#if HEVC_CTRLS_VERSION >= 4
++ if (offsets) // before v4 the offsets and offset_count arguments are ignored here
++ control[n++] = (struct v4l2_ext_control) {
++ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS,
++ .ptr = offsets,
++ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count, // element size taken from the descriptor's offsets[] type
++ };
++#endif
++
++ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n);
++
++ return rv;
++}
++
++// Record one slice (data pointer, length & slice params) in the frame descriptor; nothing is queued here.
++// This only works because we started out from a single coded frame buffer that will remain intact until after end_frame
++static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
++{
++ const HEVCContext * const h = avctx->priv_data;
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
++ int bcount = get_bits_count(&h->HEVClc->gb); // bits consumed so far by the slice header parser
++ uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; // bit offset handed to fill_slice_params; NOTE(review): ptr_from_index presumably corrects for escape bytes - confirm
++
++ const unsigned int n = rd->num_slices; // index this slice will occupy
++ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; // first slice of the max_slices-sized group that n falls in
++
++ int rv;
++ struct slice_info * si;
++
++ // This looks dodgy but we know that FFmpeg has parsed this from a buffer
++ // that contains the entire frame including the start code
++ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
++ buffer -= 3; // step back over the 3 start code bytes
++ size += 3;
++ boff += 24; // 3 bytes, in bits
++ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) {
++ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n",
++ buffer[0], buffer[1], buffer[2]);
++ } // logged only: decoding carries on regardless
++ }
++
++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
++ if (rd->slices == NULL) { // first slice of the frame: remember where the data starts
++ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL)
++ return AVERROR(ENOMEM);
++ rd->slices->ptr = buffer;
++ rd->num_slices = 1;
++ }
++ rd->slices->len = buffer - rd->slices->ptr + size; // extend the single entry to the end of this slice
++ return 0; // frame mode fills no per-slice params
++ }
++
++ if ((rv = slice_add(rd)) != 0)
++ return rv;
++
++ si = rd->slices + n;
++ si->ptr = buffer;
++ si->len = size;
++ si->n_offsets = rd->num_offsets; // index into rd->offsets where this slice's entry points begin
++
++ if (n != block_start) { // not first in its group: make everything relative to the group's first slice
++ struct slice_info *const si0 = rd->slices + block_start;
++ const size_t offset = (buffer - si0->ptr);
++ boff += offset * 8;
++ size += offset;
++ si0->len = si->len + offset; // the group's first entry now spans through the end of this slice
++ }
++
++#if HEVC_CTRLS_VERSION >= 2
++ if (n == 0) // decode params are filled once per frame, on its first slice
++ fill_decode_params(h, &rd->dec);
++ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff);
++#else
++ fill_slice_params(h, rd->slice_params + n, size * 8, boff);
++#endif
++ if (ctx->max_offsets != 0 && // max_offsets == 0 means entry point offsets are not in use
++ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0)
++ return rv;
++
++ return 0;
++}
++
++static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx)
++{
++    const HEVCContext * const hevc = avctx->priv_data;
++    V4L2RequestContextHEVC * ctx;
++    V4L2MediaReqDescriptor * rd;
++    if (hevc->ref == NULL)  // no current frame: nothing to abort
++        return;
++    ctx = avctx->internal->hwaccel_priv_data;
++    rd = (V4L2MediaReqDescriptor *)hevc->ref->frame->data[0];
++    media_request_abort(&rd->req);
++    mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src);
++    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++}
++
++static int send_slice(AVCodecContext * const avctx, // Queue slices [i, j) of rd as one media request; returns 0 or an AVERROR code
++ V4L2MediaReqDescriptor * const rd,
++ struct req_controls *const controls,
++ const unsigned int i, const unsigned int j)
++{
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++ const int is_last = (j == rd->num_slices); // this request covers the final slice of the frame
++ struct slice_info *const si = rd->slices + i; // ptr/len of the data to copy for this request
++ struct media_request * req = NULL;
++ struct qent_src * src = NULL;
++ MediaBufsStatus stat;
++ void * offsets = rd->offsets + rd->slices[i].n_offsets; // first entry point offset belonging to slice i
++ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; // offsets recorded for slices i..j-1
++
++ if ((req = media_request_get(ctx->mpool)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
++ return AVERROR(ENOMEM);
++ }
++
++ if (set_req_ctls(ctx, req,
++ controls,
++#if HEVC_CTRLS_VERSION >= 2
++ &rd->dec,
++#endif
++ rd->slice_params + i, j - i, // j - i slice param entries starting at slice i
++ offsets, n_offsets)) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
++ goto fail1;
++ }
++
++ if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__);
++ goto fail1;
++ }
++
++ if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__);
++ goto fail2;
++ }
++
++ if (qent_src_params_set(src, &controls->tv)) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__);
++ goto fail2;
++ }
++
++ stat = mediabufs_start_request(ctx->mbufs, &req, &src,
++ i == 0 ? rd->qe_dst : NULL, // the dst buffer is only passed with the first request of the frame
++ is_last);
++
++ if (stat != MEDIABUFS_STATUS_SUCCESS) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
++ return AVERROR_UNKNOWN; // NOTE(review): no abort here - presumably start_request disposes of req & src (passed by address) - confirm
++ }
++ return 0;
++
++fail2: // src buffer was obtained: give it back, then fall through
++ mediabufs_src_qent_abort(ctx->mbufs, &src);
++fail1: // only the media request is held
++ media_request_abort(&req);
++ return AVERROR_UNKNOWN;
++}
++
++static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) // Send the slices recorded for this frame & fill in its DRM descriptor
++{
++ const HEVCContext * const h = avctx->priv_data;
++ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++ struct req_controls rc; // controls shared by every request sent for this frame
++ unsigned int i;
++ int rv;
++
++ // It is possible, though maybe a bug, to get an end_frame without
++ // a previous start_frame. If we do then give up.
++ if (!decode_q_in_q(&rd->decode_ent)) {
++ av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
++ return AVERROR_INVALIDDATA;
++ }
++
++ {
++ const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ? // PPS list takes precedence over the SPS one
++ &h->ps.pps->scaling_list :
++ h->ps.sps->scaling_list_enable_flag ?
++ &h->ps.sps->scaling_list : NULL;
++
++
++ memset(&rc, 0, sizeof(rc)); // leaves has_scaling == 0 unless set below
++ rc.tv = cvt_dpb_to_tv(rd->timestamp);
++ fill_sps(&rc.sps, h->ps.sps);
++ fill_pps(&rc.pps, h->ps.pps);
++ if (sl) {
++ rc.has_scaling = 1;
++ fill_scaling_matrix(sl, &rc.scaling_matrix);
++ }
++ }
++
++ decode_q_wait(&ctx->decode_q, &rd->decode_ent); // every exit path below removes the entry from the Q again
++
++ // qe_dst needs to be bound to the data buffer and only returned when that is
++ // Alloc almost certainly wants to be serialised if there is any chance of blocking
++ // so we get the next frame to be free in the thread that needs it for decode first.
++ //
++ // In our current world this probably isn't a concern but put it here anyway
++ if (!rd->qe_dst)
++ {
++ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
++ rv = AVERROR(ENOMEM);
++ goto fail;
++ }
++ }
++
++ // Send as slices
++ for (i = 0; i < rd->num_slices; i += ctx->max_slices) { // one request per group of up to max_slices slices
++ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices); // one past the last slice in this group
++ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0)
++ goto fail;
++ }
++
++ // Set the drm_prime descriptor
++ drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); // return value is not checked
++ rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
++ rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
++
++ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++ return 0;
++
++fail:
++ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++ return rv;
++}
++
++static inline int
++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
++{
++    return !(v < c->minimum || v > c->maximum);   // 1 iff v lies in the control's inclusive [minimum, maximum] range
++}
++
++// Initial check & init: returns AVERROR(EINVAL) if the driver's HEVC controls don't fit this build (V HEVC_CTRLS_VERSION), else sets an initial SPS
++static int
++probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
++{
++    const HEVCContext *h = avctx->priv_data;
++    const HEVCSPS * const sps = h->ps.sps;
++    struct v4l2_ctrl_hevc_sps ctrl_sps;
++    unsigned int i;
++
++    // Controls to query: an entry whose .type is still 0 after the query is treated as missing
++    struct v4l2_query_ext_ctrl qc[] = {
++        { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
++        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++        { .id = V4L2_CID_STATELESS_HEVC_SPS },
++        { .id = V4L2_CID_STATELESS_HEVC_PPS },
++        { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
++#if HEVC_CTRLS_VERSION >= 2
++        { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS },
++#endif
++    };
++    // Order & size must match!
++    static const size_t ctrl_sizes[] = {
++        sizeof(struct v4l2_ctrl_hevc_slice_params),
++        sizeof(int32_t),
++        sizeof(struct v4l2_ctrl_hevc_sps),
++        sizeof(struct v4l2_ctrl_hevc_pps),
++        sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
++#if HEVC_CTRLS_VERSION >= 2
++        sizeof(struct v4l2_ctrl_hevc_decode_params),
++#endif
++    };
++    const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
++
++#if HEVC_CTRLS_VERSION == 2
++    if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0))
++        return AVERROR(EINVAL);
++#elif HEVC_CTRLS_VERSION == 3
++    if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0))
++        return AVERROR(EINVAL);
++#endif
++
++    mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls);
++    i = 0;
++#if HEVC_CTRLS_VERSION >= 4
++    // Skip slice check if no slice mode
++    if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++        i = 1;
++#else
++    // Fail frame mode silently for anything prior to V4
++    if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++        return AVERROR(EINVAL);
++#endif
++    for (; i != noof_ctrls; ++i) {
++        if (qc[i].type == 0) {
++            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id);
++            return AVERROR(EINVAL);
++        }
++        if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
++            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %u size mismatch %zu != %zu\n", // %u: i is unsigned
++                   HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
++            return AVERROR(EINVAL);
++        }
++    }
++
++    fill_sps(&ctrl_sps, sps);
++
++    if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
++        return AVERROR(EINVAL);
++    }
++
++    return 0;
++}
++
++// Final init: pick max_slices, max_offsets, decode mode & start code from what the driver reports, then set the last two on the device
++static int
++set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
++{
++ int ret;
++
++ struct v4l2_query_ext_ctrl querys[] = { // indices [0]..[3] below rely on this order
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
++#if HEVC_CTRLS_VERSION >= 4
++ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, },
++#endif
++ };
++
++ struct v4l2_ext_control ctrls[] = { // values are filled in once the choices are made
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
++ };
++
++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); // return value is not checked
++
++ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || // 1 unless slice params is a 1-D dynamic array with a non-zero dimension
++ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
++ 1 : querys[2].dims[0];
++ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
++
++#if HEVC_CTRLS_VERSION >= 4
++ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? // 0 (= offsets unused) if the control is absent or not 1-D
++ 0 : querys[3].dims[0];
++ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
++#else
++ ctx->max_offsets = 0;
++#endif
++
++ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || // prefer the driver's default if it is a mode we know
++ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)
++ ctx->decode_mode = querys[0].default_value;
++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)) // otherwise frame based is tried before slice based
++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED;
++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
++ else {
++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
++ return AVERROR(EINVAL);
++ }
++
++ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || // same scheme: known default first
++ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)
++ ctx->start_code = querys[1].default_value;
++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) // otherwise Annex B is tried before none
++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
++ else {
++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
++ return AVERROR(EINVAL);
++ }
++
++ // If we are in slice mode & START_CODE_NONE supported then pick that
++ // as it doesn't require the slightly dodgy look backwards in our raw buffer
++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
++ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
++
++ ctrls[0].value = ctx->decode_mode;
++ ctrls[1].value = ctx->start_code;
++
++ ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls)); // NULL: not tied to a media request
++ return !ret ? 0 : AVERROR(-ret); // NOTE(review): assumes ret is a negative errno value - confirm against mediabufs_ctl_set_ext_ctrls
++}
++
++// AVBuffer free callback installed by v4l2_req_frame_alloc: 'data' is the V4L2MediaReqDescriptor
++// and 'opaque' the AVCodecContext. Drops the dst buffer ref and frees the descriptor & its arrays.
++static void v4l2_req_frame_free(void *opaque, uint8_t *data)
++{
++    AVCodecContext *avctx = opaque;
++    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
++
++    av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
++
++    qent_dst_unref(&rd->qe_dst);
++    // We don't expect req or qe_src to be set (arguments are in the order the message names them)
++    if (rd->req || rd->qe_src)
++        av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->qe_src, rd->req);
++
++    av_freep(&rd->slices);
++    av_freep(&rd->slice_params);
++    av_freep(&rd->offsets);
++    av_free(rd);
++}
++
++// Allocate a zeroed block of 'size' bytes and wrap it in an AVBufferRef whose
++// free callback is v4l2_req_frame_free. 'opaque' is the AVCodecContext and is
++// passed on to that callback. Returns NULL if either allocation fails.
++static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)
++{
++    AVCodecContext * const avctx = opaque;
++    AVBufferRef *buf;
++    uint8_t * const mem = av_mallocz(size);
++
++    if (mem == NULL)
++        return NULL;
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, mem);
++
++    // Once created the AVBufferRef is responsible for releasing mem
++    buf = av_buffer_create(mem, size, v4l2_req_frame_free, avctx, 0);
++    if (buf == NULL)
++        av_free(mem);   // creation failed so the block is still ours to free
++
++    return buf;
++}
++
++#if 0 // Compiled out: these two callbacks are only referenced from the matching #if 0 section of frame_params
++static void v4l2_req_pool_free(void *opaque)
++{
++ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
++}
++
++static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
++{
++ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
++
++ av_buffer_pool_uninit(&hwfc->pool);
++}
++#endif
++
++static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) // Fill the frames context from the current dst format; always returns 0
++{
++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++ AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
++ const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs);
++
++ hwfc->format = AV_PIX_FMT_DRM_PRIME;
++ hwfc->sw_format = pixel_format_from_format(vfmt);
++ if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) { // width/height live in a different union member for multiplanar types
++ hwfc->width = vfmt->fmt.pix_mp.width;
++ hwfc->height = vfmt->fmt.pix_mp.height;
++ } else {
++ hwfc->width = vfmt->fmt.pix.width;
++ hwfc->height = vfmt->fmt.pix.height;
++ }
++#if 0 // Disabled: buffer pool setup (frames are allocated individually by alloc_frame instead)
++ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
++ if (!hwfc->pool)
++ return AVERROR(ENOMEM);
++
++ hwfc->free = v4l2_req_hwframe_ctx_free;
++
++ hwfc->initial_pool_size = 1;
++
++ switch (avctx->codec_id) {
++ case AV_CODEC_ID_VP9:
++ hwfc->initial_pool_size += 8;
++ break;
++ case AV_CODEC_ID_VP8:
++ hwfc->initial_pool_size += 3;
++ break;
++ default:
++ hwfc->initial_pool_size += 2;
++ }
++#endif
++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
++
++ return 0;
++}
++
++// Give frame a zeroed V4L2MediaReqDescriptor (buf[0]/data[0]), a ref on avctx->hw_frames_ctx and decode data; returns 0 or an AVERROR code
++static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{
++    int rv = AVERROR(ENOMEM);
++
++    frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor));
++    if (!frame->buf[0])
++        return rv;
++    frame->data[0] = frame->buf[0]->data;
++    // av_buffer_ref() returns NULL if it cannot allocate: fail rather than hand back a frame with no frames context
++    frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
++    if (!frame->hw_frames_ctx)
++        goto fail;
++    if ((rv = ff_attach_decode_data(frame)) == 0)
++        return 0;
++    av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
++fail:
++    av_frame_unref(frame);   // releases buf[0] and, if it was set, hw_frames_ctx
++    return rv;
++}
++
++const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = { // Function table for this control version; V() & STR() are defined outside this chunk
++ .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
++ .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION),
++ .probe = probe, // initial check & init
++ .set_controls = set_controls, // final init
++
++ .start_frame = v4l2_request_hevc_start_frame,
++ .decode_slice = v4l2_request_hevc_decode_slice, // records a slice; nothing is queued until end_frame
++ .end_frame = v4l2_request_hevc_end_frame, // sends the recorded slices
++ .abort_frame = v4l2_request_hevc_abort_frame,
++ .frame_params = frame_params,
++ .alloc_frame = alloc_frame,
++};
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_media.c
+@@ -0,0 +1,1601 @@
++/*
++ * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include <errno.h>
++#include <fcntl.h>
++#include <poll.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++#include <linux/media.h>
++#include <sys/ioctl.h>
++#include <sys/select.h>
++#include <sys/ioctl.h>
++
++#include <linux/videodev2.h>
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++#include "weak_link.h"
++
++
++/*
++ * floor(log2(x)); returns 0 for x == 0 as well as for x == 1.
++ *
++ * Binary search for the highest set bit: each pass asks whether any bit at
++ * or above position 'shift' is set and, if so, drops the low 'shift' bits
++ * and adds 'shift' to the result. The first shift is half the width of
++ * size_t so values that need more than 32 bits are handled too.
++ */
++static unsigned int log2_size(size_t x)
++{
++    unsigned int n = 0;
++    unsigned int shift;
++
++    /* shift is always less than the width of x so x >> shift is well defined */
++    for (shift = sizeof(x) * 4; shift != 0; shift >>= 1) {
++        if ((x >> shift) != 0) {
++            n += shift;
++            x >>= shift;
++        }
++    }
++
++    return n;
++}
++
++static size_t round_up_size(const size_t x)
++{
++    /* Admit no size < 256. Result is the next 3 * 2^n or 4 * 2^n step above x */
++    const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
++    /* Shift as size_t: as plain int, 4 << n overflows once x reaches 1.5 GiB */
++    return x >= ((size_t)3 << n) ? (size_t)4 << n : ((size_t)3 << n);
++}
++
++struct media_request;
++
++struct media_pool { // Fixed-size pool of reusable media requests
++ int fd; // media device fd, from open(media_path) in media_pool_new
++ sem_t sem; // counts the requests currently available on free_reqs
++ pthread_mutex_t lock; // guards free_reqs
++ struct media_request * free_reqs; // chain (linked via ->next) of idle requests
++ struct pollqueue * pq; // poll queue passed to media_pool_new
++};
++
++struct media_request { // One request fd plus the bookkeeping to return it to its pool
++ struct media_request * next; // link on the pool's free chain; NULL while handed out
++ struct media_pool * mp; // owning pool
++ int fd; // request fd from MEDIA_IOC_REQUEST_ALLOC, -1 until allocated
++ struct polltask * pt; // polls fd for POLLPRI and then runs media_request_done
++};
++
++
++static inline int do_trywait(sem_t *const sem)
++{
++    int rc;
++    /* Non-blocking take: retry only when a signal interrupted the call */
++    while ((rc = sem_trywait(sem)) != 0 && errno == EINTR)
++        /* again */;
++    return rc == 0 ? 0 : -errno;
++}
++
++static inline int do_wait(sem_t *const sem)
++{
++    int rc;
++    do {    /* blocking take: go round again if a signal interrupted the wait */
++        rc = sem_wait(sem) == 0 ? 0 : -errno;
++    } while (rc == -EINTR);
++    return rc;
++}
++
++static int request_buffers(int video_fd, unsigned int type,
++                           enum v4l2_memory memory, unsigned int buffers_count)
++{
++    /* VIDIOC_REQBUFS argument: members not named here are zero */
++    struct v4l2_requestbuffers reqbufs = {
++        .count  = buffers_count,
++        .type   = type,
++        .memory = memory
++    };
++
++    if (ioctl(video_fd, VIDIOC_REQBUFS, &reqbufs) < 0) {
++        /* Copy errno so that the value logged is the value returned (negated) */
++        const int err = errno;
++
++        request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(err));
++        return -err;
++    }
++
++    return 0;
++}
++
++
++static int set_stream(int video_fd, unsigned int type, bool enable)
++{
++    /* STREAMON/STREAMOFF take a pointer to the buffer type */
++    enum v4l2_buf_type buf_type = type;
++
++    if (ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF, &buf_type) < 0) {
++        /* Copy errno so that the value logged is the value returned (negated) */
++        const int err = errno;
++
++        request_log("Unable to %sable stream: %s\n",
++                    enable ? "en" : "dis", strerror(err));
++        return -err;
++    }
++
++    return 0;
++}
++
++
++
++struct media_request * media_request_get(struct media_pool * const mp)
++{
++    struct media_request *req;
++
++    /* Timeout handled by poll code */
++    if (do_wait(&mp->sem) != 0)
++        return NULL;
++
++    pthread_mutex_lock(&mp->lock);
++    if ((req = mp->free_reqs) != NULL) {
++        /* Unlink the head of the free chain */
++        mp->free_reqs = req->next;
++        req->next = NULL;
++    }
++    pthread_mutex_unlock(&mp->lock);
++    return req;
++}
++
++int media_request_fd(const struct media_request * const req) // Accessor: the request's fd (used as request_fd when queuing buffers)
++{
++ return req->fd;
++}
++
++int media_request_start(struct media_request * const req)
++{
++    /* Queue the request, retrying if a signal interrupts the ioctl */
++    while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1) {
++        const int err = errno;
++        if (err != EINTR) {
++            request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
++            return -err;
++        }
++    }
++
++    pollqueue_add_task(req->pt, 2000);  /* 2000 = poll timeout; units are defined by pollqueue (not visible here) */
++    return 0;
++}
++
++static void media_request_done(void *v, short revents) // Polltask callback (also called directly by media_request_abort): recycle the request
++{
++ struct media_request *const req = v;
++ struct media_pool *const mp = req->mp;
++
++ /* ** Not sure what to do about timeout */
++
++ if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0) // failure is logged only; the request still goes back on the free chain
++ request_log("Unable to reinit media request: %s\n",
++ strerror(errno));
++
++ pthread_mutex_lock(&mp->lock);
++ req->next = mp->free_reqs; // push onto the head of the free chain
++ mp->free_reqs = req;
++ pthread_mutex_unlock(&mp->lock);
++ sem_post(&mp->sem); // lets one waiter in media_request_get proceed
++}
++
++int media_request_abort(struct media_request ** const preq)
++{
++    struct media_request * const req = *preq;
++
++    if (req != NULL) {
++        /* Clear the caller's pointer, then recycle the request into its pool */
++        *preq = NULL;
++        media_request_done(req, 0);
++    }
++    return 0;
++}
++
++static void delete_req_chain(struct media_request * const chain)
++{
++    struct media_request * req;
++    struct media_request * next;
++    for (req = chain; req != NULL; req = next) {
++        next = req->next;   /* fetch before req is freed */
++        if (req->pt)
++            polltask_delete(&req->pt);
++        if (req->fd != -1)
++            close(req->fd);
++        free(req);
++    }
++}
++
++/* Create a pool of n media requests on the media device at media_path. Each request gets a polltask on
++ * pq whose callback (media_request_done) returns it to the pool. Returns NULL on any failure. */
++struct media_pool * media_pool_new(const char * const media_path,
++                                   struct pollqueue * const pq, const unsigned int n)
++{
++    struct media_pool * const mp = calloc(1, sizeof(*mp));
++    unsigned int i;
++
++    if (!mp)
++        goto fail0;
++
++    mp->pq = pq;
++    pthread_mutex_init(&mp->lock, NULL);
++    mp->fd = open(media_path, O_RDWR | O_NONBLOCK);
++    if (mp->fd == -1) {
++        request_log("Failed to open '%s': %s\n", media_path, strerror(errno));
++        goto fail1;
++    }
++
++    for (i = 0; i != n; ++i) {
++        struct media_request * req = malloc(sizeof(*req));
++        if (!req)
++            goto fail4;
++
++        *req = (struct media_request){
++            .next = mp->free_reqs,
++            .mp = mp,
++            .fd = -1
++        };
++        mp->free_reqs = req;    /* chained before the fallible steps so that fail4 frees it */
++
++        if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) {
++            request_log("Failed to alloc request %d: %s\n", i, strerror(errno));
++            goto fail4;
++        }
++
++        req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req);
++        if (!req->pt)
++            goto fail4;
++    }
++
++    sem_init(&mp->sem, 0, n);   /* one count per request on the free chain */
++
++    return mp;
++
++fail4:
++    delete_req_chain(mp->free_reqs);
++    close(mp->fd);
++fail1:  /* the mutex is initialised before open() so it must be destroyed on this path too */
++    pthread_mutex_destroy(&mp->lock);
++    free(mp);
++fail0:
++    return NULL;
++}
++
++void media_pool_delete(struct media_pool ** pMp) // Destroy *pMp (no-op if it is NULL) and clear the caller's pointer
++{
++ struct media_pool * const mp = *pMp;
++
++ if (!mp)
++ return;
++ *pMp = NULL;
++
++ delete_req_chain(mp->free_reqs); // only requests on the free chain are released here
++ close(mp->fd);
++ sem_destroy(&mp->sem);
++ pthread_mutex_destroy(&mp->lock);
++ free(mp);
++}
++
++
++#define INDEX_UNSET (~(uint32_t)0)
++
++enum qent_status { // State of a queue entry (struct qent_base.status)
++ QENT_NEW = 0, // Initial state - shouldn't last
++ QENT_FREE, // On free chain (set by queue_put_free)
++ QENT_PENDING, // User has ent
++ QENT_WAITING, // On inuse (set by queue_put_inuse)
++ QENT_DONE, // Frame rx
++ QENT_ERROR, // Error
++ QENT_IMPORT // NOTE(review): not set anywhere in this chunk - meaning to be confirmed
++};
++
++struct qent_base { // Common part of src & dst queue entries; must stay the first member of both (see base_to_src/base_to_dst)
++ atomic_int ref_count;
++ struct qent_base *next; // links for the doubly linked qe_list_head lists
++ struct qent_base *prev;
++ enum qent_status status;
++ uint32_t index; // V4L2 buffer index used when queuing; INDEX_UNSET initially
++ struct dmabuf_h *dh[VIDEO_MAX_PLANES]; // one dmabuf handle per plane; unused slots are NULL
++ struct timeval timestamp; // copied into v4l2_buffer.timestamp by qe_v4l2_queue
++};
++
++struct qent_src { // Source (bitstream) queue entry
++ struct qent_base base; // must be first: base_to_src casts between the two
++ int fixed_size; // NOTE(review): not used in this chunk - meaning to be confirmed
++};
++
++struct qent_dst { // Destination (decoded frame) queue entry
++ struct qent_base base; // must be first: base_to_dst casts between the two
++ bool waiting; // NOTE(review): presumably used with lock/cond below - users are outside this chunk
++ pthread_mutex_t lock;
++ pthread_cond_t cond;
++ struct ff_weak_link_client * mbc_wl; // weak link ref taken in qe_dst_new, dropped in qe_dst_free
++};
++
++struct qe_list_head { // Doubly linked list of qent_base; both pointers are NULL when empty
++ struct qent_base *head; // oldest entry (ql_add_tail appends at the other end)
++ struct qent_base *tail;
++};
++
++struct buf_pool { // A free list & an in-use list of queue entries
++ pthread_mutex_t lock; // taken by the queue_* functions around list changes
++ sem_t free_sem; // posted once per entry put on the free list, waited on before taking one
++ enum v4l2_buf_type buf_type; // not set by queue_new (left zero from calloc)
++ struct qe_list_head free;
++ struct qe_list_head inuse;
++};
++
++
++static inline struct qent_dst *base_to_dst(struct qent_base *be) // Downcast; valid because base is the first member of struct qent_dst
++{
++ return (struct qent_dst *)be;
++}
++
++static inline struct qent_src *base_to_src(struct qent_base *be) // Downcast; valid because base is the first member of struct qent_src
++{
++ return (struct qent_src *)be;
++}
++
++
++#define QENT_BASE_INITIALIZER {\
++ .ref_count = ATOMIC_VAR_INIT(0),\
++ .status = QENT_NEW,\
++ .index = INDEX_UNSET\
++}
++
++static void qe_base_uninit(struct qent_base *const be)
++{
++    struct dmabuf_h ** slot;
++    for (slot = be->dh; slot != be->dh + VIDEO_MAX_PLANES; ++slot) {    /* every plane slot, set or not */
++        dmabuf_free(*slot);
++        *slot = NULL;
++    }
++}
++
++static void qe_src_free(struct qent_src *const be_src)
++{
++    if (be_src != NULL) {   /* NULL is accepted and ignored */
++        qe_base_uninit(&be_src->base);
++        free(be_src);
++    }
++}
++
++static struct qent_src * qe_src_new(void)
++{
++    struct qent_src * qe;
++
++    /* Members the initialiser does not name are zeroed */
++    if ((qe = malloc(sizeof(*qe))) != NULL)
++        *qe = (struct qent_src){ .base = QENT_BASE_INITIALIZER };
++
++    return qe;
++}
++
++static void qe_dst_free(struct qent_dst *const be_dst)
++{
++    if (be_dst != NULL) {
++        /* Weak link ref first, then the sync objects, then the dmabufs */
++        ff_weak_link_unref(&be_dst->mbc_wl);
++        pthread_cond_destroy(&be_dst->cond);
++        pthread_mutex_destroy(&be_dst->lock);
++        qe_base_uninit(&be_dst->base);
++        free(be_dst);
++    }
++}
++
++static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl)
++{
++    struct qent_dst * qe = malloc(sizeof(*qe));
++    /* The ref on wl is only taken once the allocation has succeeded */
++    if (qe != NULL)
++        *qe = (struct qent_dst){
++            .base = QENT_BASE_INITIALIZER,
++            .lock = PTHREAD_MUTEX_INITIALIZER,
++            .cond = PTHREAD_COND_INITIALIZER,
++            .mbc_wl = ff_weak_link_ref(wl)
++        };
++    return qe;
++}
++
++static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be) // Append be to the end of ql; no locking here
++{
++ if (ql->tail)
++ ql->tail->next = be;
++ else
++ ql->head = be; // list was empty so be is also the head
++ be->prev = ql->tail;
++ be->next = NULL;
++ ql->tail = be;
++}
++
++static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be) // Unlink be from ql and return it; NULL in gives NULL out
++{
++ if (!be)
++ return NULL;
++
++ if (be->next)
++ be->next->prev = be->prev;
++ else
++ ql->tail = be->prev; // be was the tail
++ if (be->prev)
++ be->prev->next = be->next;
++ else
++ ql->head = be->next; // be was the head
++ be->next = NULL;
++ be->prev = NULL;
++ return be;
++}
++
++
++static void bq_put_free(struct buf_pool *const bp, struct qent_base * be) // Append be to the free list; takes no lock itself
++{
++ ql_add_tail(&bp->free, be);
++}
++
++static struct qent_base * bq_get_free(struct buf_pool *const bp) // Remove & return the head of the free list (NULL if empty); takes no lock itself
++{
++ return ql_extract(&bp->free, bp->free.head);
++}
++
++static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be) // Unlink be from the in-use list; takes no lock itself
++{
++ return ql_extract(&bp->inuse, be);
++}
++
++static struct qent_base * bq_get_inuse(struct buf_pool *const bp) // Remove & return the head of the in-use list (NULL if empty); takes no lock itself
++{
++ return ql_extract(&bp->inuse, bp->inuse.head);
++}
++
++static void bq_free_all_free_src(struct buf_pool *const bp)
++{
++    struct qent_base *ent;  /* drain the free list, freeing each entry as a src entry */
++    for (ent = bq_get_free(bp); ent != NULL; ent = bq_get_free(bp))
++        qe_src_free(base_to_src(ent));
++}
++
++static void bq_free_all_inuse_src(struct buf_pool *const bp)
++{
++    struct qent_base *ent;  /* drain the in-use list, freeing each entry as a src entry */
++    for (ent = bq_get_inuse(bp); ent != NULL; ent = bq_get_inuse(bp))
++        qe_src_free(base_to_src(ent));
++}
++
++static void bq_free_all_free_dst(struct buf_pool *const bp)
++{
++    struct qent_base *ent;  /* drain the free list, freeing each entry as a dst entry */
++    for (ent = bq_get_free(bp); ent != NULL; ent = bq_get_free(bp))
++        qe_dst_free(base_to_dst(ent));
++}
++
++static void queue_put_free(struct buf_pool *const bp, struct qent_base *be) // Reset be, put it on the free list & signal a waiter
++{
++ unsigned int i;
++
++ pthread_mutex_lock(&bp->lock);
++ /* Clear out state vars */
++ be->timestamp.tv_sec = 0;
++ be->timestamp.tv_usec = 0;
++ be->status = QENT_FREE;
++ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) // stops at the first empty plane slot
++ dmabuf_len_set(be->dh[i], 0);
++ bq_put_free(bp, be);
++ pthread_mutex_unlock(&bp->lock);
++ sem_post(&bp->free_sem); // pairs with the wait in queue_get_free / queue_tryget_free
++}
++
++static bool queue_is_inuse(const struct buf_pool *const bp) // True if the in-use list is non-empty; read without taking bp->lock
++{
++ return bp->inuse.tail != NULL;
++}
++
++static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
++{
++    if (be != NULL) {   /* NULL is accepted and ignored */
++        pthread_mutex_lock(&bp->lock);
++        ql_add_tail(&bp->inuse, be);
++        be->status = QENT_WAITING;
++        pthread_mutex_unlock(&bp->lock);
++    }
++}
++
++static struct qent_base *queue_get_free(struct buf_pool *const bp)
++{
++    struct qent_base *ent = NULL;
++    /* Blocks until an entry has been posted to the free list */
++    if (do_wait(&bp->free_sem) == 0) {
++        pthread_mutex_lock(&bp->lock);
++        ent = bq_get_free(bp);
++        pthread_mutex_unlock(&bp->lock);
++    }
++    return ent;
++}
++
++static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
++{
++    struct qent_base *ent = NULL;
++    /* Non-blocking: gives NULL straight away if the semaphore cannot be taken */
++    if (do_trywait(&bp->free_sem) == 0) {
++        pthread_mutex_lock(&bp->lock);
++        ent = bq_get_free(bp);
++        pthread_mutex_unlock(&bp->lock);
++    }
++    return ent;
++}
++
++static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd) // Find & unlink the in-use entry whose plane 0 dmabuf has this fd
++{
++ struct qent_base *be;
++
++ pthread_mutex_lock(&bp->lock);
++ /* Expect 1st in Q, but allow anywhere */
++ for (be = bp->inuse.head; be; be = be->next) {
++ if (dmabuf_fd(be->dh[0]) == fd) {
++ bq_extract_inuse(bp, be);
++ break;
++ }
++ }
++ pthread_mutex_unlock(&bp->lock);
++
++ return be; // NULL if the loop ran off the end without a match
++}
++
++static void queue_delete(struct buf_pool *const bp) // Destroy the pool object itself; entries still on its lists are not freed here
++{
++ sem_destroy(&bp->free_sem);
++ pthread_mutex_destroy(&bp->lock);
++ free(bp);
++}
++
++static struct buf_pool* queue_new(const int vfd) // New empty pool, or NULL if allocation fails; vfd is currently unused
++{
++ struct buf_pool *bp = calloc(1, sizeof(*bp)); // zeroed: both lists start empty
++ if (!bp)
++ return NULL;
++ pthread_mutex_init(&bp->lock, NULL);
++ sem_init(&bp->free_sem, 0, 0); // count 0: nothing on the free list yet
++ return bp;
++}
++
++
++struct mediabufs_ctl { // Buffer control state; the code that uses it lies outside this chunk, so member notes are hedged
++ atomic_int ref_count; /* 0 is single ref for easier atomics */
++ void * dc;
++ int vfd; // NOTE(review): presumably the V4L2 video device fd - confirm
++ bool stream_on;
++ bool polling;
++ bool dst_fixed; // Dst Q is fixed size
++ pthread_mutex_t lock;
++ struct buf_pool * src; // pool of source entries
++ struct buf_pool * dst; // pool of destination entries
++ struct polltask * pt;
++ struct pollqueue * pq;
++ struct ff_weak_link_master * this_wlm; // NOTE(review): presumably the master that qent_dst.mbc_wl refs point at - confirm
++
++ struct v4l2_format src_fmt;
++ struct v4l2_format dst_fmt;
++ struct v4l2_capability capability;
++};
++
++static int qe_v4l2_queue(struct qent_base *const be, // Queue be's dmabuf(s) on vfd with VIDIOC_QBUF; returns 0 or -errno
++ const int vfd, struct media_request *const mreq,
++ const struct v4l2_format *const fmt,
++ const bool is_dst, const bool hold_flag)
++{
++ struct v4l2_buffer buffer = {
++ .type = fmt->type,
++ .memory = V4L2_MEMORY_DMABUF,
++ .index = be->index
++ };
++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ unsigned int i;
++ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) { // one v4l2_plane per dmabuf handle that is set
++ if (is_dst)
++ dmabuf_len_set(be->dh[i], 0); // dst buffers are queued with no bytes used
++
++ /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
++ planes[i].length = dmabuf_size(be->dh[i]);
++ planes[i].bytesused = dmabuf_len(be->dh[i]);
++ planes[i].m.fd = dmabuf_fd(be->dh[i]);
++ }
++ buffer.m.planes = planes;
++ buffer.length = i; // number of planes filled in
++ }
++ else {
++ if (is_dst)
++ dmabuf_len_set(be->dh[0], 0);
++
++ buffer.bytesused = dmabuf_len(be->dh[0]);
++ buffer.length = dmabuf_size(be->dh[0]);
++ buffer.m.fd = dmabuf_fd(be->dh[0]);
++ }
++
++ if (!is_dst && mreq) { // only src buffers are attached to the media request
++ buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
++ buffer.request_fd = media_request_fd(mreq);
++ if (hold_flag)
++ buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
++ }
++
++ if (is_dst)
++ be->timestamp = (struct timeval){0,0}; // dst entries are queued with a zero timestamp
++
++ buffer.timestamp = be->timestamp;
++
++ while (ioctl(vfd, VIDIOC_QBUF, &buffer)) { // retried only when interrupted by a signal
++ const int err = errno;
++ if (err != EINTR) {
++ request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
++ return -err;
++ }
++ }
++ return 0;
++}
++/* VIDIOC_DQBUF one buffer of f->type (retrying on EINTR) and return the matching in-use entry of bp, found by its plane-0 dmabuf fd, with timestamp and DONE/ERROR status filled in. NULL on ioctl failure or if no entry matches */
++static struct qent_base * qe_dequeue(struct buf_pool *const bp,
++ const int vfd,
++ const struct v4l2_format * const f)
++{
++ int fd;
++ struct qent_base *be;
++ int rc;
++ const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++ struct v4l2_buffer buffer = {
++ .type = f->type,
++ .memory = V4L2_MEMORY_DMABUF
++ };
++ if (mp) {
++ buffer.length = f->fmt.pix_mp.num_planes;
++ buffer.m.planes = planes;
++ }
++
++ while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 &&
++ errno == EINTR)
++ /* Loop */;
++ if (rc) {
++ request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
++ return NULL;
++ }
++
++ fd = mp ? planes[0].m.fd : buffer.m.fd;
++ be = queue_find_extract_fd(bp, fd);
++ if (!be) {
++ request_log("Failed to find fd %d in Q\n", fd);
++ return NULL;
++ }
++
++ be->timestamp = buffer.timestamp;
++ be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
++ return be;
++}
++/* Clear the waiting flag, wake every thread blocked on the entry's cond, then drop one reference via qent_dst_unref() */
++static void qe_dst_done(struct qent_dst * dst_be)
++{
++ pthread_mutex_lock(&dst_be->lock);
++ dst_be->waiting = false;
++ pthread_cond_broadcast(&dst_be->cond);
++ pthread_mutex_unlock(&dst_be->lock);
++
++ qent_dst_unref(&dst_be);
++}
++/* Under dst_be->lock: set waiting to true and return its previous value (true means the entry was already waiting) */
++static bool qe_dst_waiting(struct qent_dst *const dst_be)
++{
++ bool waiting;
++ pthread_mutex_lock(&dst_be->lock);
++ waiting = dst_be->waiting;
++ dst_be->waiting = true;
++ pthread_mutex_unlock(&dst_be->lock);
++ return waiting;
++}
++
++/* True while queue_is_inuse() reports entries on either the src or the dst pool */
++static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
++{
++ return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst);
++}
++/* Poll-task callback (v is the mediabufs_ctl): dequeue a src buffer on POLLOUT and/or a dst buffer on POLLIN, re-add the poll task while buffers remain in use, then recycle the src entry and signal the dst entry. revents == 0 is logged as a timeout */
++static void mediabufs_poll_cb(void * v, short revents)
++{
++ struct mediabufs_ctl *mbc = v;
++ struct qent_src *src_be = NULL;
++ struct qent_dst *dst_be = NULL;
++
++ if (!revents)
++ request_err(mbc->dc, "%s: Timeout\n", __func__);
++
++ pthread_mutex_lock(&mbc->lock);
++ mbc->polling = false;
++
++ if ((revents & POLLOUT) != 0)
++ src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt));
++ if ((revents & POLLIN) != 0)
++ dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt));
++
++ /* Reschedule */
++ if (mediabufs_wants_poll(mbc)) {
++ mbc->polling = true;
++ pollqueue_add_task(mbc->pt, 2000);
++ }
++ pthread_mutex_unlock(&mbc->lock);
++
++ if (src_be) /* done outside mbc->lock */
++ queue_put_free(mbc->src, &src_be->base);
++ if (dst_be)
++ qe_dst_done(dst_be);
++}
++/* Store *timestamp on the src entry (it is copied into the v4l2_buffer when the entry is queued); always returns 0 */
++int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
++{
++ struct qent_base *const be = &be_src->base;
++
++ be->timestamp = *timestamp;
++ return 0;
++}
++/* Return the timestamp currently stored on the dst entry (set from the dequeued v4l2_buffer in qe_dequeue) */
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
++{
++ return be_dst->base.timestamp;
++}
++/* Ensure be->dh[0] exists and holds at least len bytes, reallocating (rounded up) via dbsc if not. Returns 0, or -ENOMEM if growth is needed but dbsc is NULL or the realloc fails */
++static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)
++{
++ if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
++ size_t newsize = round_up_size(len);
++ request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, be->dh[0] ? dmabuf_size(be->dh[0]) : (size_t)0, newsize); /* dh[0] may be NULL here, so do not pass it to dmabuf_size() */
++ if (!dbsc) {
++ request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
++ return -ENOMEM;
++ }
++ if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
++ request_log("%s: Realloc %zd failed\n", __func__, newsize);
++ return -ENOMEM;
++ }
++ }
++ return 0;
++}
++/* Pre-size the src entry's buffer to hold len bytes; thin wrapper over qent_base_realloc() and returns its result */
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++ struct qent_base *const be = &be_src->base;
++ return qent_base_realloc(be, len, dbsc);
++}
++
++/* Copy len bytes from src into the entry's buffer at offset, growing the buffer first when allowed. Returns 0, a -ve code from qent_base_realloc(), or -1 if the buffer cannot be mapped */
++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++ void * dst;
++ struct qent_base *const be = &be_src->base;
++ int rv;
++
++ // Realloc doesn't copy so don't alloc if offset != 0
++ if ((rv = qent_base_realloc(be, offset + len,
++ be_src->fixed_size || offset ? NULL : dbsc)) != 0)
++ return rv;
++ /* Map before starting the write so a map failure cannot leave dmabuf_write_start() without its dmabuf_write_end() */
++ dst = dmabuf_map(be->dh[0]);
++ if (!dst)
++ return -1;
++ dmabuf_write_start(be->dh[0]);
++ memcpy((char*)dst + offset, src, len);
++ dmabuf_len_set(be->dh[0], len); /* NOTE(review): records len, not offset + len - confirm this is intended for offset != 0 */
++ dmabuf_write_end(be->dh[0]);
++ return 0;
++}
++/* Return the dmabuf handle for the given plane, or NULL if plane is beyond the dh[] array (the slot itself may also be NULL) */
++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
++{
++ const struct qent_base *const be = &be_dst->base;
++
++ return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? NULL : be->dh[plane];
++}
++/* dup() the fd of the given plane's dmabuf and return the result. NOTE(review): qent_dst_dmabuf() can return NULL - confirm dmabuf_fd() tolerates that */
++int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
++{
++ return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane)));
++}
++/* Queue dst_be (optional) and *psrc_be to V4L2, the src bound to *pmreq, arm polling if needed and start the request. *pmreq and *psrc_be are always set to NULL. On a queueing failure the request is aborted, the src entry goes back to the free Q and dst_be is marked QENT_ERROR and signalled */
++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
++ struct media_request **const pmreq,
++ struct qent_src **const psrc_be,
++ struct qent_dst *const dst_be,
++ const bool is_final)
++{
++ struct media_request * mreq = *pmreq;
++ struct qent_src *const src_be = *psrc_be;
++
++ // Req & src are always both "consumed"
++ *pmreq = NULL;
++ *psrc_be = NULL;
++
++ pthread_mutex_lock(&mbc->lock);
++
++ if (!src_be)
++ goto fail1;
++
++ if (dst_be) {
++ if (qe_dst_waiting(dst_be)) {
++ request_info(mbc->dc, "Request buffer already waiting on start\n");
++ goto fail1;
++ }
++ dst_be->base.timestamp = (struct timeval){0,0};
++ if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
++ goto fail1;
++
++ qent_dst_ref(dst_be); /* extra ref held while on the in-use Q; dropped by qe_dst_done() */
++ queue_put_inuse(mbc->dst, &dst_be->base);
++ }
++
++ if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final)) /* hold_flag is set for every request except the final one */
++ goto fail1;
++ queue_put_inuse(mbc->src, &src_be->base);
++
++ if (!mbc->polling && mediabufs_wants_poll(mbc)) {
++ mbc->polling = true;
++ pollqueue_add_task(mbc->pt, 2000);
++ }
++ pthread_mutex_unlock(&mbc->lock);
++
++ if (media_request_start(mreq))
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++ return MEDIABUFS_STATUS_SUCCESS;
++
++fail1:
++ media_request_abort(&mreq);
++ if (src_be)
++ queue_put_free(mbc->src, &src_be->base);
++
++// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q
++ if (dst_be) {
++ dst_be->base.status = QENT_ERROR;
++ qe_dst_done(dst_be);
++ }
++ pthread_mutex_unlock(&mbc->lock);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++/* (Re)allocate the entry's dmabufs to the sizeimage of each plane in fmt (one buffer for single-planar). Returns 0, or -1 on failure after freeing the lower-numbered planes */
++static int qe_alloc_from_fmt(struct qent_base *const be,
++ struct dmabufs_ctl *const dbsc,
++ const struct v4l2_format *const fmt)
++{
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ unsigned int i;
++ for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) {
++ be->dh[i] = dmabuf_realloc(dbsc, be->dh[i],
++ fmt->fmt.pix_mp.plane_fmt[i].sizeimage);
++ /* On failure tidy up and die */
++ if (!be->dh[i]) {
++ while (i--) {
++ dmabuf_free(be->dh[i]);
++ be->dh[i] = NULL;
++ }
++ return -1;
++ }
++ }
++ }
++ else {
++// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage);
++ size_t size = fmt->fmt.pix.sizeimage;
++ be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);
++ if (!be->dh[0])
++ return -1;
++ }
++ return 0;
++}
++/* Reset *fmt to buftype, fill in pixfmt/width/height (and bufsize if non-zero) and VIDIOC_S_FMT it on fd, retrying on EINTR. Fails with UNSUPPORTED_BUFFERTYPE if the driver returns a smaller size or a different pixel format */
++static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
++ const enum v4l2_buf_type buftype,
++ uint32_t pixfmt,
++ const unsigned int width, const unsigned int height,
++ const size_t bufsize)
++{
++ *fmt = (struct v4l2_format){.type = buftype};
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++ fmt->fmt.pix_mp.width = width;
++ fmt->fmt.pix_mp.height = height;
++ fmt->fmt.pix_mp.pixelformat = pixfmt;
++ if (bufsize) {
++ fmt->fmt.pix_mp.num_planes = 1;
++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
++ }
++ }
++ else {
++ fmt->fmt.pix.width = width;
++ fmt->fmt.pix.height = height;
++ fmt->fmt.pix.pixelformat = pixfmt;
++ fmt->fmt.pix.sizeimage = bufsize;
++ }
++
++ while (ioctl(fd, VIDIOC_S_FMT, fmt))
++ if (errno != EINTR)
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++ // Treat anything where we don't get at least what we asked for as a fail
++ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++ if (fmt->fmt.pix_mp.width < width ||
++ fmt->fmt.pix_mp.height < height ||
++ fmt->fmt.pix_mp.pixelformat != pixfmt) {
++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++ }
++ }
++ else {
++ if (fmt->fmt.pix.width < width ||
++ fmt->fmt.pix.height < height ||
++ fmt->fmt.pix.pixelformat != pixfmt) {
++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++ }
++ }
++
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++/* Enumerate formats of type_v4l2 on fd; for the first one whose flags include flags_must, exclude flags_not, pass accept_fn and can be set at width x height via fmt_set(), return SUCCESS with *fmt filled. Returns UNSUPPORTED_BUFFERTYPE when the enumeration ioctl fails */
++static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
++ const int fd,
++ const unsigned int type_v4l2,
++ const uint32_t flags_must,
++ const uint32_t flags_not,
++ const unsigned int width,
++ const unsigned int height,
++ mediabufs_dst_fmt_accept_fn *const accept_fn,
++ void *const accept_v)
++{
++ unsigned int i;
++
++ for (i = 0;; ++i) {
++ struct v4l2_fmtdesc fmtdesc = {
++ .index = i,
++ .type = type_v4l2
++ };
++ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++ if (errno != EINTR)
++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++ }
++ if ((fmtdesc.flags & flags_must) != flags_must ||
++ (fmtdesc.flags & flags_not))
++ continue;
++ if (!accept_fn(accept_v, &fmtdesc))
++ continue;
++
++ if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat,
++ width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
++ return MEDIABUFS_STATUS_SUCCESS;
++ }
++ return 0; /* not reached: the loop above only exits via return */
++}
++
++
++/* Wait for qent done: block until waiting is cleared (or pthread_cond_wait fails), then map the entry status to a MediaBufsStatus */
++
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
++{
++ struct qent_base *const be = &be_dst->base;
++ enum qent_status estat;
++
++ pthread_mutex_lock(&be_dst->lock);
++ while (be_dst->waiting &&
++ !pthread_cond_wait(&be_dst->cond, &be_dst->lock))
++ /* Loop */;
++ estat = be->status;
++ pthread_mutex_unlock(&be_dst->lock);
++
++ return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS :
++ estat == QENT_ERROR ? MEDIABUFS_ERROR_DECODING_ERROR :
++ MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++/* Return a CPU mapping (dmabuf_map) of buffer buf_no of the dst entry, or NULL if buf_no is beyond the dh[] array - same bounds rule as qent_dst_dmabuf() */
++const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
++{
++ struct qent_base *const be = &be_dst->base;
++ return buf_no >= sizeof(be->dh)/sizeof(be->dh[0]) ? NULL : dmabuf_map(be->dh[buf_no]);
++}
++/* Call dmabuf_read_start() on each populated plane; if one fails, dmabuf_read_end() the planes already started and return ALLOCATION_FAILED */
++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
++{
++ struct qent_base *const be = &be_dst->base;
++ unsigned int i;
++ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++ if (dmabuf_read_start(be->dh[i])) {
++ while (i--)
++ dmabuf_read_end(be->dh[i]);
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++ }
++ }
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++/* Call dmabuf_read_end() on every populated plane (all are attempted); returns OPERATION_FAILED if any of them fails */
++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
++{
++ struct qent_base *const be = &be_dst->base;
++ unsigned int i;
++ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
++
++ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++ if (dmabuf_read_end(be->dh[i]))
++ status = MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++ return status;
++}
++/* Take an extra reference on be_dst (NULL is accepted and ignored); returns be_dst */
++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
++{
++ if (be_dst)
++ atomic_fetch_add(&be_dst->base.ref_count, 1);
++ return be_dst;
++}
++/* Drop a reference and set *pbe_dst to NULL. When the count was already 0 (the last ref) the entry goes back to its mbc's dst free Q if the weak link can still be locked, otherwise it is freed */
++void qent_dst_unref(struct qent_dst ** const pbe_dst)
++{
++ struct qent_dst * const be_dst = *pbe_dst;
++ struct mediabufs_ctl * mbc;
++ if (!be_dst)
++ return;
++ *pbe_dst = NULL;
++
++ if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0) /* ref_count 0 means a single ref, so only a previous value of 0 is the last ref */
++ return;
++
++ if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) {
++ queue_put_free(mbc->dst, &be_dst->base);
++ ff_weak_link_unlock(be_dst->mbc_wl);
++ }
++ else {
++ qe_dst_free(be_dst);
++ }
++}
++/* Attach an externally supplied dmabuf fd as the given plane of an import-type entry. Returns OPERATION_FAILED if plane is out of range, the entry is not QENT_IMPORT or the plane is already set; ALLOCATION_FAILED if dmabuf_import() fails */
++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
++ unsigned int plane,
++ int fd, size_t size)
++{
++ struct qent_base *const be = &be_dst->base;
++ struct dmabuf_h * dh;
++
++ if (plane >= sizeof(be->dh)/sizeof(be->dh[0]) || be->status != QENT_IMPORT || be->dh[plane]) /* range check must come before dh[plane] is read */
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++ dh = dmabuf_import(fd, size);
++ if (!dh)
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++
++ be->dh[plane] = dh;
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++
++// Returns noof buffers created, -ve for error
++static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
++{
++ unsigned int i;
++
++ struct v4l2_create_buffers cbuf = {
++ .count = n,
++ .memory = V4L2_MEMORY_DMABUF,
++ .format = mbc->dst_fmt,
++ };
++
++ while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
++ const int err = -errno;
++ if (err != EINTR) {
++ request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
++ return -err;
++ }
++ }
++
++ if (cbuf.count != n)
++ request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
++
++ for (i = 0; i != cbuf.count; ++i)
++ qes[i]->base.index = cbuf.index + i;
++
++ return cbuf.count;
++}
++/* Get a dst entry with buffers sized for dst_fmt. mbc == NULL yields a bare QENT_IMPORT entry. With dst_fixed the entry comes from queue_get_free(); otherwise a free entry is tried first and a new entry plus V4L2 slot is created if none is available. Returns NULL on failure */
++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
++{
++ struct qent_dst * be_dst;
++
++ if (mbc == NULL) {
++ be_dst = qe_dst_new(NULL);
++ if (be_dst)
++ be_dst->base.status = QENT_IMPORT;
++ return be_dst;
++ }
++
++ if (mbc->dst_fixed) {
++ be_dst = base_to_dst(queue_get_free(mbc->dst));
++ if (!be_dst)
++ return NULL;
++ }
++ else {
++ be_dst = base_to_dst(queue_tryget_free(mbc->dst));
++ if (!be_dst) {
++ be_dst = qe_dst_new(mbc->this_wlm);
++ if (!be_dst)
++ return NULL;
++
++ if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
++ qe_dst_free(be_dst);
++ return NULL;
++ }
++ }
++ }
++
++ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
++ /* Given how create buf works we can't uncreate it on alloc failure
++ * all we can do is put it on the free Q
++ */
++ queue_put_free(mbc->dst, &be_dst->base);
++ return NULL;
++ }
++
++ be_dst->base.status = QENT_PENDING;
++ atomic_store(&be_dst->base.ref_count, 0); /* 0 == exactly one reference, owned by the caller */
++ return be_dst;
++}
++/* Return a pointer to the dst format stored in mbc (not a copy) */
++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
++{
++ return &mbc->dst_fmt;
++}
++/* Negotiate the dst format: try formats without V4L2_FMT_FLAG_EMULATED first, then emulated ones, using find_fmt_flags(). Returns the first status other than UNSUPPORTED_BUFFERTYPE, else UNSUPPORTED_BUFFERTYPE */
++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
++ const unsigned int width,
++ const unsigned int height,
++ mediabufs_dst_fmt_accept_fn *const accept_fn,
++ void *const accept_v)
++{
++ MediaBufsStatus status;
++ unsigned int i;
++ const enum v4l2_buf_type buf_type = mbc->dst_fmt.type;
++ static const struct {
++ unsigned int flags_must;
++ unsigned int flags_not;
++ } trys[] = {
++ {0, V4L2_FMT_FLAG_EMULATED},
++ {V4L2_FMT_FLAG_EMULATED, 0},
++ };
++ for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) {
++ status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd,
++ buf_type,
++ trys[i].flags_must,
++ trys[i].flags_not,
++ width, height, accept_fn, accept_v);
++ if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE)
++ return status;
++ }
++
++ if (status != MEDIABUFS_STATUS_SUCCESS) /* always true here: the loop returns every status other than UNSUPPORTED_BUFFERTYPE */
++ return status;
++
++ /* Try to create a buffer - don't alloc */
++ return status;
++}
++// Create n (max 32) dst qents plus V4L2 slots and put them on the dst free Q; fixed is stored in mbc->dst_fixed on success
++// ** This is a mess if we get partial alloc but without any way to remove
++// individual V4L2 Q members we are somewhat stuffed
++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed)
++{
++ unsigned int i;
++ int a = 0;
++ unsigned int qc;
++ struct qent_dst * qes[32];
++
++ if (n > 32)
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++
++ // Create qents first as it is hard to get rid of the V4L2 buffers on error
++ for (qc = 0; qc != n; ++qc)
++ {
++ if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL)
++ goto fail;
++ }
++
++ if ((a = create_dst_bufs(mbc, n, qes)) < 0)
++ goto fail;
++
++ for (i = 0; i != a; ++i)
++ queue_put_free(mbc->dst, &qes[i]->base);
++
++ if (a != n) /* partial create: the first a entries stay on the free Q, the rest are freed below */
++ goto fail;
++
++ mbc->dst_fixed = fixed;
++ return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++ for (i = (a < 0 ? 0 : a); i != qc; ++i)
++ qe_dst_free(qes[i]);
++
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++}
++/* Take an entry from the src free Q via queue_get_free() and mark it QENT_PENDING; returns NULL if no entry could be obtained */
++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
++{
++ struct qent_base * buf = queue_get_free(mbc->src);
++ if (buf) buf->status = QENT_PENDING; /* queue_get_free() is NULL-checked by mediabufs_dst_qent_alloc too */
++ return base_to_src(buf);
++}
++/* Return an unused src entry to the src free Q and set *pqe_src to NULL; a NULL *pqe_src is ignored */
++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
++{
++ struct qent_src *const qe_src = *pqe_src;
++ if (!qe_src)
++ return;
++ *pqe_src = NULL;
++ queue_put_free(mbc->src, &qe_src->base);
++}
++
++/* Request n DMABUF src slots with VIDIOC_REQBUFS and fill the src free Q with entries allocated from src_fmt (fewer if the driver grants fewer); src format must have been set up before this */
++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
++ struct dmabufs_ctl * const dbsc,
++ unsigned int n)
++{
++ unsigned int i;
++ struct v4l2_requestbuffers req = {
++ .count = n,
++ .type = mbc->src_fmt.type,
++ .memory = V4L2_MEMORY_DMABUF
++ };
++
++ bq_free_all_free_src(mbc->src);
++ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
++ if (errno != EINTR) {
++ request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++ }
++
++ if (n > req.count) {
++ request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
++ n = req.count;
++ }
++
++ for (i = 0; i != n; ++i) {
++ struct qent_src *const be_src = qe_src_new();
++ if (!be_src) {
++ request_err(mbc->dc, "Failed to create src be %d\n", i);
++ goto fail;
++ }
++ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
++ qe_src_free(be_src);
++ goto fail;
++ }
++ be_src->base.index = i;
++ be_src->fixed_size = !mediabufs_src_resizable(mbc);
++
++ queue_put_free(mbc->src, &be_src->base);
++ }
++
++ return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++ bq_free_all_free_src(mbc->src);
++ req.count = 0; /* REQBUFS with count 0 releases the slots requested above */
++ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
++ errno == EINTR)
++ /* Loop */;
++
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++
++/* Turn streaming on for the src then the dst queue; no-op if already on. If dst fails, src is turned back off */
++/*
++ * Set stuff order:
++ * Set src fmt
++ * Set parameters (sps) on vfd
++ * Negotiate dst format (dst_fmt_set)
++ * Create src buffers
++ * Alloc a dst buffer or Create dst slots
++*/
++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
++{
++ if (mbc->stream_on)
++ return MEDIABUFS_STATUS_SUCCESS;
++
++ if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) {
++ request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) {
++ request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type);
++ set_stream(mbc->vfd, mbc->src_fmt.type, false);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ mbc->stream_on = true;
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++/* Turn streaming off for dst then src; no-op if not on. Both are always attempted and stream_on is cleared even if one fails, in which case OPERATION_FAILED is returned */
++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc)
++{
++ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
++
++ if (!mbc->stream_on)
++ return MEDIABUFS_STATUS_SUCCESS;
++
++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) {
++ request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type);
++ status = MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) {
++ request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type);
++ status = MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ mbc->stream_on = false;
++ return status;
++}
++/* Set n extended controls on the device with VIDIOC_S_EXT_CTRLS (retrying on EINTR); if mreq is non-NULL they are attached to that request instead of applied directly. Returns 0 or -errno */
++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n)
++{
++ struct v4l2_ext_controls controls = {
++ .controls = control_array,
++ .count = n
++ };
++
++ if (mreq) {
++ controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
++ controls.request_fd = media_request_fd(mreq);
++ }
++
++ while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls))
++ {
++ const int err = errno;
++ if (err != EINTR) {
++ request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err));
++ return -err;
++ }
++ }
++
++ return 0;
++}
++/* Set a single pointer-type extended control (id, data, size) via mediabufs_ctl_set_ext_ctrls(); any error is reported as OPERATION_FAILED */
++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
++ struct media_request * const mreq,
++ unsigned int id, void *data,
++ unsigned int size)
++{
++ struct v4l2_ext_control control = {
++ .id = id,
++ .ptr = data,
++ .size = size
++ };
++
++ int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1);
++ return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++/* Set the src format on the device via fmt_set(), storing the result in mbc->src_fmt; logs on failure and returns fmt_set()'s status */
++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
++ enum v4l2_buf_type buf_type,
++ const uint32_t pixfmt,
++ const uint32_t width, const uint32_t height,
++ const size_t bufsize)
++{
++ MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize);
++ if (rv != MEDIABUFS_STATUS_SUCCESS)
++ request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height);
++
++ return rv;
++}
++/* Run VIDIOC_QUERY_EXT_CTRL on each of the n entries of ctrls[] (retrying on EINTR). A failed entry gets type = 0 and the remaining entries are still queried; returns 0 or the -errno of the last failure */
++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n)
++{
++ int rv = 0;
++ while (n--) {
++ while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) {
++ const int err = errno;
++ if (err != EINTR) {
++ // Often used for probing - errors are to be expected
++ request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err);
++ ctrls->type = 0; // 0 is invalid
++ rv = -err;
++ break;
++ }
++ }
++ ++ctrls;
++ }
++ return rv;
++}
++/* Non-zero if src buffers may be larger than the negotiated size, i.e. the src queue type is multiplanar */
++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
++{
++ // Single planar OUTPUT can only take exact size buffers
++ // Multiplanar will take larger than negotiated
++ return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
++}
++/* Tear down an mbc: break the weak link, delete the poll task, stop streaming, release V4L2 buffers, free queued src/dst entries, complete in-use dst entries as QENT_ERROR, then delete the queues, close the fd and free mbc. NULL is ignored */
++static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
++{
++ if (!mbc)
++ return;
++
++ // Break the weak link first
++ ff_weak_link_break(&mbc->this_wlm);
++
++ polltask_delete(&mbc->pt);
++
++ mediabufs_stream_off(mbc);
++
++ // Empty v4l2 buffer stash
++ request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0);
++ request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0);
++
++ bq_free_all_free_src(mbc->src);
++ bq_free_all_inuse_src(mbc->src);
++ bq_free_all_free_dst(mbc->dst);
++
++ {
++ struct qent_dst *dst_be;
++ while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) {
++ dst_be->base.timestamp = (struct timeval){0};
++ dst_be->base.status = QENT_ERROR;
++ qe_dst_done(dst_be); /* wakes any waiter and drops the in-use reference */
++ }
++ }
++
++ queue_delete(mbc->dst);
++ queue_delete(mbc->src);
++ close(mbc->vfd);
++ pthread_mutex_destroy(&mbc->lock);
++
++ free(mbc);
++}
++/* Take an extra reference on mbc and return it; mbc must not be NULL */
++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc)
++{
++ atomic_fetch_add(&mbc->ref_count, 1);
++ return mbc;
++}
++/* Drop a reference and set *pmbc to NULL; the mbc is deleted when the count was already 0 (0 means a single ref). A NULL *pmbc is ignored */
++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
++{
++ struct mediabufs_ctl *const mbc = *pmbc;
++ int n;
++
++ if (!mbc)
++ return;
++ *pmbc = NULL;
++ n = atomic_fetch_sub(&mbc->ref_count, 1);
++ if (n)
++ return;
++ mediabufs_ctl_delete(mbc);
++}
++/* Return the version field of the capability struct read by VIDIOC_QUERYCAP in set_capabilities() */
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
++{
++ return mbc->capability.version;
++}
++/* Query device capabilities and pick the src/dst buffer types: multiplanar M2M if offered, else single-planar M2M. Returns 0, -errno if QUERYCAP fails, or -EINVAL if the device is not M2M */
++static int set_capabilities(struct mediabufs_ctl *const mbc)
++{
++ uint32_t caps;
++
++ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) {
++ int err = errno;
++ request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
++ return -err;
++ }
++
++ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
++ mbc->capability.device_caps :
++ mbc->capability.capabilities;
++
++ if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
++ }
++ else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) {
++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
++ }
++ else {
++ request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps);
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++/* One of these per context. Opens vpath (retrying on EINTR), checks M2M capabilities and creates the src/dst pools, poll task and weak link; returns NULL on any failure */
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq)
++{
++ struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc));
++
++ if (!mbc)
++ return NULL;
++
++ mbc->dc = dc;
++ // Default mono planar
++ mbc->pq = pq;
++ pthread_mutex_init(&mbc->lock, NULL);
++
++ /* Pick a default - could we scan for this? */
++ if (vpath == NULL)
++ vpath = "/dev/media0";
++
++ while ((mbc->vfd = open(vpath, O_RDWR)) == -1)
++ {
++ const int err = errno;
++ if (err != EINTR) {
++ request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err));
++ goto fail0;
++ }
++ }
++
++ if (set_capabilities(mbc)) {
++ request_err(dc, "Bad capabilities for video dev '%s'\n", vpath);
++ goto fail1;
++ }
++
++ mbc->src = queue_new(mbc->vfd);
++ if (!mbc->src)
++ goto fail1;
++ mbc->dst = queue_new(mbc->vfd);
++ if (!mbc->dst)
++ goto fail2;
++ mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc);
++ if (!mbc->pt)
++ goto fail3;
++ mbc->this_wlm = ff_weak_link_new(mbc);
++ if (!mbc->this_wlm)
++ goto fail4;
++
++ /* Cannot add polltask now - polling with nothing pending
++ * generates infinite error polls
++ */
++ return mbc;
++
++fail4: /* labels unwind in reverse order of construction */
++ polltask_delete(&mbc->pt);
++fail3:
++ queue_delete(mbc->dst);
++fail2:
++ queue_delete(mbc->src);
++fail1:
++ close(mbc->vfd);
++fail0: /* NOTE(review): mbc->lock was initialised above but is not destroyed on this path */
++ free(mbc);
++ request_info(dc, "%s: FAILED\n", __func__);
++ return NULL;
++}
++
++
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_media.h
+@@ -0,0 +1,154 @@
++/*
++e.h
++*
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef _MEDIA_H_
++#define _MEDIA_H_
++
++#include <stdbool.h>
++#include <stdint.h>
++
++struct v4l2_format;
++struct v4l2_fmtdesc;
++struct v4l2_query_ext_ctrl;
++
++struct pollqueue;
++struct media_request;
++struct media_pool;
++/* Status codes returned by the mediabufs_* and qent_* functions; success is 0, every other value is an error */
++typedef enum media_buf_status {
++ MEDIABUFS_STATUS_SUCCESS = 0,
++ MEDIABUFS_ERROR_OPERATION_FAILED,
++ MEDIABUFS_ERROR_DECODING_ERROR,
++ MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
++ MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
++ MEDIABUFS_ERROR_ALLOCATION_FAILED,
++} MediaBufsStatus;
++
++struct media_pool * media_pool_new(const char * const media_path,
++ struct pollqueue * const pq,
++ const unsigned int n);
++void media_pool_delete(struct media_pool ** pmp);
++
++// Obtain a media request
++// Will block if none available - has a 2sec timeout
++struct media_request * media_request_get(struct media_pool * const mp);
++int media_request_fd(const struct media_request * const req);
++
++// Start this request
++// Request structure is returned to pool once done
++int media_request_start(struct media_request * const req);
++
++// Return an *unstarted* media_request to the pool
++// May later be upgraded to allow for aborting a started req
++int media_request_abort(struct media_request ** const preq);
++
++
++struct mediabufs_ctl;
++struct qent_src;
++struct qent_dst;
++struct dmabuf_h;
++struct dmabufs_ctl;
++
++int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
++
++// prealloc
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
++// dbsc may be NULL if realloc not required
++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
++int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
++void qent_dst_delete(struct qent_dst *const be);
++// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
++void qent_dst_unref(struct qent_dst ** const pbe_dst);
++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
++
++const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
++/* Import an fd unattached to any mediabuf */
++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
++ unsigned int plane,
++ int fd, size_t size);
++
++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
++ struct media_request **const pmreq,
++ struct qent_src **const psrc_be,
++ struct qent_dst *const dst_be,
++ const bool is_final);
++// Get / alloc a dst buffer & associate with a slot
++// If the dst pool is empty then behaviour depends on the fixed flag passed to
++// dst_slots_create. Default is !fixed = unlimited alloc
++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
++ struct dmabufs_ctl *const dbsc);
++// Create dst slots without alloc
++// If fixed true then qent_alloc will only get slots from this pool and will
++// block until a qent has been unrefed
++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed);
++
++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc);
++
++typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc);
++
++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
++ const unsigned int width,
++ const unsigned int height,
++ mediabufs_dst_fmt_accept_fn *const accept_fn,
++ void *const accept_v);
++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc);
++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src);
++
++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq,
++ struct v4l2_ext_control control_array[], unsigned int n);
++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
++ struct media_request * const mreq,
++ unsigned int id, void *data,
++ unsigned int size);
++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n);
++
++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc);
++
++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
++ enum v4l2_buf_type buf_type,
++ const uint32_t pixfmt,
++ const uint32_t width, const uint32_t height,
++ const size_t bufsize);
++
++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
++ struct dmabufs_ctl * const dbsc,
++ unsigned int n);
++
++#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
++
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
++ const char *vpath, struct pollqueue *const pq);
++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
++
++
++#endif
+--- /dev/null
++++ b/libavcodec/v4l2_req_pollqueue.c
+@@ -0,0 +1,361 @@
++#include <errno.h>
++#include <limits.h>
++#include <poll.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <stdint.h>
++#include <stdio.h>
++#include <string.h>
++#include <unistd.h>
++#include <sys/eventfd.h>
++
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++
++
++struct pollqueue;
++
++enum polltask_state {
++ POLLTASK_UNQUEUED = 0, // not on the queue; also the zero-initialised state of a new task
++ POLLTASK_QUEUED, // linked into the queue by pollqueue_add_task()
++ POLLTASK_RUNNING, // callback is being invoked by poll_thread()
++ POLLTASK_Q_KILL, // polltask_delete() was called while the task was not running
++ POLLTASK_RUN_KILL, // polltask_delete() was called while running, or a Q_KILL task whose callback fired
++};
++
++struct polltask {
++ struct polltask *next; // doubly-linked queue links, cleared by pollqueue_rem_task()
++ struct polltask *prev;
++ struct pollqueue *q; // owning queue; a reference is taken in polltask_new()
++ enum polltask_state state; // after creation only changed with q->lock held
++
++ int fd; // descriptor and event mask handed to poll()
++ short events;
++
++ void (*fn)(void *v, short revents); // callback run by poll_thread() on an event or timeout
++ void * v; // opaque argument passed to fn
++
++ uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */
++ sem_t kill_sem; // posted by poll_thread() to release a waiting polltask_delete()
++};
++
++struct pollqueue {
++ atomic_int ref_count; // 0 means one reference: pollqueue_unref() frees when the pre-decrement value is 0
++ pthread_mutex_t lock; // held while the task list or task states are read or changed
++
++ struct polltask *head; // tasks are appended at tail by pollqueue_add_task()
++ struct polltask *tail;
++
++ bool kill; // set by pollqueue_free() to end the worker loop
++ bool no_prod; // true while poll_thread() dispatches callbacks; suppresses wake-up writes
++ int prod_fd; // eventfd written by pollqueue_prod() to wake poll()
++ struct polltask *prod_pt; // internal task watching prod_fd
++ pthread_t worker; // thread running poll_thread()
++};
++
++struct polltask *polltask_new(struct pollqueue *const pq,
++ const int fd, const short events,
++ void (*const fn)(void *v, short revents),
++ void *const v)
++{ // Allocate an unqueued task for fd/events bound to pq; returns NULL on failure
++ struct polltask *pt;
++ // An empty event mask is rejected
++ if (!events)
++ return NULL;
++
++ pt = malloc(sizeof(*pt));
++ if (!pt)
++ return NULL;
++ // .state and .timeout are left zero: POLLTASK_UNQUEUED and "never"
++ *pt = (struct polltask){
++ .next = NULL,
++ .prev = NULL,
++ .q = pollqueue_ref(pq), // reference dropped again in polltask_delete()
++ .fd = fd,
++ .events = events,
++ .fn = fn,
++ .v = v
++ };
++
++ sem_init(&pt->kill_sem, 0, 0); // starts at 0 so polltask_delete() blocks until poll_thread() posts
++
++ return pt;
++}
++
++// Unlink pt from pq's doubly-linked task list and clear its link pointers
++static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt)
++{
++    struct polltask *const before = pt->prev;
++    struct polltask *const after = pt->next;
++
++    // Each side is patched either in the neighbour or in the queue's own end pointer
++    *(before ? &before->next : &pq->head) = after;
++    *(after ? &after->prev : &pq->tail) = before;
++
++    pt->prev = NULL;
++    pt->next = NULL;
++}
++
++static void polltask_free(struct polltask * const pt)
++{ // Destroy the semaphore and release the task memory; the queue and its refcount are not touched
++ sem_destroy(&pt->kill_sem);
++ free(pt);
++}
++
++static int pollqueue_prod(const struct pollqueue *const pq)
++{ // Wake the worker's poll() by writing a 64-bit 1 to prod_fd; returns the result of write()
++ static const uint64_t one = 1;
++ return write(pq->prod_fd, &one, sizeof(one));
++}
++
++void polltask_delete(struct polltask **const ppt)
++{ // Delete *ppt (NULL is ignored) once poll_thread() has let go of it, then set *ppt to NULL
++ struct polltask *const pt = *ppt;
++ struct pollqueue * pq;
++ enum polltask_state state;
++ bool prodme;
++
++ if (!pt)
++ return;
++
++ pq = pt->q;
++ pthread_mutex_lock(&pq->lock);
++ state = pt->state; // remember the state the task was in before marking it
++ pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL;
++ prodme = !pq->no_prod;
++ pthread_mutex_unlock(&pq->lock);
++ // A task that was never queued is unknown to the worker, so nothing will post kill_sem
++ if (state != POLLTASK_UNQUEUED) {
++ if (prodme)
++ pollqueue_prod(pq); // wake poll() so the worker sees the kill state
++ while (sem_wait(&pt->kill_sem) && errno == EINTR) // NOTE(review): if called from the task's own callback this waits for a post that only happens after the callback returns
++ /* loop */;
++ }
++
++ // Leave zapping the ref until we have DQed the PT as might well be
++ // legitimately used in it
++ *ppt = NULL;
++ polltask_free(pt);
++ pollqueue_unref(&pq);
++}
++
++// CLOCK_MONOTONIC time in ms plus timeout; never 0 unless the clock read fails
++static uint64_t pollqueue_now(int timeout)
++{
++    struct timespec ts;
++    uint64_t ms;
++    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
++        return 0;
++    ms = (uint64_t)ts.tv_sec * 1000 + (ts.tv_nsec / 1000000) + timeout;
++    return ms != 0 ? ms : (uint64_t)1;
++}
++
++void pollqueue_add_task(struct polltask *const pt, const int timeout)
++{ // Append pt to the tail of its queue; timeout is in ms from now, negative means no timeout
++ bool prodme = false;
++ struct pollqueue * const pq = pt->q;
++
++ pthread_mutex_lock(&pq->lock);
++ if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) { // a task being deleted is never re-queued
++ if (pq->tail)
++ pq->tail->next = pt;
++ else
++ pq->head = pt;
++ pt->prev = pq->tail;
++ pt->next = NULL;
++ pt->state = POLLTASK_QUEUED; // NOTE(review): no check that pt is not already QUEUED — presumably callers never double-add; confirm
++ pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout);
++ pq->tail = pt;
++ prodme = !pq->no_prod; // no wake-up needed while the worker is dispatching callbacks
++ }
++ pthread_mutex_unlock(&pq->lock);
++ if (prodme)
++ pollqueue_prod(pq);
++}
++
++// Worker thread: polls the fd of every queued task and runs the callbacks of tasks that fired or timed out
++static void *poll_thread(void *v)
++{
++    struct pollqueue *const pq = v;
++    struct pollfd *a = NULL;
++    size_t asize = 0;
++
++    pthread_mutex_lock(&pq->lock);
++    do {
++        unsigned int i;
++        unsigned int n = 0;
++        struct polltask *pt;
++        struct polltask *pt_next;
++        uint64_t now = pollqueue_now(0);
++        int timeout = -1;
++
++        for (pt = pq->head; pt; pt = pt_next) {
++            int64_t t;
++
++            pt_next = pt->next;
++            if (pt->state == POLLTASK_Q_KILL) {
++                pollqueue_rem_task(pq, pt);
++                sem_post(&pt->kill_sem);
++                continue;
++            }
++
++            if (n >= asize) {
++                // Grow via a temporary: on failure 'a' must stay valid so it is freed on exit
++                const size_t nsize = asize ? asize * 2 : 4;
++                struct pollfd *const na = realloc(a, nsize * sizeof(*a));
++                if (!na) {
++                    request_log("Failed to realloc poll array to %zu\n", nsize);
++                    goto fail_locked;
++                }
++                a = na;
++                asize = nsize;
++            }
++
++            a[n++] = (struct pollfd){
++                .fd = pt->fd,
++                .events = pt->events
++            };
++
++            t = (int64_t)(pt->timeout - now);
++            if (pt->timeout && t < INT_MAX &&
++                (timeout < 0 || (int)t < timeout))
++                timeout = (t < 0) ? 0 : (int)t;
++        }
++        pthread_mutex_unlock(&pq->lock);
++
++        if (poll(a, n, timeout) == -1 && errno != EINTR) {
++            request_log("Poll error: %s\n", strerror(errno));
++            goto fail_unlocked;
++        }
++
++        pthread_mutex_lock(&pq->lock);
++        now = pollqueue_now(0);
++
++        /* Prodding in this loop is pointless and might lead to
++         * infinite looping
++         */
++        pq->no_prod = true;
++        for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) {
++            pt_next = pt->next;
++
++            /* Pending? */
++            if (a[i].revents ||
++                (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
++                pollqueue_rem_task(pq, pt);
++                if (pt->state == POLLTASK_QUEUED)
++                    pt->state = POLLTASK_RUNNING;
++                if (pt->state == POLLTASK_Q_KILL)
++                    pt->state = POLLTASK_RUN_KILL;
++                pthread_mutex_unlock(&pq->lock);
++
++                /* This can add new entries to the Q but as
++                 * those are added to the tail our existing
++                 * chain remains intact
++                 */
++                pt->fn(pt->v, a[i].revents);
++
++                pthread_mutex_lock(&pq->lock);
++                if (pt->state == POLLTASK_RUNNING)
++                    pt->state = POLLTASK_UNQUEUED;
++                if (pt->state == POLLTASK_RUN_KILL)
++                    sem_post(&pt->kill_sem);
++            }
++        }
++        pq->no_prod = false;
++
++    } while (!pq->kill);
++
++fail_locked:
++    pthread_mutex_unlock(&pq->lock);
++fail_unlocked:
++    free(a);
++    return NULL;
++}
++
++static void prod_fn(void *v, short revents)
++{ // Callback of the internal prod task: drain the eventfd and re-queue the task unless shutting down
++ struct pollqueue *const pq = v;
++ char buf[8];
++ if (revents)
++ read(pq->prod_fd, buf, 8); // the 8-byte value and the read result are deliberately discarded
++ if (!pq->kill)
++ pollqueue_add_task(pq->prod_pt, -1); // -1 => no timeout
++}
++
++/* Create a pollqueue and start its worker thread. The caller gets the single
++ * initial reference (ref_count 0 == one ref). Returns NULL on failure. */
++struct pollqueue * pollqueue_new(void)
++{
++    struct pollqueue *pq = malloc(sizeof(*pq));
++    if (!pq)
++        return NULL;
++    *pq = (struct pollqueue){
++        .ref_count = ATOMIC_VAR_INIT(0),
++        .lock = PTHREAD_MUTEX_INITIALIZER,
++        .head = NULL, .tail = NULL,
++        .kill = false, .prod_fd = -1
++    };
++
++    pq->prod_fd = eventfd(0, EFD_NONBLOCK);
++    if (pq->prod_fd == -1) // eventfd() fails with -1; 1 is a valid descriptor
++        goto fail1;
++    pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq);
++    if (!pq->prod_pt)
++        goto fail2;
++    pollqueue_add_task(pq->prod_pt, -1);
++    if (pthread_create(&pq->worker, NULL, poll_thread, pq))
++        goto fail3;
++    // Drop the ref polltask_new() took for prod_pt: the queue owns that task
++    atomic_store(&pq->ref_count, 0);
++    return pq;
++
++fail3:
++    polltask_free(pq->prod_pt);
++fail2:
++    close(pq->prod_fd);
++fail1:
++    free(pq);
++    return NULL;
++}
++
++static void pollqueue_free(struct pollqueue *const pq)
++{ // Stop the worker thread and release everything owned by the queue
++ void *rv;
++
++ pthread_mutex_lock(&pq->lock);
++ pq->kill = true;
++ pollqueue_prod(pq); // wake poll() so the worker re-checks pq->kill
++ pthread_mutex_unlock(&pq->lock);
++ // NOTE(review): joins the worker, so this must not run on the worker thread itself — confirm no callback drops the last ref
++ pthread_join(pq->worker, &rv);
++ polltask_free(pq->prod_pt); // freed directly: its queue ref was cancelled in pollqueue_new()
++ pthread_mutex_destroy(&pq->lock);
++ close(pq->prod_fd);
++ free(pq);
++}
++
++struct pollqueue * pollqueue_ref(struct pollqueue *const pq)
++{ // Take an extra reference on pq and return pq; pq must not be NULL (it is dereferenced)
++ atomic_fetch_add(&pq->ref_count, 1);
++ return pq;
++}
++
++void pollqueue_unref(struct pollqueue **const ppq)
++{ // Drop the caller's reference, set *ppq to NULL, and free the queue if that was the last reference
++ struct pollqueue * const pq = *ppq;
++
++ if (!pq)
++ return;
++ *ppq = NULL;
++ // atomic_fetch_sub returns the previous value; 0 is the single-reference state
++ if (atomic_fetch_sub(&pq->ref_count, 1) != 0)
++ return;
++
++ pollqueue_free(pq);
++}
++
++
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_pollqueue.h
+@@ -0,0 +1,18 @@
++#ifndef POLLQUEUE_H_
++#define POLLQUEUE_H_
++
++struct polltask;
++struct pollqueue;
++
++struct polltask *polltask_new(struct pollqueue *const pq,
++ const int fd, const short events,
++ void (*const fn)(void *v, short revents),
++ void *const v);
++void polltask_delete(struct polltask **const ppt);
++
++void pollqueue_add_task(struct polltask *const pt, const int timeout);
++struct pollqueue * pollqueue_new(void);
++void pollqueue_unref(struct pollqueue **const ppq);
++struct pollqueue * pollqueue_ref(struct pollqueue *const pq);
++
++#endif /* POLLQUEUE_H_ */
+--- /dev/null
++++ b/libavcodec/v4l2_req_utils.h
+@@ -0,0 +1,22 @@
++#include "libavutil/log.h"
++
++#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
++
++#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
++#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
++#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
++#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
++
++static inline char safechar(char c) {
++    return (c <= 0x20 || c >= 0x7f) ? '.' : c; /* '.' stands in for anything outside 0x21..0x7e */
++}
++
++static inline const char * strfourcc(char tbuf[5], uint32_t fcc) {
++    unsigned int i;
++    /* Characters come out least-significant byte first; tbuf is returned NUL-terminated */
++    for (i = 0; i != 4; ++i)
++        tbuf[i] = safechar((fcc >> (8 * i)) & 0xff);
++    tbuf[4] = '\0';
++    return tbuf;
++}
++
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.c
+@@ -0,0 +1,315 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++
++#include "decode.h"
++#include "hevcdec.h"
++#include "hwconfig.h"
++
++#include "v4l2_request_hevc.h"
++
++#include "libavutil/hwcontext_drm.h"
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_utils.h"
++
++// Estimate the size in bytes of the coded-data buffer needed for a w x h picture
++static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8)
++{
++    // Widen first so the product is not truncated to unsigned int where size_t is wider
++    const size_t wxh = (size_t)w * h;
++    size_t bits_alloc;
++
++    /* Annex A gives a min compression of 2 @ lvl 3.1 (wxh <= 983040) and
++     * min 4 thereafter but avoid the oddity of 983041 having a lower limit
++     * than 983040. Multiply by 3/2 for 4:2:0
++     */
++    bits_alloc = wxh < 983040 ? wxh * 3 / 4 :
++        wxh < 983040 * 2 ? 983040 * 3 / 4 :
++        wxh * 3 / 8;
++    /* Allow for bit depth */
++    bits_alloc += (bits_alloc * bits_minus8) / 8;
++    /* Add a few bytes (16k) for overhead */
++    bits_alloc += 0x4000;
++    return bits_alloc;
++}
++
++static int v4l2_req_hevc_start_frame(AVCodecContext *avctx,
++ av_unused const uint8_t *buffer,
++ av_unused uint32_t size)
++{ // Forward to start_frame of the function table stored in ctx->fns by v4l2_request_hevc_init()
++ const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->start_frame(avctx, buffer, size);
++}
++
++static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
++{ // Forward to decode_slice of the function table stored in ctx->fns
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->decode_slice(avctx, buffer, size);
++}
++
++static int v4l2_req_hevc_end_frame(AVCodecContext *avctx)
++{ // Forward to end_frame of the function table stored in ctx->fns
++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->end_frame(avctx);
++}
++
++static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx)
++{ // Forward to abort_frame of the function table stored in ctx->fns
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ ctx->fns->abort_frame(avctx);
++}
++
++static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
++{ // Forward to frame_params of the function table stored in ctx->fns
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->frame_params(avctx, hw_frames_ctx);
++}
++
++static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{ // Forward to alloc_frame of the function table stored in ctx->fns
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->alloc_frame(avctx, frame);
++}
++
++
++static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
++{ // hwaccel .uninit: release what v4l2_request_hevc_init() created; always returns 0
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode
++ // Released in the same order as the fail4..fail0 unwind in v4l2_request_hevc_init()
++ mediabufs_ctl_unref(&ctx->mbufs);
++ media_pool_delete(&ctx->mpool);
++ pollqueue_unref(&ctx->pq);
++ dmabufs_ctl_delete(&ctx->dbufs);
++ devscan_delete(&ctx->devscan);
++
++ decode_q_uninit(&ctx->decode_q);
++
++// if (avctx->hw_frames_ctx) {
++// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
++// av_buffer_pool_flush(hwfc->pool);
++// }
++ return 0;
++}
++
++// Format filter passed to mediabufs_dst_fmt_set(): v is the AVCodecContext.
++// Returns 1 if fmtdesc's pixel format suits the SPS luma bit depth, else 0.
++static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc)
++{
++    AVCodecContext *const avctx = v;
++    const HEVCContext *const h = avctx->priv_data;
++    const uint32_t fmt = fmtdesc->pixelformat;
++
++    switch (h->ps.sps->bit_depth) {
++    case 8:
++        return fmt == V4L2_PIX_FMT_NV12_COL128 || fmt == V4L2_PIX_FMT_NV12;
++    case 10:
++        return fmt == V4L2_PIX_FMT_NV12_10_COL128;
++    default:
++        // Any other depth has no acceptable format here
++        return 0;
++    }
++}
++
++static int v4l2_request_hevc_init(AVCodecContext *avctx)
++{ // hwaccel .init: find a V4L2 HEVC device, set up buffers and formats, probe the API version; 0 or an AVERROR
++ const HEVCContext *h = avctx->priv_data;
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ const HEVCSPS * const sps = h->ps.sps;
++ int ret;
++ const struct decdev * decdev;
++ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes
++ size_t src_size;
++
++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ // Give up immediately if this is something that we have no code to deal with
++ if (h->ps.sps->chroma_format_idc != 1) {
++ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
++ return AVERROR_PATCHWELCOME;
++ }
++ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
++ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
++ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
++ return AVERROR_PATCHWELCOME;
++ }
++
++ if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
++ av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
++ return (AVERROR(-ret)); // NOTE(review): correct only if devscan_build() returns a negative errno — confirm
++ }
++ ret = AVERROR(ENOMEM); // Assume mem fail by default for these
++
++ if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL)
++ {
++ av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n");
++ ret = AVERROR(ENODEV);
++ goto fail0;
++ }
++ av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
++ decdev_media_path(decdev), decdev_video_path(decdev));
++
++ if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n");
++ goto fail0;
++ }
++
++ if ((ctx->pq = pollqueue_new()) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
++ goto fail1;
++ }
++
++ if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n");
++ goto fail2;
++ }
++
++ if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n");
++ goto fail3;
++ }
++
++ // Ask for an initial bitbuf size of max size / 4
++ // We will realloc if we need more
++ // Must use sps->h/w as avctx contains cropped size
++ src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
++ if (mediabufs_src_resizable(ctx->mbufs))
++ src_size /= 4;
++ // Kludge for conformance tests which break Annex A limits
++ else if (src_size < 0x40000)
++ src_size = 0x40000;
++
++ if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt,
++ sps->width, sps->height, src_size)) {
++ char tbuf1[5];
++ av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
++ goto fail4;
++ }
++ // Probe from the newest API version down; the first probe returning 0 wins
++ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 4);
++ }
++ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 3);
++ }
++ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 2);
++ }
++ else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 1);
++ }
++ else {
++ av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
++ ret = AVERROR(EINVAL);
++ goto fail4;
++ }
++
++ if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
++ char tbuf1[5]; // NOTE(review): the message below prints the source fourcc although it reports a destination-format failure
++ av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
++ goto fail4;
++ }
++
++ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
++ goto fail4;
++ }
++
++ {
++ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering +
++ avctx->thread_count + (avctx->extra_hw_frames > 0 ? avctx->extra_hw_frames : 6);
++ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots,
++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
++ avctx->thread_count, avctx->extra_hw_frames);
++
++ // extra_hw_frames is -1 if unset
++ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
++ goto fail4;
++ }
++ }
++
++ if (mediabufs_stream_on(ctx->mbufs)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed stream on\n");
++ goto fail4;
++ }
++
++ if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n");
++ goto fail4;
++ }
++
++ if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed set controls\n");
++ goto fail5;
++ }
++
++ decode_q_init(&ctx->decode_q);
++
++ // Set our s/w format
++ avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
++
++ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n",
++ ctx->fns->name,
++ decdev_media_path(decdev), decdev_video_path(decdev));
++
++ return 0;
++
++fail5: // error unwind: each label releases one resource, in reverse order of creation; ret holds the error
++ av_buffer_unref(&avctx->hw_frames_ctx);
++fail4:
++ mediabufs_ctl_unref(&ctx->mbufs);
++fail3:
++ media_pool_delete(&ctx->mpool);
++fail2:
++ pollqueue_unref(&ctx->pq);
++fail1:
++ dmabufs_ctl_delete(&ctx->dbufs);
++fail0:
++ devscan_delete(&ctx->devscan);
++ return ret;
++}
++
++const AVHWAccel ff_hevc_v4l2request_hwaccel = { // hwaccel descriptor wiring the static functions of this file
++ .name = "hevc_v4l2request",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .pix_fmt = AV_PIX_FMT_DRM_PRIME,
++ .alloc_frame = v4l2_req_hevc_alloc_frame,
++ .start_frame = v4l2_req_hevc_start_frame,
++ .decode_slice = v4l2_req_hevc_decode_slice,
++ .end_frame = v4l2_req_hevc_end_frame,
++ .abort_frame = v4l2_req_hevc_abort_frame,
++ .init = v4l2_request_hevc_init,
++ .uninit = v4l2_request_hevc_uninit,
++ .priv_data_size = sizeof(V4L2RequestContextHEVC), // storage reached via avctx->internal->hwaccel_priv_data
++ .frame_params = v4l2_req_hevc_frame_params,
++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.h
+@@ -0,0 +1,101 @@
++#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
++#define AVCODEC_V4L2_REQUEST_HEVC_H
++
++#include <drm_fourcc.h>
++#include "v4l2_req_decode_q.h"
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++// P030 should be defined in drm_fourcc.h and hopefully will be sometime
++// in the future but until then...
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
++#endif
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++#include <linux/videodev2.h>
++#ifndef V4L2_CID_CODEC_BASE
++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in videodev2.h and hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
++#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY
++#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800
++#endif
++
++#define VCAT(name, version) name##_v##version
++#define V2(n,v) VCAT(n, v)
++#define V(n) V2(n, HEVC_CTRLS_VERSION)
++
++#define S2(x) #x
++#define STR(x) S2(x)
++
++// 1 per decoder
++struct v4l2_req_decode_fns;
++
++typedef struct V4L2RequestContextHEVC {
++// V4L2RequestContext base;
++ const struct v4l2_req_decode_fns * fns; // function table of the API version whose probe() succeeded in v4l2_request_hevc_init()
++
++ unsigned int timestamp; // ?? maybe uint64_t
++
++ int decode_mode;
++ int start_code;
++ unsigned int max_slices; // 0 => not wanted (frame mode)
++ unsigned int max_offsets; // 0 => not wanted
++
++ req_decode_q decode_q; // initialised last in v4l2_request_hevc_init(), waited on and torn down in uninit
++
++ struct devscan *devscan; // this and the members below are created in v4l2_request_hevc_init() and released in uninit
++ struct dmabufs_ctl *dbufs;
++ struct pollqueue *pq;
++ struct media_pool * mpool;
++ struct mediabufs_ctl *mbufs;
++} V4L2RequestContextHEVC;
++
++typedef struct v4l2_req_decode_fns {
++ int src_pix_fmt_v4l2; // V4L2 fourcc of the coded (source) format; read into a uint32_t by v4l2_request_hevc_init()
++ const char * name; // logged as "Hwaccel %s" once init succeeds
++
++ // Init setup
++ int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); // 0 => this API version is usable
++ int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); // 0 on success
++
++ // Passthrough of hwaccel fns
++ int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
++ int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
++ int (*end_frame)(AVCodecContext *avctx);
++ void (*abort_frame)(AVCodecContext *avctx);
++ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame);
++} v4l2_req_decode_fns;
++
++
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4);
++
++#endif
+--- a/libavcodec/vc1dec.c
++++ b/libavcodec/vc1dec.c
+@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCod
+ size = next - start - 4;
+ if (size <= 0)
+ continue;
+- buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
++ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+ init_get_bits(&gb, buf2, buf2_size * 8);
+ switch (AV_RB32(start)) {
+ case VC1_CODE_SEQHDR:
+@@ -689,7 +689,7 @@ static int vc1_decode_frame(AVCodecConte
+ case VC1_CODE_FRAME:
+ if (avctx->hwaccel)
+ buf_start = start;
+- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+ break;
+ case VC1_CODE_FIELD: {
+ int buf_size3;
+@@ -706,8 +706,8 @@ static int vc1_decode_frame(AVCodecConte
+ ret = AVERROR(ENOMEM);
+ goto err;
+ }
+- buf_size3 = vc1_unescape_buffer(start + 4, size,
+- slices[n_slices].buf);
++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
++ slices[n_slices].buf);
+ init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+ buf_size3 << 3);
+ slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
+@@ -718,7 +718,7 @@ static int vc1_decode_frame(AVCodecConte
+ break;
+ }
+ case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
+- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+ init_get_bits(&s->gb, buf2, buf_size2 * 8);
+ ff_vc1_decode_entry_point(avctx, v, &s->gb);
+ break;
+@@ -735,8 +735,8 @@ static int vc1_decode_frame(AVCodecConte
+ ret = AVERROR(ENOMEM);
+ goto err;
+ }
+- buf_size3 = vc1_unescape_buffer(start + 4, size,
+- slices[n_slices].buf);
++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
++ slices[n_slices].buf);
+ init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+ buf_size3 << 3);
+ slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
+@@ -770,7 +770,7 @@ static int vc1_decode_frame(AVCodecConte
+ ret = AVERROR(ENOMEM);
+ goto err;
+ }
+- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
+ init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+ buf_size3 << 3);
+ slices[n_slices].mby_start = s->mb_height + 1 >> 1;
+@@ -779,9 +779,9 @@ static int vc1_decode_frame(AVCodecConte
+ n_slices1 = n_slices - 1;
+ n_slices++;
+ }
+- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
+ } else {
+- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
+ }
+ init_get_bits(&s->gb, buf2, buf_size2*8);
+ } else
+--- a/libavcodec/vc1dsp.c
++++ b/libavcodec/vc1dsp.c
+@@ -32,6 +32,7 @@
+ #include "rnd_avg.h"
+ #include "vc1dsp.h"
+ #include "startcode.h"
++#include "vc1_common.h"
+
+ /* Apply overlap transform to horizontal edge */
+ static void vc1_v_overlap_c(uint8_t *src, int stride)
+@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContex
+ #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
+
+ dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
++ dsp->vc1_unescape_buffer = vc1_unescape_buffer;
+
+ if (ARCH_AARCH64)
+ ff_vc1dsp_init_aarch64(dsp);
+--- a/libavcodec/vc1dsp.h
++++ b/libavcodec/vc1dsp.h
+@@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
+ * one or more further zero bytes and a one byte.
+ */
+ int (*startcode_find_candidate)(const uint8_t *buf, int size);
++
++ /* Copy a buffer, removing startcode emulation escape bytes as we go */
++ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
+ } VC1DSPContext;
+
+ void ff_vc1dsp_init(VC1DSPContext* c);
+--- /dev/null
++++ b/libavcodec/weak_link.c
+@@ -0,0 +1,102 @@
++#include <stdlib.h>
++#include <pthread.h>
++#include <stdatomic.h>
++#include "weak_link.h"
++
++struct ff_weak_link_master {
++ atomic_int ref_count; /* 0 is single ref for easier atomics */
++ pthread_rwlock_t lock; // read-locked while a client uses ptr, write-locked to clear it
++ void * ptr; // target object; set to NULL by ff_weak_link_break()
++};
++
++static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c)
++{ // Client handles are master pointers under another type (see the cast in ff_weak_link_ref); cast back
++ return (struct ff_weak_link_master *)c;
++}
++
++// Create a weak-link master for p holding one reference (count 0); NULL on failure
++struct ff_weak_link_master * ff_weak_link_new(void * p)
++{
++    struct ff_weak_link_master * w = malloc(sizeof(*w));
++    if (!w || pthread_rwlock_init(&w->lock, NULL)) {
++        free(w); // free(NULL) is a no-op, so this covers both failures
++        return NULL;
++    }
++    atomic_init(&w->ref_count, 0); // malloc leaves the count indeterminate
++    w->ptr = p;
++    return w;
++}
++
++static void weak_link_do_unref(struct ff_weak_link_master * const w)
++{ // Drop one reference; destroy and free w when that was the last one
++ int n = atomic_fetch_sub(&w->ref_count, 1); // n is the value before the decrement
++ if (n)
++ return;
++ // n == 0 was the single-reference state, so nobody else holds w now
++ pthread_rwlock_destroy(&w->lock);
++ free(w);
++}
++
++// Unref & break link: clears the target pointer, zaps *ppLink and drops the master's reference
++void ff_weak_link_break(struct ff_weak_link_master ** ppLink)
++{
++ struct ff_weak_link_master * const w = *ppLink;
++ if (!w)
++ return;
++
++ *ppLink = NULL;
++ pthread_rwlock_wrlock(&w->lock); // waits until no client holds the read lock taken in ff_weak_link_lock()
++ w->ptr = NULL;
++ pthread_rwlock_unlock(&w->lock);
++
++ weak_link_do_unref(w);
++}
++
++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w)
++{ // Take a client reference on w; returns NULL if w is NULL
++ if (!w)
++ return NULL;
++ atomic_fetch_add(&w->ref_count, 1);
++ return (struct ff_weak_link_client*)w; // the client handle is the same object under another type
++}
++
++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink)
++{ // Drop a client reference and set *ppLink to NULL; a NULL *ppLink is ignored
++ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
++ if (!w)
++ return;
++
++ *ppLink = NULL;
++ weak_link_do_unref(w);
++}
++
++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink)
++{ // Return the target with the read lock held, or NULL (and unref + zap *ppLink) if the link is broken
++ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
++
++ if (!w)
++ return NULL;
++
++ if (pthread_rwlock_rdlock(&w->lock))
++ goto broken; // a failed lock attempt is treated like a broken link
++
++ if (w->ptr)
++ return w->ptr; // read lock stays held; release it with ff_weak_link_unlock()
++
++ pthread_rwlock_unlock(&w->lock);
++
++broken:
++ *ppLink = NULL;
++ weak_link_do_unref(w);
++ return NULL;
++}
++
++// Ignores a NULL c (so can be on the return path of both broken & live links)
++void ff_weak_link_unlock(struct ff_weak_link_client * c)
++{ // Releases the lock on c's master; NOTE(review): must only follow a non-NULL ff_weak_link_lock() result, otherwise no lock is held
++ struct ff_weak_link_master * const w = weak_link_x(c);
++ if (w)
++ pthread_rwlock_unlock(&w->lock);
++}
++
++
+--- /dev/null
++++ b/libavcodec/weak_link.h
+@@ -0,0 +1,23 @@
++struct ff_weak_link_master;
++struct ff_weak_link_client;
++
++struct ff_weak_link_master * ff_weak_link_new(void * p);
++void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
++
++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
++
++// Returns NULL if link broken - in this case it will also zap
++// *ppLink and unref the weak_link.
++// Returns NULL if *ppLink is NULL (so a link once broken stays broken)
++//
++// The above does mean that there is a race if this is called simultaneously
++// by two threads using the same weak_link_client (so don't do that)
++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink);
++void ff_weak_link_unlock(struct ff_weak_link_client * c);
++
++
++
++
++
++
+--- a/libavdevice/Makefile
++++ b/libavdevice/Makefile
+@@ -46,6 +46,9 @@ OBJS-$(CONFIG_SNDIO_OUTDEV)
+ OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o
+ OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o
+ OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o
++OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o
++OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o
++OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o
+ OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o
+ OBJS-$(CONFIG_XV_OUTDEV) += xv.o
+
+--- a/libavdevice/alldevices.c
++++ b/libavdevice/alldevices.c
+@@ -52,6 +52,9 @@ extern AVOutputFormat ff_sndio_muxer;
+ extern AVInputFormat ff_v4l2_demuxer;
+ extern AVOutputFormat ff_v4l2_muxer;
+ extern AVInputFormat ff_vfwcap_demuxer;
++extern AVOutputFormat ff_vout_drm_muxer;
++extern AVOutputFormat ff_vout_egl_muxer;
++extern AVOutputFormat ff_vout_rpi_muxer;
+ extern AVInputFormat ff_xcbgrab_demuxer;
+ extern AVOutputFormat ff_xv_muxer;
+
+--- /dev/null
++++ b/libavdevice/drm_vout.c
+@@ -0,0 +1,643 @@
++/*
++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++// *** This module is a work in progress and its utility is strictly
++// limited to testing.
++
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavformat/internal.h"
++#include "avdevice.h"
++
++#include "pthread.h"
++#include <semaphore.h>
++#include <unistd.h>
++
++#include <xf86drm.h>
++#include <xf86drmMode.h>
++
++#define TRACE_ALL 0
++
++#define DRM_MODULE "vc4"
++
++#define ERRSTR strerror(errno)
++
++struct drm_setup { // Output selection filled in by find_crtc() and do_display()
++ int conId; // connector id; 0 makes find_crtc() pick the first connector that has a CRTC
++ uint32_t crtcId; // CRTC id recorded together with the automatically chosen connector
++ int crtcIdx; // position of crtcId in drmModeRes.crtcs[]; tested as a bit in plane->possible_crtcs
++ uint32_t planeId; // plane selected by find_plane() for out_fourcc
++ unsigned int out_fourcc; // DRM format planeId was last selected for; 0 after init
++ struct {
++ int x, y, width, height; // copied from the CRTC; used as the destination rectangle in drmModeSetPlane()
++ } compose;
++};
++
++typedef struct drm_aux_s { // State kept for one frame handed to DRM; released by da_uninit()
++ unsigned int fb_handle; // framebuffer id from drmModeAddFB2WithModifiers(); 0 = none
++ uint32_t bo_handles[AV_DRM_MAX_PLANES]; // GEM handles from drmPrimeFDToHandle(); 0 = unused entry
++ AVFrame * frame; // frame reference owned by this slot until da_uninit() frees it
++} drm_aux_t;
++
++// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
++// we get initial flicker probably due to dodgy drm timing
++#define AUX_SIZE 3
++typedef struct drm_display_env_s // Private data of the vout_drm output device
++{
++ AVClass *class;
++
++ int drm_fd; // DRM device fd from drmOpen(); -1 when not open
++ uint32_t con_id; // connector id reported back by find_crtc()
++ struct drm_setup setup;
++ enum AVPixelFormat avfmt; // not referenced in the visible code
++ int show_all; // "show_all" option: when 0, write_packet drops a frame instead of waiting for the display thread
++
++ unsigned int ano; // index of the aux[] slot the next frame will use; wraps at AUX_SIZE
++ drm_aux_t aux[AUX_SIZE];
++
++ pthread_t q_thread; // display thread created in drm_vout_init()
++ sem_t q_sem_in; // posted when q_next has been filled, or to wake the thread for termination
++ sem_t q_sem_out; // posted by the display thread when q_next may be filled again
++ int q_terminate; // set by deinit before the final post on q_sem_in
++ AVFrame * q_next; // single-entry queue between write_packet and the display thread
++
++} drm_display_env_t;
++
++
++/* Trailer callback: this device has nothing to finalise, so it only traces and reports success. */
++static int drm_vout_write_trailer(AVFormatContext *s)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++    return 0;
++}
++
++/*
++ * Header callback: accept the mux only when it carries a single video stream
++ * of wrapped AVFrames.
++ *
++ * Returns 0 when the stream layout is acceptable, AVERROR(EINVAL) otherwise.
++ */
++static int drm_vout_write_header(AVFormatContext *s)
++{
++    const AVCodecParameters * const par = s->streams[0]->codecpar;
++    int acceptable;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++
++    acceptable = s->nb_streams <= 1 &&
++                 par->codec_type == AVMEDIA_TYPE_VIDEO &&
++                 par->codec_id == AV_CODEC_ID_WRAPPED_AVFRAME;
++    if (acceptable)
++        return 0;
++
++    av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++    return AVERROR(EINVAL);
++}
++
++/*
++ * Find a DRM plane that can be attached to the given CRTC and that supports
++ * the requested pixel format.
++ *
++ * avctx     - logging context
++ * drmfd     - open DRM device fd
++ * crtcidx   - index of the CRTC in the resource list (bit number in
++ *             plane->possible_crtcs)
++ * format    - DRM fourcc that the plane must support
++ * pplane_id - receives the id of the first matching plane; only written on
++ *             success
++ *
++ * Returns 0 when a plane was found, -1 otherwise (including when the plane
++ * list or an individual plane cannot be read).
++ */
++static int find_plane(struct AVFormatContext * const avctx,
++                      const int drmfd, const int crtcidx, const uint32_t format,
++                      uint32_t * const pplane_id)
++{
++    drmModePlaneResPtr planes;
++    unsigned int i;
++    int ret = -1;
++
++    planes = drmModeGetPlaneResources(drmfd);
++    if (!planes)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
++        return -1;
++    }
++
++    for (i = 0; i < planes->count_planes && ret != 0; ++i) {
++        drmModePlanePtr plane = drmModeGetPlane(drmfd, planes->planes[i]);
++        unsigned int j;
++
++        // Check the plane we just asked for (not the resource list) and
++        // leave ret at -1 so a failed lookup is never reported as a match
++        if (!plane)
++        {
++            av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
++            break;
++        }
++
++        if (plane->possible_crtcs & (1U << crtcidx)) {
++            for (j = 0; j < plane->count_formats; ++j) {
++                if (plane->formats[j] == format) {
++                    *pplane_id = plane->plane_id;
++                    ret = 0;
++                    break;
++                }
++            }
++        }
++
++        drmModeFreePlane(plane);
++    }
++
++    drmModeFreePlaneResources(planes);
++    return ret;
++}
++
++/*
++ * Release everything held by one aux slot: the DRM framebuffer, the GEM
++ * handles and the frame reference. Every released field is reset so the
++ * function may be called again on the same slot.
++ */
++static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
++{
++    unsigned int plane;
++
++    if (da->fb_handle != 0) {
++        drmModeRmFB(de->drm_fd, da->fb_handle);
++        da->fb_handle = 0;
++    }
++
++    for (plane = 0; plane < AV_DRM_MAX_PLANES; ++plane) {
++        uint32_t * const ph = da->bo_handles + plane;
++
++        if (*ph != 0) {
++            struct drm_gem_close gem_close = {.handle = *ph};
++            drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
++            *ph = 0;
++        }
++    }
++
++    av_frame_free(&da->frame);
++}
++
++/*
++ * Show one DRM_PRIME frame on the selected plane.
++ *
++ * The frame is imported into the current aux slot (de->ano), wrapped in a
++ * DRM framebuffer and set on the plane. Ownership of 'frame' is always taken:
++ * it is freed at once if no plane supports its format, otherwise it is kept
++ * in the aux slot until that slot is reused or torn down by da_uninit().
++ *
++ * Returns 0 on success, non-zero on failure. On a failure after the slot has
++ * been claimed, the slot index is not advanced, so the next call cleans up
++ * and reuses the same slot.
++ */
++static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)
++{
++    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
++    drm_aux_t * da = de->aux + de->ano;
++    const uint32_t format = desc->layers[0].format;
++    int ret = 0;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
++#endif
++
++    // (Re)select the plane whenever the incoming format changes
++    if (de->setup.out_fourcc != format) {
++        if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
++            av_frame_free(&frame);
++            av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
++            return -1;
++        }
++        de->setup.out_fourcc = format;
++    }
++
++    {
++        drmVBlank vbl = {
++            .request = {
++                .type = DRM_VBLANK_RELATIVE,
++                .sequence = 0
++            }
++        };
++
++        // Retry on EINTR only; any other failure is deliberately ignored
++        // and the frame is shown without the wait
++        while (drmWaitVBlank(de->drm_fd, &vbl)) {
++            if (errno != EINTR)
++                break;
++        }
++    }
++
++    // Drop whatever this slot displayed AUX_SIZE frames ago
++    da_uninit(de, da);
++
++    {
++        uint32_t pitches[4] = {0};
++        uint32_t offsets[4] = {0};
++        uint64_t modifiers[4] = {0};
++        uint32_t bo_handles[4] = {0};
++        const int max_planes = (int)(sizeof(pitches) / sizeof(pitches[0]));
++        int i, j, n;
++
++        // From here on the slot owns the frame, including on error returns
++        da->frame = frame;
++
++        if (desc->nb_objects < 0 || desc->nb_objects > AV_DRM_MAX_PLANES) {
++            av_log(s, AV_LOG_WARNING, "Bad object count in DRM descriptor: %d\n", desc->nb_objects);
++            return -1;
++        }
++
++        for (i = 0; i < desc->nb_objects; ++i) {
++            if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
++                av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
++                return -1;
++            }
++        }
++
++        // Flatten layers/planes into the per-plane arrays DRM expects
++        n = 0;
++        for (i = 0; i < desc->nb_layers; ++i) {
++            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
++                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
++
++                // The arrays passed to drmModeAddFB2WithModifiers hold 4
++                // entries; refuse descriptors that would overflow them or
++                // that point outside the imported objects
++                if (n >= max_planes ||
++                    p->object_index < 0 || p->object_index >= desc->nb_objects) {
++                    av_log(s, AV_LOG_WARNING, "Unsupported plane layout in DRM descriptor\n");
++                    return -1;
++                }
++
++                pitches[n] = p->pitch;
++                offsets[n] = p->offset;
++                modifiers[n] = desc->objects[p->object_index].format_modifier;
++                bo_handles[n] = da->bo_handles[p->object_index];
++                ++n;
++            }
++        }
++
++#if 1 && TRACE_ALL
++        av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
++               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
++               av_frame_cropped_width(frame),
++               av_frame_cropped_height(frame),
++               desc->layers[0].format,
++               bo_handles[0],
++               bo_handles[1],
++               bo_handles[2],
++               bo_handles[3],
++               pitches[0],
++               pitches[1],
++               pitches[2],
++               pitches[3],
++               offsets[0],
++               offsets[1],
++               offsets[2],
++               offsets[3],
++               (long long)modifiers[0],
++               (long long)modifiers[1],
++               (long long)modifiers[2],
++               (long long)modifiers[3]
++              );
++#endif
++
++        if (drmModeAddFB2WithModifiers(de->drm_fd,
++                                       av_frame_cropped_width(frame),
++                                       av_frame_cropped_height(frame),
++                                       desc->layers[0].format, bo_handles,
++                                       pitches, offsets, modifiers,
++                                       &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
++            av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
++            return -1;
++        }
++    }
++
++    // Source rectangle is in 16.16 fixed point, hence the shifts
++    ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
++                          da->fb_handle, 0,
++                          de->setup.compose.x, de->setup.compose.y,
++                          de->setup.compose.width,
++                          de->setup.compose.height,
++                          0, 0,
++                          av_frame_cropped_width(frame) << 16,
++                          av_frame_cropped_height(frame) << 16);
++
++    if (ret != 0) {
++        av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
++    }
++
++    de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;
++
++    return ret;
++}
++
++/*
++ * Wait on a semaphore, restarting when interrupted by a signal.
++ *
++ * sem    - semaphore to wait on
++ * nowait - non-zero to use sem_trywait() instead of blocking
++ *
++ * Returns 0 once the semaphore has been taken, or -errno for any failure
++ * other than EINTR (e.g. -EAGAIN when nowait is set and the count is zero).
++ */
++static int do_sem_wait(sem_t * const sem, const int nowait)
++{
++    for (;;) {
++        const int rv = nowait ? sem_trywait(sem) : sem_wait(sem);
++
++        if (rv == 0)
++            return 0;
++        if (errno != EINTR)
++            return -errno;
++    }
++}
++
++/*
++ * Display thread entry point; 'v' is the AVFormatContext.
++ *
++ * Handshake with drm_vout_write_packet(): q_sem_out is posted whenever the
++ * single-entry queue (q_next) is free, q_sem_in is waited on until a frame
++ * has been queued or termination has been requested. On termination all aux
++ * slots and any still-queued frame are released.
++ */
++static void * display_thread(void * v)
++{
++    AVFormatContext * const s = v;
++    drm_display_env_t * const de = s->priv_data;
++    drm_aux_t * da;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++#endif
++
++    // The queue starts out empty: let the writer in
++    sem_post(&de->q_sem_out);
++
++    do_sem_wait(&de->q_sem_in, 0);
++    while (!de->q_terminate) {
++        AVFrame * const frame = de->q_next;
++
++        // Free the queue before the (slow) display call
++        de->q_next = NULL;
++        sem_post(&de->q_sem_out);
++
++        do_display(s, de, frame);
++
++        do_sem_wait(&de->q_sem_in, 0);
++    }
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++#endif
++
++    for (da = de->aux; da != de->aux + AUX_SIZE; ++da)
++        da_uninit(de, da);
++
++    av_frame_free(&de->q_next);
++
++    return NULL;
++}
++
++/*
++ * Packet callback: pkt->data carries an AVFrame (wrapped avframe codec).
++ *
++ * A new reference to the frame (DRM_PRIME), or a DRM_PRIME mapping of it
++ * (VAAPI), is created and handed to the display thread through q_next.
++ * If the display thread has not yet freed the queue, the call either blocks
++ * (show_all set) or silently drops the frame (show_all clear).
++ *
++ * Returns 0 when the frame was queued, dropped or discarded as corrupt;
++ * AVERROR(EINVAL) for unsupported or unmappable frame formats;
++ * AVERROR(ENOMEM) or the av_frame_ref() error if the new reference cannot
++ * be created.
++ */
++static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++    const AVFrame * const src_frame = (AVFrame *)pkt->data;
++    AVFrame * frame;
++    drm_display_env_t * const de = s->priv_data;
++    int ret;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++
++    if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
++        av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
++        return 0;
++    }
++
++    if (src_frame->format != AV_PIX_FMT_DRM_PRIME &&
++        src_frame->format != AV_PIX_FMT_VAAPI) {
++        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
++        return AVERROR(EINVAL);
++    }
++
++    // Allocation can fail; do not dereference a NULL frame below
++    frame = av_frame_alloc();
++    if (frame == NULL)
++        return AVERROR(ENOMEM);
++
++    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
++        ret = av_frame_ref(frame, src_frame);
++        if (ret < 0) {
++            av_log(s, AV_LOG_WARNING, "Failed to reference frame: %s\n", av_err2str(ret));
++            av_frame_free(&frame);
++            return ret;
++        }
++    }
++    else {
++        frame->format = AV_PIX_FMT_DRM_PRIME;
++        if (av_hwframe_map(frame, src_frame, 0) != 0)
++        {
++            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
++            av_frame_free(&frame);
++            return AVERROR(EINVAL);
++        }
++    }
++
++    // With show_all clear this is a try-wait: a busy display thread means
++    // the frame is dropped rather than delaying the caller
++    ret = do_sem_wait(&de->q_sem_out, !de->show_all);
++    if (ret) {
++        av_frame_free(&frame);
++    }
++    else {
++        de->q_next = frame;
++        sem_post(&de->q_sem_in);
++    }
++
++    return 0;
++}
++
++/*
++ * Uncoded-frame callback. Both the capability query
++ * (AV_WRITE_UNCODED_FRAME_QUERY) and an actual write report success without
++ * doing any work here.
++ */
++static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++                                unsigned flags)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++#endif
++
++    /* drm_vout_write_header() should have accepted only supported formats,
++     * so there is nothing to distinguish between a query and a write */
++    return 0;
++}
++
++/*
++ * Application-to-device control messages. Only the window repaint request is
++ * accepted (as a no-op); every other message type yields AVERROR(ENOSYS).
++ */
++static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
++#endif
++
++    if (type == AV_APP_TO_DEV_WINDOW_REPAINT)
++        return 0;
++
++    return AVERROR(ENOSYS);
++}
++
++/*
++ * Work out which connector and CRTC to display on and record the CRTC's
++ * geometry as the composition rectangle.
++ *
++ * avctx  - logging context
++ * drmfd  - open DRM device fd
++ * s      - setup to fill in; if s->conId is 0 the first connector that has
++ *          an encoder with a CRTC is chosen and conId/crtcId are set from it
++ * pConId - if not NULL, receives the id of the connector finally used
++ *
++ * Returns 0 on success, -1 on any failure. Every libdrm object obtained here
++ * is released before returning.
++ */
++static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
++{
++    int ret = -1;
++    int i;
++    drmModeRes *res = drmModeGetResources(drmfd);
++    drmModeConnector *c;
++    drmModeCrtc *crtc;
++
++    if (!res)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drmModeGetResources failed: %s\n", ERRSTR);
++        return -1;
++    }
++
++    if (res->count_crtcs <= 0)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drm: no crts\n");
++        goto fail_res;
++    }
++
++    if (!s->conId) {
++        av_log(avctx, AV_LOG_INFO,
++               "No connector ID specified.  Choosing default from list:\n");
++
++        for (i = 0; i < res->count_connectors; i++) {
++            drmModeConnector *con =
++                drmModeGetConnector(drmfd, res->connectors[i]);
++            drmModeEncoder *enc = NULL;
++            drmModeCrtc *con_crtc = NULL;
++
++            // A connector that cannot be queried is simply not a candidate
++            if (!con)
++                continue;
++
++            if (con->encoder_id) {
++                enc = drmModeGetEncoder(drmfd, con->encoder_id);
++                if (enc && enc->crtc_id) {
++                    con_crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
++                }
++            }
++
++            if (!s->conId && con_crtc) {
++                s->conId = con->connector_id;
++                s->crtcId = con_crtc->crtc_id;
++            }
++
++            av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
++                   con->connector_id,
++                   con_crtc ? con_crtc->crtc_id : 0,
++                   con->connector_type,
++                   con_crtc ? con_crtc->width : 0,
++                   con_crtc ? con_crtc->height : 0,
++                   (s->conId == (int)con->connector_id ?
++                    " (chosen)" : ""));
++
++            // The list is only inspected; release each object again
++            if (con_crtc)
++                drmModeFreeCrtc(con_crtc);
++            if (enc)
++                drmModeFreeEncoder(enc);
++            drmModeFreeConnector(con);
++        }
++
++        if (!s->conId) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "No suitable enabled connector found.\n");
++            goto fail_res;
++        }
++    }
++
++    s->crtcIdx = -1;
++
++    for (i = 0; i < res->count_crtcs; ++i) {
++        if (s->crtcId == res->crtcs[i]) {
++            s->crtcIdx = i;
++            break;
++        }
++    }
++
++    if (s->crtcIdx == -1)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
++        goto fail_res;
++    }
++
++    if (res->count_connectors <= 0)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
++        goto fail_res;
++    }
++
++    c = drmModeGetConnector(drmfd, s->conId);
++    if (!c)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
++        goto fail_res;
++    }
++
++    if (!c->count_modes)
++    {
++        av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
++        goto fail_conn;
++    }
++
++    // The CRTC's current geometry becomes the rectangle frames are scaled to
++    crtc = drmModeGetCrtc(drmfd, s->crtcId);
++    if (!crtc)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drmModeGetCrtc failed: %s\n", ERRSTR);
++        goto fail_conn;
++    }
++    s->compose.x = crtc->x;
++    s->compose.y = crtc->y;
++    s->compose.width = crtc->width;
++    s->compose.height = crtc->height;
++    drmModeFreeCrtc(crtc);
++
++    if (pConId)
++        *pConId = c->connector_id;
++    ret = 0;
++
++fail_conn:
++    drmModeFreeConnector(c);
++
++fail_res:
++    drmModeFreeResources(res);
++
++    return ret;
++}
++
++/*
++ * Muxer init: open the DRM device, pick connector/CRTC and start the display
++ * thread.
++ *
++ * deinit is called if init fails so no further cleanup is attempted here
++ * beyond closing the device; on every failure path drm_fd is left at -1.
++ *
++ * Returns 0 on success or a negative AVERROR code.
++ */
++static int drm_vout_init(struct AVFormatContext * s)
++{
++    drm_display_env_t * const de = s->priv_data;
++    int rv;
++    const char * drm_module = DRM_MODULE;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    de->drm_fd = -1;
++    de->con_id = 0;
++    de->setup = (struct drm_setup){0};
++    de->q_terminate = 0;
++
++    if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0)
++    {
++        rv = AVERROR(errno);
++        av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv));
++        return rv;
++    }
++
++    if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0)
++    {
++        av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
++        rv = AVERROR(EINVAL);
++        goto fail_close;
++    }
++
++    sem_init(&de->q_sem_in, 0, 0);
++    sem_init(&de->q_sem_out, 0, 0);
++
++    // pthread_create() reports failure through its return value and does
++    // not set errno, so the error code must be taken from there
++    rv = pthread_create(&de->q_thread, NULL, display_thread, s);
++    if (rv != 0) {
++        rv = AVERROR(rv);
++        av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv));
++        goto fail_close;
++    }
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++
++    return 0;
++
++fail_close:
++    close(de->drm_fd);
++    de->drm_fd = -1;
++    av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
++
++    return rv;
++}
++
++/*
++ * Muxer deinit: stop the display thread and release all DRM state.
++ *
++ * This also runs after a failed drm_vout_init(). Init leaves drm_fd at -1 on
++ * every failure path, and in that state the display thread was never
++ * started, so there is nothing to post to or join; attempting it would
++ * operate on an uninitialised semaphore and thread id.
++ */
++static void drm_vout_deinit(struct AVFormatContext * s)
++{
++    drm_display_env_t * const de = s->priv_data;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    if (de->drm_fd < 0) {
++        // Init failed (or deinit already ran): no thread, no open device
++        av_frame_free(&de->q_next);
++        av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++        return;
++    }
++
++    // Wake the thread with the terminate flag set and wait for it to exit
++    de->q_terminate = 1;
++    sem_post(&de->q_sem_in);
++    pthread_join(de->q_thread, NULL);
++    sem_destroy(&de->q_sem_in);
++    sem_destroy(&de->q_sem_out);
++
++    // The thread releases these on exit as well; both calls are safe to repeat
++    for (unsigned int i = 0; i != AUX_SIZE; ++i)
++        da_uninit(de, de->aux + i);
++
++    av_frame_free(&de->q_next);
++
++    close(de->drm_fd);
++    de->drm_fd = -1;
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++}
++
++
++#define OFFSET(x) offsetof(drm_display_env_t, x)
++static const AVOption options[] = { // User options, stored directly in drm_display_env_t
++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { NULL }
++};
++
++static const AVClass drm_vout_class = {
++ .class_name = "drm vid outdev",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++AVOutputFormat ff_vout_drm_muxer = { // Registered in alldevices.c as the "vout_drm" output device
++ .name = "vout_drm",
++ .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"),
++ .priv_data_size = sizeof(drm_display_env_t),
++ .audio_codec = AV_CODEC_ID_NONE,
++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, // frames arrive as AVFrame pointers in pkt->data
++ .write_header = drm_vout_write_header,
++ .write_packet = drm_vout_write_packet,
++ .write_uncoded_frame = drm_vout_write_frame,
++ .write_trailer = drm_vout_write_trailer,
++ .control_message = drm_vout_control_message,
++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++ .priv_class = &drm_vout_class,
++ .init = drm_vout_init,
++ .deinit = drm_vout_deinit,
++};
++
+--- /dev/null
++++ b/libavdevice/egl_vout.c
+@@ -0,0 +1,816 @@
++/*
++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++// *** This module is a work in progress and its utility is strictly
++// limited to testing.
++// Amongst other issues it doesn't wait for the pic to be displayed before
++// returning the buffer so flickering does occur.
++
++#include <epoxy/gl.h>
++#include <epoxy/egl.h>
++
++#include "libavutil/opt.h"
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/imgutils.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavformat/internal.h"
++#include "avdevice.h"
++
++#include "pthread.h"
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <unistd.h>
++
++#include <X11/Xlib.h>
++#include <X11/Xutil.h>
++
++#include "libavutil/rpi_sand_fns.h"
++
++#define TRACE_ALL 0
++
++struct egl_setup { // X11 / EGL handles created by the display thread
++ int conId; // not referenced in the visible code
++
++ Display *dpy; // X display from XOpenDisplay(NULL)
++ EGLDisplay egl_dpy; // EGL display obtained for dpy
++ EGLContext ctx; // GLES2 context returned by make_window()
++ EGLSurface surf; // window surface; swapped after each frame is drawn
++ Window win; // X window returned by make_window()
++
++ uint32_t crtcId; // not referenced in the visible code
++ int crtcIdx; // not referenced in the visible code
++ uint32_t planeId; // not referenced in the visible code
++ struct {
++ int x, y, width, height; // not referenced in the visible code
++ } compose;
++};
++
++typedef struct egl_aux_s { // One texture slot used by do_display()
++ int fd; // dma-buf fd the slot is bound to; -1 = free
++ GLuint texture; // GL texture name; 0 = none created
++
++} egl_aux_t;
++
++typedef struct egl_display_env_s // Private data of the EGL output device
++{
++ AVClass *class;
++
++ struct egl_setup setup;
++ enum AVPixelFormat avfmt; // not referenced in the visible code
++
++ int show_all; // when set, write_packet waits for the pending frame to be taken instead of replacing it
++ int window_width, window_height; // requested window size; 1280x720 is used when either is 0
++ int window_x, window_y; // requested window position
++ int fullscreen; // when set, make_window() uses the full screen size and removes the border
++
++ egl_aux_t aux[32];
++
++ pthread_t q_thread; // display thread
++ pthread_mutex_t q_lock; // taken around the q_next hand-over
++ sem_t display_start_sem; // posted by the display thread once setup has succeeded or failed
++ sem_t q_sem; // posted by write_packet when it fills an empty q_next
++ int q_terminate; // thread exit request; also set by the thread itself if setup fails
++ AVFrame * q_this; // frame most recently displayed, kept until the next one replaces it
++ AVFrame * q_next; // pending frame waiting for the display thread
++
++} egl_display_env_t;
++
++
++/**
++ * Remove window border/decorations.
++ *
++ * Sets the _MOTIF_WM_HINTS property on the window with the "decorations"
++ * flag selected and a decorations value of 0. Does nothing if the atom does
++ * not already exist on the server (XInternAtom is called with
++ * only_if_exists = True).
++ *
++ * @param dpy  X display connection
++ * @param w    window to modify
++ */
++static void
++no_border( Display *dpy, Window w)
++{
++    static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
++    static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
++
++    typedef struct
++    {
++        unsigned long       flags;
++        unsigned long       functions;
++        unsigned long       decorations;
++        long                inputMode;
++        unsigned long       status;
++    } PropMotifWmHints;
++
++    /* All five elements are transmitted to the server below, so the ones
++     * that are not explicitly set must be zero rather than stack garbage */
++    PropMotifWmHints motif_hints = {0};
++    Atom prop;
++
++    /* setup the property */
++    motif_hints.flags = MWM_HINTS_DECORATIONS;
++    motif_hints.decorations = 0;
++
++    /* get the atom for the property */
++    prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True );
++    if (!prop) {
++        /* something went wrong! */
++        return;
++    }
++
++    /* The property atom is also used as the property type: not sure this
++     * is correct, seems to work, XA_WM_HINTS didn't work */
++    XChangeProperty( dpy, w,                         /* display, window */
++                     prop, prop,                     /* property, type */
++                     32,                             /* format: 32-bit datums */
++                     PropModeReplace,                /* mode */
++                     (unsigned char *) &motif_hints, /* data */
++                     PROP_MOTIF_WM_HINTS_ELEMENTS    /* nelements */
++                   );
++}
++
++
++/*
++ * Create an X window with a GLES2 context and window surface, and make the
++ * context current on the calling thread.
++ *
++ * s        - logging context
++ * de       - supplies the requested geometry and fullscreen flag; a zero
++ *            width/height falls back to 1280x720, fullscreen uses the
++ *            default screen's size at position 0,0 with no border
++ * dpy      - X display
++ * egl_dpy  - initialised EGL display
++ * name     - window and icon name
++ * winRet, ctxRet, surfRet - receive the window, context and surface handles;
++ *            only written on success
++ *
++ * Returns 0 on success, -1 on failure. On failure everything created here
++ * that has a handle (window, context, surface) is destroyed again.
++ */
++static int
++make_window(struct AVFormatContext * const s,
++            egl_display_env_t * const de,
++            Display *dpy, EGLDisplay egl_dpy, const char *name,
++            Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
++{
++    const int scrnum = DefaultScreen( dpy );
++    XSetWindowAttributes attr;
++    unsigned long mask;
++    Window root = RootWindow( dpy, scrnum );
++    Window win;
++    EGLContext ctx;
++    EGLSurface surf;
++    const int fullscreen = de->fullscreen;
++    EGLConfig config;
++    int x = de->window_x;
++    int y = de->window_y;
++    int width = de->window_width ? de->window_width : 1280;
++    int height = de->window_height ? de->window_height : 720;
++
++    if (fullscreen) {
++        x = 0; y = 0;
++        width = DisplayWidth(dpy, scrnum);
++        height = DisplayHeight(dpy, scrnum);
++    }
++
++    {
++        EGLint num_configs = 0;
++        static const EGLint attribs[] = {
++            EGL_RED_SIZE, 1,
++            EGL_GREEN_SIZE, 1,
++            EGL_BLUE_SIZE, 1,
++            EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
++            EGL_NONE
++        };
++
++        /* eglChooseConfig can succeed while matching nothing, in which case
++         * 'config' is not written */
++        if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs) ||
++            num_configs < 1) {
++            av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
++            return -1;
++        }
++    }
++
++    {
++        EGLint vid;
++        if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
++            av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
++            return -1;
++        }
++
++        {
++            XVisualInfo visTemplate = {
++                .visualid = vid,
++            };
++            int num_visuals;
++            XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
++                                                  &visTemplate, &num_visuals);
++
++            /* No X visual matches the EGL config's native visual id */
++            if (!visinfo) {
++                av_log(s, AV_LOG_ERROR, "Error: couldn't get X visual\n");
++                return -1;
++            }
++
++            /* window attributes */
++            attr.background_pixel = 0;
++            attr.border_pixel = 0;
++            attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone);
++            attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
++            /* XXX this is a bad way to get a borderless window! */
++            mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
++
++            win = XCreateWindow( dpy, root, x, y, width, height,
++                                 0, visinfo->depth, InputOutput,
++                                 visinfo->visual, mask, &attr );
++            XFree(visinfo);
++        }
++    }
++
++    if (fullscreen)
++        no_border(dpy, win);
++
++    /* set hints and properties */
++    {
++        XSizeHints sizehints;
++        sizehints.x = x;
++        sizehints.y = y;
++        sizehints.width  = width;
++        sizehints.height = height;
++        sizehints.flags = USSize | USPosition;
++        XSetNormalHints(dpy, win, &sizehints);
++        XSetStandardProperties(dpy, win, name, name,
++                               None, (char **)NULL, 0, &sizehints);
++    }
++
++    eglBindAPI(EGL_OPENGL_ES_API);
++
++    {
++        static const EGLint ctx_attribs[] = {
++            EGL_CONTEXT_CLIENT_VERSION, 2,
++            EGL_NONE
++        };
++        ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs );
++        if (ctx == EGL_NO_CONTEXT) {
++            av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
++            goto fail_win;
++        }
++    }
++
++    XMapWindow(dpy, win);
++
++    surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
++    if (surf == EGL_NO_SURFACE) {
++        av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
++        goto fail_ctx;
++    }
++
++    if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
++        /* Name the call that actually failed */
++        av_log(s, AV_LOG_ERROR, "Error: eglMakeCurrent failed\n");
++        goto fail_surf;
++    }
++
++    *winRet = win;
++    *ctxRet = ctx;
++    *surfRet = surf;
++
++    return 0;
++
++fail_surf:
++    eglDestroySurface(egl_dpy, surf);
++fail_ctx:
++    eglDestroyContext(egl_dpy, ctx);
++fail_win:
++    XDestroyWindow(dpy, win);
++    return -1;
++}
++
++/*
++ * Create and compile one GLSL shader.
++ *
++ * avctx  - logging context
++ * target - shader type passed to glCreateShader()
++ * source - NUL-terminated shader source
++ *
++ * Returns the shader name, or 0 on failure. On a compile failure the info
++ * log (if any) and the source are written to the log and the shader object
++ * is deleted again.
++ */
++static GLint
++compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source)
++{
++    GLuint s = glCreateShader(target);
++    GLint ok;
++
++    if (s == 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
++        return 0;
++    }
++
++    glShaderSource(s, 1, (const GLchar **) &source, NULL);
++    glCompileShader(s);
++
++    glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
++    if (!ok) {
++        GLchar *info = NULL;
++        GLint size = 0;
++
++        /* A size of 0 or 1 means there is no log text to fetch */
++        glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
++        if (size > 1 && (info = malloc(size)) != NULL)
++            glGetShaderInfoLog(s, size, NULL, info);
++
++        av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n",
++               info != NULL ? info : "<empty log>\n", source);
++
++        /* Neither the log buffer nor the failed shader is of further use */
++        free(info);
++        glDeleteShader(s);
++        return 0;
++    }
++
++    return s;
++}
++
++/*
++ * Create a GL program from a vertex and a fragment shader and link it.
++ *
++ * s  - logging context
++ * vs - compiled vertex shader name
++ * fs - compiled fragment shader name
++ *
++ * Returns the program name, or 0 on failure. On a link failure the info log
++ * is written to the log and the program object is deleted again.
++ */
++static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs)
++{
++    GLuint prog = glCreateProgram();
++    GLint ok;
++
++    if (prog == 0) {
++        av_log(s, AV_LOG_ERROR, "Failed to create program\n");
++        return 0;
++    }
++
++    glAttachShader(prog, vs);
++    glAttachShader(prog, fs);
++    glLinkProgram(prog);
++
++    glGetProgramiv(prog, GL_LINK_STATUS, &ok);
++    if (!ok) {
++        /* Some drivers return a size of 1 for an empty log.  This is the size
++         * of a log that contains only a terminating NUL character.
++         */
++        GLint size = 0;
++        GLchar *info = NULL;
++
++        glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
++        if (size > 1 && (info = malloc(size)) != NULL)
++            glGetProgramInfoLog(prog, size, NULL, info);
++
++        av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
++               (info != NULL) ? info : "<empty log>");
++
++        /* Neither the log buffer nor the failed program is of further use */
++        free(info);
++        glDeleteProgram(prog);
++        return 0;
++    }
++
++    return prog;
++}
++
++/*
++ * One-time GL state setup for the display thread: build the shader program
++ * that samples an external (EGL image) texture over the whole viewport and
++ * install a four-vertex quad as vertex attribute 0.
++ *
++ * Returns 0 on success, -1 if a shader fails to compile or the program
++ * fails to link.
++ */
++static int
++gl_setup(struct AVFormatContext * const s)
++{
++    /* Quad covering clip space; drawn as a triangle fan by do_display() */
++    static const float verts[] = {
++        -1, -1,
++        1, -1,
++        1, 1,
++        -1, 1,
++    };
++    /* Texture coordinates are derived from position, with y flipped */
++    const char *vs =
++        "attribute vec4 pos;\n"
++        "varying vec2 texcoord;\n"
++        "\n"
++        "void main() {\n"
++        "  gl_Position = pos;\n"
++        "  texcoord.x = (pos.x + 1.0) / 2.0;\n"
++        "  texcoord.y = (-pos.y + 1.0) / 2.0;\n"
++        "}\n";
++    const char *fs =
++        "#extension GL_OES_EGL_image_external : enable\n"
++        "precision mediump float;\n"
++        "uniform samplerExternalOES s;\n"
++        "varying vec2 texcoord;\n"
++        "void main() {\n"
++        "  gl_FragColor = texture2D(s, texcoord);\n"
++        "}\n";
++
++    GLuint vs_s;
++    GLuint fs_s;
++    GLuint prog;
++
++    vs_s = compile_shader(s, GL_VERTEX_SHADER, vs);
++    if (!vs_s)
++        return -1;
++
++    fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs);
++    if (!fs_s)
++        return -1;
++
++    prog = link_program(s, vs_s, fs_s);
++    if (!prog)
++        return -1;
++
++    glUseProgram(prog);
++
++    glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
++    glEnableVertexAttribArray(0);
++
++    return 0;
++}
++
++/* Trailer callback: this device has nothing to finalise, so it only traces and reports success. */
++static int egl_vout_write_trailer(AVFormatContext *s)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++    return 0;
++}
++
++/*
++ * Header callback: accept the mux only when it carries a single video stream
++ * of wrapped AVFrames.
++ *
++ * Returns 0 when the stream layout is acceptable, AVERROR(EINVAL) otherwise.
++ */
++static int egl_vout_write_header(AVFormatContext *s)
++{
++    const AVCodecParameters * const par = s->streams[0]->codecpar;
++    int acceptable;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++
++    acceptable = s->nb_streams <= 1 &&
++                 par->codec_type == AVMEDIA_TYPE_VIDEO &&
++                 par->codec_id == AV_CODEC_ID_WRAPPED_AVFRAME;
++    if (acceptable)
++        return 0;
++
++    av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++    return AVERROR(EINVAL);
++}
++
++
++/*
++ * Draw one DRM_PRIME frame into the window.
++ *
++ * The frame's dma-bufs are imported as an EGL image, bound to an external
++ * texture, drawn over the whole viewport and the buffers are swapped. The
++ * texture is deleted again and its slot freed before returning, so nothing
++ * is cached between frames. The frame itself is not freed here.
++ *
++ * Returns 0 on success, AVERROR(EINVAL) if no texture slot is free or the
++ * frame has more planes than can be described to EGL, -1 if the import
++ * fails.
++ */
++static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame)
++{
++    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
++    egl_aux_t * da = NULL;
++    unsigned int i;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
++#endif
++
++    /* Take the first slot that is free or already bound to this buffer */
++    for (i = 0; i != sizeof(de->aux) / sizeof(de->aux[0]); ++i) {
++        if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) {
++            da = de->aux + i;
++            break;
++        }
++    }
++
++    if (da == NULL) {
++        av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__);
++        return AVERROR(EINVAL);
++    }
++
++    if (da->texture == 0) {
++        EGLint attribs[50];
++        EGLint * a = attribs;
++        int layer, plane;
++        int n_planes = 0;
++        /* Five attribute names per plane: fd, offset, pitch, modifier lo/hi */
++        static const EGLint anames[] = {
++            EGL_DMA_BUF_PLANE0_FD_EXT,
++            EGL_DMA_BUF_PLANE0_OFFSET_EXT,
++            EGL_DMA_BUF_PLANE0_PITCH_EXT,
++            EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
++            EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
++            EGL_DMA_BUF_PLANE1_FD_EXT,
++            EGL_DMA_BUF_PLANE1_OFFSET_EXT,
++            EGL_DMA_BUF_PLANE1_PITCH_EXT,
++            EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
++            EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
++            EGL_DMA_BUF_PLANE2_FD_EXT,
++            EGL_DMA_BUF_PLANE2_OFFSET_EXT,
++            EGL_DMA_BUF_PLANE2_PITCH_EXT,
++            EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
++            EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
++        };
++        const int max_planes = (int)(sizeof(anames) / sizeof(anames[0]) / 5);
++        const EGLint * b = anames;
++
++        /* The name table only covers max_planes planes; reading past it
++         * would feed arbitrary memory to EGL as attribute names */
++        for (layer = 0; layer < desc->nb_layers; ++layer)
++            n_planes += desc->layers[layer].nb_planes;
++        if (n_planes > max_planes) {
++            av_log(s, AV_LOG_ERROR, "%s: Too many planes (%d)\n", __func__, n_planes);
++            return AVERROR(EINVAL);
++        }
++
++        *a++ = EGL_WIDTH;
++        *a++ = av_frame_cropped_width(frame);
++        *a++ = EGL_HEIGHT;
++        *a++ = av_frame_cropped_height(frame);
++        *a++ = EGL_LINUX_DRM_FOURCC_EXT;
++        *a++ = desc->layers[0].format;
++
++        for (layer = 0; layer < desc->nb_layers; ++layer) {
++            for (plane = 0; plane < desc->layers[layer].nb_planes; ++plane) {
++                const AVDRMPlaneDescriptor * const p = desc->layers[layer].planes + plane;
++                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
++                *a++ = *b++;
++                *a++ = obj->fd;
++                *a++ = *b++;
++                *a++ = p->offset;
++                *a++ = *b++;
++                *a++ = p->pitch;
++                if (obj->format_modifier == 0) {
++                    /* No modifier: skip this plane's two modifier names */
++                    b += 2;
++                }
++                else {
++                    *a++ = *b++;
++                    *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
++                    *a++ = *b++;
++                    *a++ = (EGLint)(obj->format_modifier >> 32);
++                }
++            }
++        }
++
++        *a = EGL_NONE;
++
++#if TRACE_ALL
++        for (a = attribs, layer = 0; *a != EGL_NONE; a += 2, ++layer) {
++            av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", layer, a[0], a[1]);
++        }
++#endif
++        {
++            const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
++                                                     EGL_NO_CONTEXT,
++                                                     EGL_LINUX_DMA_BUF_EXT,
++                                                     NULL, attribs);
++            if (!image) {
++                av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
++                return -1;
++            }
++
++            glGenTextures(1, &da->texture);
++            glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
++            glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
++            glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
++            glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
++
++            /* The texture keeps its own reference to the image contents */
++            eglDestroyImageKHR(de->setup.egl_dpy, image);
++        }
++
++        da->fd = desc->objects[0].fd;
++    }
++
++    glClearColor(0.5, 0.5, 0.5, 0.5);
++    glClear(GL_COLOR_BUFFER_BIT);
++
++    glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
++    glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
++    eglSwapBuffers(de->setup.egl_dpy, de->setup.surf);
++
++    /* Nothing is cached: drop the texture and free the slot straight away */
++    glDeleteTextures(1, &da->texture);
++    da->texture = 0;
++    da->fd = -1;
++
++    return 0;
++}
++
++/*
++ * Display thread entry point; 'v' is the AVFormatContext.
++ *
++ * Opens the X display, initialises EGL, creates the window and GL state,
++ * then posts display_start_sem and loops: wait on q_sem, take the pending
++ * frame under q_lock, draw it, and keep it as q_this (freeing the previous
++ * q_this). If any setup step fails, q_terminate is set and
++ * display_start_sem is posted so a waiter can see the failure.
++ */
++static void * display_thread(void * v)
++{
++    AVFormatContext * const s = v;
++    egl_display_env_t * const de = s->priv_data;
++    EGLint egl_major, egl_minor;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
++#endif
++
++    de->setup.dpy = XOpenDisplay(NULL);
++    if (!de->setup.dpy) {
++        av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
++        goto fail;
++    }
++
++    de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
++    if (!de->setup.egl_dpy) {
++        av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
++        goto fail;
++    }
++
++    if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
++        av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
++        goto fail;
++    }
++
++    av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
++
++    if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
++        av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
++        goto fail;
++    }
++
++    /* Default window size when either dimension was left unset */
++    if (de->window_width == 0 || de->window_height == 0) {
++        de->window_width = 1280;
++        de->window_height = 720;
++    }
++
++    if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
++                    &de->setup.win, &de->setup.ctx, &de->setup.surf) != 0) {
++        av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
++        goto fail;
++    }
++
++    if (gl_setup(s) != 0) {
++        av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
++        goto fail;
++    }
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
++#endif
++    sem_post(&de->display_start_sem);
++
++    for (;;) {
++        AVFrame * pending;
++        int rv;
++
++        /* Only an interrupted wait is tolerated */
++        do {
++            rv = sem_wait(&de->q_sem);
++            if (rv != 0)
++                av_assert0(errno == EINTR);
++        } while (rv != 0);
++
++        if (de->q_terminate)
++            break;
++
++        pthread_mutex_lock(&de->q_lock);
++        pending = de->q_next;
++        de->q_next = NULL;
++        pthread_mutex_unlock(&de->q_lock);
++
++        do_display(s, de, pending);
++
++        /* Hold the frame just shown; release the one before it */
++        av_frame_free(&de->q_this);
++        de->q_this = pending;
++    }
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
++#endif
++
++    return NULL;
++
++fail:
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
++#endif
++    de->q_terminate = 1;
++    sem_post(&de->display_start_sem);
++
++    return NULL;
++}
++
++/*
++ * Packet callback: pkt->data carries an AVFrame (wrapped avframe codec).
++ *
++ * A new reference to the frame (DRM_PRIME), or a DRM_PRIME mapping of it
++ * (VAAPI), becomes the pending frame (q_next). If a frame was already
++ * pending it is replaced and freed, unless show_all is set, in which case
++ * the call first polls until the display thread has taken the pending frame.
++ *
++ * Returns 0 on success; AVERROR(EINVAL) for unsupported or unmappable frame
++ * formats; AVERROR(ENOMEM) or the av_frame_ref() error if the new reference
++ * cannot be created.
++ */
++static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++    const AVFrame * const src_frame = (AVFrame *)pkt->data;
++    AVFrame * frame;
++    egl_display_env_t * const de = s->priv_data;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++
++    if (src_frame->format != AV_PIX_FMT_DRM_PRIME &&
++        src_frame->format != AV_PIX_FMT_VAAPI) {
++        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
++        return AVERROR(EINVAL);
++    }
++
++    // Allocation can fail; do not dereference a NULL frame below
++    frame = av_frame_alloc();
++    if (frame == NULL)
++        return AVERROR(ENOMEM);
++
++    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
++        const int ret = av_frame_ref(frame, src_frame);
++        if (ret < 0) {
++            av_log(s, AV_LOG_WARNING, "Failed to reference frame: %s\n", av_err2str(ret));
++            av_frame_free(&frame);
++            return ret;
++        }
++    }
++    else {
++        frame->format = AV_PIX_FMT_DRM_PRIME;
++        if (av_hwframe_map(frame, src_frame, 0) != 0)
++        {
++            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
++            av_frame_free(&frame);
++            return AVERROR(EINVAL);
++        }
++    }
++
++    // Really hacky sync
++    while (de->show_all && de->q_next) {
++       usleep(3000);
++    }
++
++    // Swap the new frame in; 'frame' then holds whatever was pending before
++    pthread_mutex_lock(&de->q_lock);
++    {
++        AVFrame * const t = de->q_next;
++        de->q_next = frame;
++        frame = t;
++    }
++    pthread_mutex_unlock(&de->q_lock);
++
++    // Only wake the display thread when the queue went from empty to full;
++    // a replaced frame has already been signalled and is simply dropped
++    if (frame == NULL)
++        sem_post(&de->q_sem);
++    else
++        av_frame_free(&frame);
++
++    return 0;
++}
++
++static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++                                unsigned flags)
++{
++    // Uncoded-frame entry point. Nothing is displayed from here: the result is 0
++    // whether or not the call is only an AV_WRITE_UNCODED_FRAME_QUERY probe.
++    const int rv = 0;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++#endif
++
++    return rv;
++}
++
++static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++    // Handle an application-to-device message.
++    // Only the repaint request is accepted (as a no-op); every other type
++    // is rejected with AVERROR(ENOSYS).
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
++#endif
++    if (type == AV_APP_TO_DEV_WINDOW_REPAINT)
++        return 0;
++
++    return AVERROR(ENOSYS);
++}
++
++// deinit is called if init fails so no need to clean up explicitly here
++static int egl_vout_init(struct AVFormatContext * s)
++{   // Set up the queue primitives and start the display thread; 0 on success, negative on failure
++    egl_display_env_t * const de = s->priv_data;
++    unsigned int i;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    de->setup = (struct egl_setup){0};
++    for (i = 0; i != 32; ++i)
++        de->aux[i].fd = -1;
++
++    de->q_terminate = 0;
++    pthread_mutex_init(&de->q_lock, NULL);
++    sem_init(&de->q_sem, 0, 0);
++    sem_init(&de->display_start_sem, 0, 0);
++    av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
++
++    // display_thread posts display_start_sem once its setup has either succeeded or
++    // failed. sem_wait can return early on a signal, so retry on EINTR as that thread does.
++    while (sem_wait(&de->display_start_sem) != 0)
++        av_assert0(errno == EINTR);
++    if (de->q_terminate) {
++        av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
++        return -1;
++    }
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++    return 0;
++}
++
++static void egl_vout_deinit(struct AVFormatContext * s)
++{
++    // Stop the display thread, then free what egl_vout_init() created
++    egl_display_env_t * const de = s->priv_data;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    de->q_terminate = 1;                  // Seen by display_thread after its next q_sem wake
++    sem_post(&de->q_sem);
++    pthread_join(de->q_thread, NULL);
++    sem_destroy(&de->q_sem);
++    sem_destroy(&de->display_start_sem);  // Pairs with the sem_init() in egl_vout_init()
++    pthread_mutex_destroy(&de->q_lock);
++    av_frame_free(&de->q_next);
++    av_frame_free(&de->q_this);
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++}
++
++#define OFFSET(x) offsetof(egl_display_env_t, x)
++static const AVOption options[] = { // Private options of the vout_egl device; each entry stores into egl_display_env_t at OFFSET(field)
++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { NULL }
++ // NOTE(review): window_size is anchored at window_width only - presumably window_height directly follows it in egl_display_env_t; confirm
++};
++
++static const AVClass egl_vout_class = { // AVClass of the device's private context; ties the options[] table above to the device
++ .class_name = "egl vid outdev",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++AVOutputFormat ff_vout_egl_muxer = { // Registration record for the "vout_egl" output device
++ .name = "vout_egl",
++ .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"),
++ .priv_data_size = sizeof(egl_display_env_t),
++ .audio_codec = AV_CODEC_ID_NONE,
++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, // write_packet treats pkt->data as an AVFrame
++ .write_header = egl_vout_write_header,
++ .write_packet = egl_vout_write_packet,
++ .write_uncoded_frame = egl_vout_write_frame,
++ .write_trailer = egl_vout_write_trailer,
++ .control_message = egl_vout_control_message,
++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++ .priv_class = &egl_vout_class,
++ .init = egl_vout_init, // Starts the display thread
++ .deinit = egl_vout_deinit, // Stops the display thread and frees queued frames
++};
++
+--- /dev/null
++++ b/libavdevice/rpi_vout.c
+@@ -0,0 +1,534 @@
++/*
++ * Copyright (c) 2013 Jeff Moguillansky
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * Raspberry Pi (MMAL) video output device
++ *
++ * TODO:
++ * - add support to more formats
++ */
++
++#include "libavutil/opt.h"
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/imgutils.h"
++#include "libavformat/internal.h"
++#include "avdevice.h"
++
++#include <stdatomic.h>
++#include <unistd.h>
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <bcm_host.h>
++#include <interface/mmal/mmal.h>
++#include <interface/mmal/mmal_parameters_camera.h>
++#include <interface/mmal/mmal_buffer.h>
++#include <interface/mmal/mmal_port.h>
++#include <interface/mmal/util/mmal_util.h>
++#include <interface/mmal/util/mmal_default_components.h>
++#include <interface/mmal/util/mmal_connection.h>
++#include <interface/mmal/util/mmal_util_params.h>
++#pragma GCC diagnostic pop
++#include "libavutil/rpi_sand_fns.h"
++#include "libavcodec/rpi_zc.h"
++
++#define TRACE_ALL 0
++
++#define DISPLAY_PORT_DEPTH 4
++
++typedef struct rpi_display_env_s
++{
++    AVClass *class;
++
++    MMAL_COMPONENT_T* display;    // Video renderer component
++    MMAL_COMPONENT_T* isp;        // ISP placed in front of the display when the format needs it, else NULL
++    MMAL_PORT_T * port_in;        // Input port of either isp or display depending on pipe setup
++    MMAL_CONNECTION_T * conn;     // Tunnel from isp output to display input, else NULL
++
++    MMAL_POOL_T *rpi_pool;        // Buffer headers used to submit frames
++    // In-flight buffer count; updated with atomic_fetch_add() so it has to be a real atomic type
++    atomic_int rpi_display_count;
++
++    MMAL_FOURCC_T req_fmt;        // Encoding last committed to port_in
++    MMAL_VIDEO_FORMAT_T req_vfmt; // Geometry last committed to port_in
++    AVZcEnvPtr zc;                // ZC environment allocated in rpi_vout_init()
++
++    int window_width, window_height;  // Kept adjacent: the window_size option is anchored at window_width
++    int window_x, window_y;
++    int layer, fullscreen;
++    int show_all;
++} rpi_display_env_t;
++
++
++static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { // Input-port callback (see mmal_port_enable in display_frame): just releases the buffer header
++ mmal_buffer_header_release(buffer);
++}
++
++static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) { // Control-port callback for the display and ISP components: the event is ignored and its buffer released
++ mmal_buffer_header_release(buffer);
++}
++
++
++static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt)
++{
++    // Translate an FFmpeg pixel format into the matching MMAL encoding.
++    // Formats without a mapping yield 0.
++    MMAL_FOURCC_T encoding = 0;
++
++    // SAND128 and RPI4_8 share one MMAL encoding
++    if (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_RPI4_8)
++        encoding = MMAL_ENCODING_YUVUV128;
++    else if (fmt == AV_PIX_FMT_RPI4_10)
++        encoding = MMAL_ENCODING_YUV10_COL;
++    else if (fmt == AV_PIX_FMT_SAND64_10)
++        encoding = MMAL_ENCODING_YUVUV64_10;
++    else if (fmt == AV_PIX_FMT_SAND64_16)
++        encoding = MMAL_ENCODING_YUVUV64_16;
++    else if (fmt == AV_PIX_FMT_YUV420P)
++        encoding = MMAL_ENCODING_I420;
++
++    return encoding;
++}
++
++
++static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt,
++ const AVFrame * const frame, const AVRpiZcRefPtr fr_ref)
++{ // Fill *es_fmt and its video sub-format (es_fmt->es->video) from the ZC geometry of fr_ref and the crop/aspect fields of frame
++ MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video;
++ const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref);
++ if (av_rpi_is_sand_format(geo->format)) {
++ // Sand formats are a bit "special"
++ // stride1 implicit in format
++ // width = stride2
++ vfmt->width = geo->stripe_is_yc ?
++ geo->height_y + geo->height_c : geo->height_y;
++// es->height = geo->video_height; //*** When we get the FLAG this will change
++ vfmt->height = geo->height_y;
++ es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE;
++ }
++ else { // Any other format: width is the luma stride converted from bytes to pels
++ vfmt->width = geo->stride_y / geo->bytes_per_pel;
++ vfmt->height = geo->height_y;
++ es_fmt->flags = 0;
++ }
++
++ es_fmt->type = MMAL_ES_TYPE_VIDEO;
++ es_fmt->encoding = mmfmt_from_avfmt(geo->format); // 0 if the format has no MMAL mapping
++ es_fmt->encoding_variant = 0;
++ es_fmt->bitrate = 0;
++ // Crop rectangle is taken from the frame, not from the buffer geometry
++ vfmt->crop.x = frame->crop_left;
++ vfmt->crop.y = frame->crop_top;
++ vfmt->crop.width = av_frame_cropped_width(frame);
++ vfmt->crop.height = av_frame_cropped_height(frame);
++
++ vfmt->frame_rate.den = 0; // Don't think I know it here
++ vfmt->frame_rate.num = 0;
++
++ vfmt->par.den = frame->sample_aspect_ratio.den;
++ vfmt->par.num = frame->sample_aspect_ratio.num;
++
++ vfmt->color_space = 0; // Unknown currently
++}
++
++static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata)
++{   // Pre-release hook: unref any frame carried in user_data, then drop the in-flight count
++    rpi_display_env_t * const env = userdata;
++    if (buf->user_data) {
++        av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data);
++        buf->user_data = NULL;
++    }
++    atomic_fetch_sub(&env->rpi_display_count, 1);
++    return MMAL_FALSE;
++}
++
++static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt)
++{ // Non-zero only for AV_PIX_FMT_SAND64_10; display_frame() then routes frames through the ISP
++ return avfmt == AV_PIX_FMT_SAND64_10;
++}
++
++static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de)
++{ // Tear down the optional ISP stage: disable its ports, destroy the tunnel, destroy the component. Safe when no ISP exists
++ if (de->isp != NULL)
++ {
++ if (de->isp->input[0]->is_enabled)
++ mmal_port_disable(de->isp->input[0]);
++ if (de->isp->control->is_enabled)
++ mmal_port_disable(de->isp->control);
++ }
++ if (de->conn != NULL) { // Connection goes before the component whose output port it uses
++ mmal_connection_destroy(de->conn);
++ de->conn = NULL;
++ }
++ if (de->isp != NULL) {
++ mmal_component_destroy(de->isp);
++ de->isp = NULL; // Cleared so a repeated call is a no-op
++ }
++}
++
++static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
++{ // Submit one frame to the MMAL pipeline, rebuilding the pipeline first if the frame's encoding or geometry differs from the last one committed
++ MMAL_BUFFER_HEADER_T* buf = NULL;
++ AVRpiZcRefPtr fr_buf = NULL;
++
++ if (de == NULL)
++ return;
++ // Too many buffers already in flight: drop this frame
++ if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
++ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
++ return;
++ }
++
++ if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) {
++ return;
++ }
++
++ buf = mmal_queue_get(de->rpi_pool->queue);
++ if (!buf) {
++ // Running too fast so drop the frame (unexpected)
++ goto fail;
++ }
++
++ buf->cmd = 0;
++ buf->offset = 0;
++ buf->flags = 0;
++ mmal_buffer_header_reset(buf);
++
++ atomic_fetch_add(&de->rpi_display_count, 1); // Deced on release
++ mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de);
++
++ buf->user_data = fr_buf; // From here on buf_release_cb() is what unrefs fr_buf
++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal
++ buf->offset = av_rpi_zc_offset(fr_buf);
++ buf->length = av_rpi_zc_length(fr_buf);
++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
++ // With show_all set, wait for in-flight buffers to be released rather than risk a later drop
++ while (de->show_all && atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
++ usleep(5000);
++ }
++
++ {
++ MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}};
++ MMAL_ES_FORMAT_T new_es = {.es = &new_ess};
++ MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video;
++
++ video_format_from_zc_frame(&new_es, fr, fr_buf);
++ if (de->req_fmt != new_es.encoding ||
++ de->req_vfmt.width != new_vfmt->width ||
++ de->req_vfmt.height != new_vfmt->height ||
++ de->req_vfmt.crop.x != new_vfmt->crop.x ||
++ de->req_vfmt.crop.y != new_vfmt->crop.y ||
++ de->req_vfmt.crop.width != new_vfmt->crop.width ||
++ de->req_vfmt.crop.height != new_vfmt->crop.height) {
++ // Something has changed
++
++ // If we have an ISP tear it down
++ isp_remove(s, de);
++ de->port_in = de->display->input[0];
++
++ // If we still need an ISP create it now
++ if (avfmt_needs_isp(fr->format))
++ {
++ if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS)
++ {
++ av_log(s, AV_LOG_ERROR, "ISP creation failed\n");
++ goto fail;
++ }
++ de->port_in = de->isp->input[0];
++ }
++
++ mmal_format_copy(de->port_in->format, &new_es);
++
++ if (mmal_port_format_commit(de->port_in)) {
++ av_log(s, AV_LOG_ERROR, "Failed to commit input format\n");
++ goto fail;
++ }
++
++ // If we have an ISP then we must want to use it
++ if (de->isp != NULL) {
++ MMAL_PORT_T * const port_out = de->isp->output[0];
++ MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video;
++ MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video;
++
++ port_out->format->type = MMAL_ES_TYPE_VIDEO;
++ port_out->format->encoding = MMAL_ENCODING_YUVUV128;
++ port_out->format->encoding_variant = 0;
++ port_out->format->bitrate = 0;
++ port_out->format->flags = 0;
++ port_out->format->extradata = NULL;
++ port_out->format->extradata_size = 0;
++
++ vfmt_out->width = (vfmt_in->crop.width + 31) & ~31; // Cropped width rounded up to a multiple of 32
++ vfmt_out->height = (vfmt_in->crop.height + 15) & ~15; // Cropped height rounded up to a multiple of 16
++ vfmt_out->crop.x = 0;
++ vfmt_out->crop.y = 0;
++ vfmt_out->crop.width = vfmt_in->crop.width;
++ vfmt_out->crop.height = vfmt_in->crop.height;
++ vfmt_out->frame_rate = vfmt_in->frame_rate;
++ vfmt_out->par = vfmt_in->par;
++ vfmt_out->color_space = vfmt_in->color_space;
++
++ if (mmal_port_format_commit(port_out)) {
++ av_log(s, AV_LOG_ERROR, "Failed to commit output format\n");
++ goto fail;
++ }
++
++ if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) {
++ av_log(s, AV_LOG_ERROR, "Failed to create connection\n");
++ goto fail;
++ }
++ if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) {
++ av_log(s, AV_LOG_ERROR, "Failed to enable connection\n");
++ goto fail;
++ }
++ mmal_port_enable(de->isp->control,display_cb_control); // NOTE(review): result not checked, unlike the calls around it
++ mmal_component_enable(de->isp); // NOTE(review): result not checked
++ }
++
++ // Number of slots in my port Q
++ de->port_in->buffer_num = DISPLAY_PORT_DEPTH;
++ // Size to keep it happy - isn't used for anything other than error checking
++ de->port_in->buffer_size = buf->alloc_size;
++ if (!de->port_in->is_enabled)
++ {
++ mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image?
++ if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) {
++ av_log(s, AV_LOG_ERROR, "Failed to enable input port\n");
++ goto fail;
++ }
++ }
++ // Record what was committed so frames with the same format skip this whole block
++ de->req_fmt = new_es.encoding;
++ de->req_vfmt = *new_vfmt;
++ }
++ }
++
++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
++ {
++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count);
++ goto fail;
++ }
++ return;
++
++fail:
++ // If we have a buf then fr_buf is held by that
++ if (buf != NULL)
++ mmal_buffer_header_release(buf);
++ else if (fr_buf != NULL)
++ av_rpi_zc_unref(fr_buf);
++}
++
++
++static int xv_write_trailer(AVFormatContext *s)
++{ // Shut the pipeline down and free every MMAL object; also used by xv_write_header() as its failure cleanup. Always returns 0
++ rpi_display_env_t * const de = s->priv_data;
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++ if (de->port_in != NULL && de->port_in->is_enabled) {
++ mmal_port_disable(de->port_in);
++ }
++
++ // The above disable should kick out all buffers - check that
++ if (atomic_load(&de->rpi_display_count) != 0) {
++ av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count));
++ }
++
++ isp_remove(s, de);
++ if (de->rpi_pool != NULL) { // NULL checks keep this safe after a partial xv_write_header()
++ mmal_pool_destroy(de->rpi_pool);
++ de->rpi_pool = NULL;
++ }
++ if (de->display != NULL) {
++ mmal_component_destroy(de->display);
++ de->display = NULL;
++ }
++
++ return 0;
++}
++
++static int xv_write_header(AVFormatContext *s)
++{
++    // Create and enable the MMAL video renderer, apply the requested display
++    // region and create the buffer-header pool. Returns 0 or a negative AVERROR.
++    rpi_display_env_t * const de = s->priv_data;
++    const AVCodecParameters * const par = s->streams[0]->codecpar;
++    const unsigned int w = de->window_width ? de->window_width : par->width;
++    const unsigned int h = de->window_height ? de->window_height : par->height;
++    const int x = de->window_x;   // Signed: the window_x / window_y options accept negatives
++    const int y = de->window_y;
++    const int layer = de->layer ? de->layer : 2;
++    const MMAL_BOOL_T fullscreen = de->fullscreen;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h);
++#endif
++    if ( s->nb_streams > 1
++        || par->codec_type != AVMEDIA_TYPE_VIDEO
++        || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
++        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++        return AVERROR(EINVAL);
++    }
++
++    {
++        MMAL_DISPLAYREGION_T region =
++        {
++            .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
++            .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN |
++                MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA,
++            .layer = layer,
++            .fullscreen = fullscreen,
++            .dest_rect = {x, y, w, h},
++            .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS,
++        };
++
++        bcm_host_init(); // Needs to be done by someone...
++
++        if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS) {
++            av_log(s, AV_LOG_ERROR, "Failed to create display component\n");
++            goto fail;
++        }
++        de->port_in = de->display->input[0];
++
++        // Best effort: a rejected display region is reported but is not fatal
++        if (mmal_port_parameter_set(de->display->input[0], &region.hdr) != MMAL_SUCCESS)
++            av_log(s, AV_LOG_WARNING, "Failed to set display region\n");
++
++        if (mmal_component_enable(de->display) != MMAL_SUCCESS) {
++            av_log(s, AV_LOG_ERROR, "Failed to enable display component\n");
++            goto fail;
++        }
++        if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS) {
++            av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n");
++            goto fail;
++        }
++
++        if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL) {
++            av_log(s, AV_LOG_ERROR, "Failed to create pool\n");
++            goto fail;
++        }
++    }
++
++    return 0;
++
++fail:
++    xv_write_trailer(s);   // Tears down whatever was created above
++    return AVERROR_UNKNOWN;
++}
++
++static int xv_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++    // The packet payload is a wrapped AVFrame: hand it to display_frame(); always returns 0
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++    display_frame(s, s->priv_data, (AVFrame *)pkt->data);
++    return 0;
++}
++
++static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++                          unsigned flags)
++{
++    // Uncoded-frame entry point: display *ppframe unless the caller is only
++    // probing for support with AV_WRITE_UNCODED_FRAME_QUERY. Always returns 0.
++    const int query_only = (flags & AV_WRITE_UNCODED_FRAME_QUERY) != 0;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++#endif
++
++    if (!query_only)
++        display_frame(s, s->priv_data, *ppframe);
++    return 0;
++}
++
++static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++    // Handle an application-to-device message.
++    // Only the repaint request is accepted (as a no-op); every other type
++    // is rejected with AVERROR(ENOSYS).
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
++#endif
++    if (type == AV_APP_TO_DEV_WINDOW_REPAINT)
++        return 0;
++
++    return AVERROR(ENOSYS);
++}
++
++// deinit is called if init fails so no need to clean up explicitly here
++static int rpi_vout_init(struct AVFormatContext * s)
++{
++    rpi_display_env_t * const de = s->priv_data;
++
++    // Get a ZC context in case we need one - has little overhead if unused.
++    // Failure must be a negative AVERROR: a positive return is not an error to the muxer core.
++    if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL)
++        return AVERROR(ENOMEM);
++    return 0;
++}
++
++static void rpi_vout_deinit(struct AVFormatContext * s)
++{ // Counterpart of rpi_vout_init(): frees the ZC environment held in de->zc
++ rpi_display_env_t * const de = s->priv_data;
++
++ av_rpi_zc_int_env_freep(&de->zc);
++}
++
++
++#define OFFSET(x) offsetof(rpi_display_env_t, x)
++static const AVOption options[] = { // Private options of the vout_rpi device; each entry stores into rpi_display_env_t at OFFSET(field)
++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "display_layer","set display layer", OFFSET(layer), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { NULL }
++ // window_size is anchored at window_width; window_height is declared right after it in rpi_display_env_t. display_layer 0 selects layer 2 in xv_write_header()
++};
++
++static const AVClass xv_class = { // AVClass of the vout_rpi private context; ties the options[] table above to the device
++ .class_name = "rpi vid outdev",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++AVOutputFormat ff_vout_rpi_muxer = { // Registration record for the "vout_rpi" output device
++ .name = "vout_rpi",
++ .long_name = NULL_IF_CONFIG_SMALL("Rpi (mmal) video output device"),
++ .priv_data_size = sizeof(rpi_display_env_t),
++ .audio_codec = AV_CODEC_ID_NONE,
++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, // xv_write_packet treats pkt->data as an AVFrame
++ .write_header = xv_write_header, // Creates the renderer and buffer pool
++ .write_packet = xv_write_packet,
++ .write_uncoded_frame = xv_write_frame,
++ .write_trailer = xv_write_trailer, // Destroys the renderer, ISP and pool
++ .control_message = xv_control_message,
++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++ .priv_class = &xv_class,
++ .init = rpi_vout_init, // Allocates the ZC environment
++ .deinit = rpi_vout_deinit, // Frees the ZC environment
++};
+--- a/libavfilter/Makefile
++++ b/libavfilter/Makefile
+@@ -218,6 +218,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER)
+ OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o
+ OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o
+ OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o
++OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o
+ OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o
+ OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o
+ OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o
+@@ -434,6 +435,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER)
+ OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o
+ OBJS-$(CONFIG_TRIM_FILTER) += trim.o
+ OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o
++OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o
+ OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o
+ OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \
+ opencl/unsharp.o
+--- a/libavfilter/allfilters.c
++++ b/libavfilter/allfilters.c
+@@ -204,6 +204,7 @@ extern AVFilter ff_vf_dedot;
+ extern AVFilter ff_vf_deflate;
+ extern AVFilter ff_vf_deflicker;
+ extern AVFilter ff_vf_deinterlace_qsv;
++extern AVFilter ff_vf_deinterlace_v4l2m2m;
+ extern AVFilter ff_vf_deinterlace_vaapi;
+ extern AVFilter ff_vf_dejudder;
+ extern AVFilter ff_vf_delogo;
+@@ -414,6 +415,7 @@ extern AVFilter ff_vf_transpose_opencl;
+ extern AVFilter ff_vf_transpose_vaapi;
+ extern AVFilter ff_vf_trim;
+ extern AVFilter ff_vf_unpremultiply;
++extern AVFilter ff_vf_unsand;
+ extern AVFilter ff_vf_unsharp;
+ extern AVFilter ff_vf_unsharp_opencl;
+ extern AVFilter ff_vf_untile;
+--- a/libavfilter/avfiltergraph.c
++++ b/libavfilter/avfiltergraph.c
+@@ -32,6 +32,9 @@
+ #include "libavutil/internal.h"
+ #include "libavutil/opt.h"
+ #include "libavutil/pixdesc.h"
++#if CONFIG_UNSAND_FILTER
++#include "libavutil/rpi_sand_fns.h"
++#endif
+
+ #define FF_INTERNAL_FIELDS 1
+ #include "framequeue.h"
+@@ -427,6 +430,19 @@ static int can_merge_formats(AVFilterFor
+ }
+ }
+
++#if CONFIG_UNSAND_FILTER
++static int has_sand_format(const AVFilterFormats * const ff)
++{ // Returns 1 if any entry of ff->formats satisfies av_rpi_is_sand_format(), otherwise 0
++ int i;
++ for (i = 0; i != ff->nb_formats; ++i) {
++ if (av_rpi_is_sand_format(ff->formats[i])) {
++ return 1;
++ }
++ }
++ return 0;
++}
++#endif
++
+ /**
+ * Perform one round of query_formats() and merging formats lists on the
+ * filter graph.
+@@ -467,6 +483,7 @@ static int query_formats(AVFilterGraph *
+ for (j = 0; j < filter->nb_inputs; j++) {
+ AVFilterLink *link = filter->inputs[j];
+ int convert_needed = 0;
++ unsigned int extra_convert_tried = 0;
+
+ if (!link)
+ continue;
+@@ -514,11 +531,14 @@ static int query_formats(AVFilterGraph *
+ )
+ #undef MERGE_DISPATCH
+
+- if (convert_needed) {
++ while (convert_needed) {
+ AVFilterContext *convert;
+ const AVFilter *filter;
+ AVFilterLink *inlink, *outlink;
+ char inst_name[30];
++ int can_retry = 0;
++
++ convert_needed = 0;
+
+ if (graph->disable_auto_convert) {
+ av_log(log_ctx, AV_LOG_ERROR,
+@@ -531,19 +551,45 @@ static int query_formats(AVFilterGraph *
+ /* couldn't merge format lists. auto-insert conversion filter */
+ switch (link->type) {
+ case AVMEDIA_TYPE_VIDEO:
+- if (!(filter = avfilter_get_by_name("scale"))) {
+- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
+- "not present, cannot convert pixel formats.\n");
+- return AVERROR(EINVAL);
+- }
+-
+- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
+- scaler_count++);
++#if CONFIG_UNSAND_FILTER
++ // Only try each extra conversion once
++ // The unsand output pad should never trigger has_sand_format
++ // but it is better to be safe
++ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) {
++ if (!(filter = avfilter_get_by_name("unsand"))) {
++ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter "
++ "not present, cannot convert pixel formats.\n");
++ return AVERROR(EINVAL);
++ }
++
++ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d",
++ scaler_count++);
++
++ if ((ret = avfilter_graph_create_filter(&convert, filter,
++ inst_name, "", NULL,
++ graph)) < 0)
++ return ret;
+
+- if ((ret = avfilter_graph_create_filter(&convert, filter,
+- inst_name, graph->scale_sws_opts, NULL,
+- graph)) < 0)
+- return ret;
++ extra_convert_tried |= 1;
++ can_retry = 1;
++ }
++ else
++#endif
++ {
++ if (!(filter = avfilter_get_by_name("scale"))) {
++ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
++ "not present, cannot convert pixel formats.\n");
++ return AVERROR(EINVAL);
++ }
++
++ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
++ scaler_count++);
++
++ if ((ret = avfilter_graph_create_filter(&convert, filter,
++ inst_name, graph->scale_sws_opts, NULL,
++ graph)) < 0)
++ return ret;
++ }
+ break;
+ case AVMEDIA_TYPE_AUDIO:
+ if (!(filter = avfilter_get_by_name("aresample"))) {
+@@ -585,9 +631,19 @@ static int query_formats(AVFilterGraph *
+ av_assert0(outlink-> in_channel_layouts->refcount > 0);
+ av_assert0(outlink->out_channel_layouts->refcount > 0);
+ }
+- if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type) ||
+- !ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
++ // If we have added an extra filter we must merge the input
++ // side but we can have another go at the output
++ if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type))
+ ret = AVERROR(ENOSYS);
++ else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
++ {
++ if (can_retry) {
++ link = outlink;
++ convert_needed = 1;
++ continue;
++ }
++ ret = AVERROR(ENOSYS);
++ }
+ if (inlink->type == AVMEDIA_TYPE_AUDIO &&
+ (!ff_merge_samplerates(inlink->in_samplerates,
+ inlink->out_samplerates) ||
+--- a/libavfilter/buffersrc.c
++++ b/libavfilter/buffersrc.c
+@@ -210,7 +210,7 @@ static int av_buffersrc_add_frame_intern
+
+ switch (ctx->outputs[0]->type) {
+ case AVMEDIA_TYPE_VIDEO:
+- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
++ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
+ frame->format, frame->pts);
+ break;
+ case AVMEDIA_TYPE_AUDIO:
+--- /dev/null
++++ b/libavfilter/vf_deinterlace_v4l2m2m.c
+@@ -0,0 +1,1336 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * deinterlace video filter - V4L2 M2M
++ */
++
++#include <drm_fourcc.h>
++
++#include <linux/videodev2.h>
++
++#include <dirent.h>
++#include <fcntl.h>
++#include <poll.h>
++#include <stdatomic.h>
++#include <stdio.h>
++#include <string.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++#include <unistd.h>
++
++#include "libavutil/avassert.h"
++#include "libavutil/avstring.h"
++#include "libavutil/common.h"
++#include "libavutil/hwcontext.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavutil/internal.h"
++#include "libavutil/mathematics.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/time.h"
++
++#define FF_INTERNAL_FIELDS 1
++#include "framequeue.h"
++#include "filters.h"
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "video.h"
++
++typedef struct V4L2Queue V4L2Queue;
++typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
++
++typedef struct V4L2PlaneInfo { // Per-plane record kept in V4L2Buffer.plane_info[]
++ int bytesperline; // presumably the plane's line stride in bytes - filled in outside this view, confirm
++ size_t length; // presumably the plane's size in bytes - confirm
++} V4L2PlaneInfo;
++
++typedef struct V4L2Buffer { // One entry of a V4L2Queue's buffers array; field usage lies outside this view, so the notes below are to be confirmed
++ int enqueued; // presumably non-zero while the buffer is queued to the driver - confirm
++ int reenqueue;
++ int fd;
++ struct v4l2_buffer buffer;
++ AVFrame frame;
++ struct v4l2_plane planes[VIDEO_MAX_PLANES];
++ int num_planes; // presumably the number of valid entries in planes[] / plane_info[] - confirm
++ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES];
++ AVDRMFrameDescriptor drm_frame;
++ V4L2Queue *q; // presumably the queue that owns this buffer - confirm
++} V4L2Buffer;
++
++typedef struct V4L2Queue { // One V4L2 queue; DeintV4L2M2MContextShared holds two of these, named output and capture
++ struct v4l2_format format;
++ int num_buffers; // presumably the element count of buffers - confirm
++ V4L2Buffer *buffers;
++ DeintV4L2M2MContextShared *ctx; // Shared context this queue is attached to
++} V4L2Queue;
++
++typedef struct pts_stats_s
++{ // Running estimate of the pts interval between frames; maintained by pts_stats_add()
++ void * logctx;
++ const char * name; // For debug
++ unsigned int last_count; // Frames seen since last_pts last changed, capped at STATS_LAST_COUNT_MAX
++ unsigned int last_interval; // Latest per-frame interval estimate; 0 means unknown
++ int64_t last_pts; // Last distinct pts seen, AV_NOPTS_VALUE before the first
++} pts_stats_t;
++
++#define PTS_TRACK_SIZE 32
++typedef struct pts_track_el_s
++{ // One slot of the pts_track_t ring; the slot index is n & (PTS_TRACK_SIZE - 1)
++ uint32_t n; // Sequence number of the frame stored in this slot
++ unsigned int interval; // Interval estimate taken from the stats when the frame was added
++ AVFrame * props; // Source of the properties copied to output frames by pts_track_get_frame()
++} pts_track_el_t;
++
++typedef struct pts_track_s
++{ // Maps sequence numbers back to the properties of the input frames they were issued for
++ uint32_t n; // Most recent sequence number issued by pts_track_next_n()
++ uint32_t last_n; // Sequence number last returned by pts_track_get_frame(); 0 = none
++ int got_2; // Set once a second (interpolated) output has been produced for last_n
++ void * logctx;
++ pts_stats_t stats;
++ pts_track_el_t a[PTS_TRACK_SIZE];
++} pts_track_t;
++
++typedef struct DeintV4L2M2MContextShared { // State reached through DeintV4L2M2MContext.shared and through each V4L2Queue's ctx pointer
++ void * logctx; // For logging - will be NULL when done
++
++ int fd;
++ int done;
++ int width;
++ int height;
++ int orig_width;
++ int orig_height;
++ atomic_uint refcount; // NOTE(review): presumably counts holders of this struct - the code using it is outside this view
++
++ AVBufferRef *hw_frames_ctx;
++
++ unsigned int field_order;
++
++ pts_track_t track; // Timestamp tracking, see pts_track_add_frame() / pts_track_get_frame()
++
++ V4L2Queue output;
++ V4L2Queue capture;
++} DeintV4L2M2MContextShared;
++
++typedef struct DeintV4L2M2MContext { // Context struct of the filter: an AVClass pointer plus a pointer to the shared state
++ const AVClass *class;
++
++ DeintV4L2M2MContextShared *shared;
++} DeintV4L2M2MContext;
++
++static unsigned int pts_stats_interval(const pts_stats_t * const stats)
++{ // Current per-frame interval estimate; 0 when pts_stats_add() has no valid estimate
++ return stats->last_interval;
++}
++
++// Pick 64 for max last count - that is >1sec at 60fps
++#define STATS_LAST_COUNT_MAX 64
++#define STATS_INTERVAL_MAX (1 << 30)
++static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
++{ // Feed the pts of one more frame into the interval estimate
++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { // No new timestamp: only count the frame (saturating)
++ if (stats->last_count < STATS_LAST_COUNT_MAX)
++ ++stats->last_count;
++ return;
++ }
++
++ if (stats->last_pts != AV_NOPTS_VALUE) {
++ const int64_t interval = pts - stats->last_pts;
++ // Backwards, huge, or spread over too many frames: treat the interval as unknown
++ if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
++ stats->last_count >= STATS_LAST_COUNT_MAX) {
++ if (stats->last_interval != 0)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
++ __func__, stats->name, interval, stats->last_count);
++ stats->last_interval = 0;
++ }
++ else {
++ const int64_t frame_time = interval / (int64_t)stats->last_count; // Spread the gap over the frames counted since last_pts
++
++ if (frame_time != stats->last_interval)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
++ stats->last_interval = frame_time;
++ }
++ }
++
++ stats->last_pts = pts;
++ stats->last_count = 1;
++}
++
++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
++{
++    // Start tracking afresh: no pts seen yet, no interval known, and a frame
++    // count of 1 so the first measured interval is divided by a non-zero count
++    stats->logctx = logctx;
++    stats->name = name;
++    stats->last_count = 1;
++    stats->last_interval = 0;
++    stats->last_pts = AV_NOPTS_VALUE;
++}
++
++// Advance the tracker's sequence number and return it.
++// 0 is skipped on wrap-around because pts_track_get_frame treats a
++// sequence number of 0 specially.
++static inline uint32_t pts_track_next_n(pts_track_t * const trk)
++{
++    trk->n += 1;
++    if (trk->n == 0)
++        trk->n = 1;
++
++    return trk->n;
++}
++
++// Fill dst with the properties of the source frame whose sequence number is
++// encoded in the V4L2 buffer timestamp tv (see pts_track_add_frame).
++//
++// The first buffer seen for a sequence number gets the source properties
++// unchanged.  A second buffer with the same number is taken to be the
++// interpolated frame and has its timestamps advanced by half the estimated
++// interval (or cleared when no interval is known).  Any further buffer with
++// that number, or a number that no longer matches its slot, gets no
++// timestamps at all.
++//
++// Returns 0, or a negative AVERROR if copying the frame properties failed.
++static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst)
++{
++    uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000);
++    pts_track_el_t * t;
++    int ret;
++
++    // As a first guess assume that n==0 means last frame
++    if (n == 0) {
++        n = trk->last_n;
++        if (n == 0)
++            goto fail;
++    }
++
++    t = trk->a + (n & (PTS_TRACK_SIZE - 1));
++
++    if (t->n != n) {
++        // Report the sequence number the slot actually holds: that is the
++        // value the check above compared against, and it shows whether the
++        // slot was overwritten or never filled.
++        av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, t->n);
++        goto fail;
++    }
++
++    // 1st frame is simple - just believe it
++    if (n != trk->last_n) {
++        trk->last_n = n;
++        trk->got_2 = 0;
++        return av_frame_copy_props(dst, t->props);
++    }
++
++    // Only believe in a single interpolated frame
++    if (trk->got_2)
++        goto fail;
++    trk->got_2 = 1;
++
++    // Don't adjust timestamps on a frame whose properties failed to copy
++    ret = av_frame_copy_props(dst, t->props);
++    if (ret < 0)
++        return ret;
++
++    // If we can't guess - don't
++    if (t->interval == 0) {
++        dst->best_effort_timestamp = AV_NOPTS_VALUE;
++        dst->pts = AV_NOPTS_VALUE;
++        dst->pkt_dts = AV_NOPTS_VALUE;
++    }
++    else {
++        if (dst->best_effort_timestamp != AV_NOPTS_VALUE)
++            dst->best_effort_timestamp += t->interval / 2;
++        if (dst->pts != AV_NOPTS_VALUE)
++            dst->pts += t->interval / 2;
++        if (dst->pkt_dts != AV_NOPTS_VALUE)
++            dst->pkt_dts += t->interval / 2;
++    }
++
++    return 0;
++
++fail:
++    // Forget the current sequence so the next buffer starts afresh
++    trk->last_n = 0;
++    trk->got_2 = 0;
++    dst->pts = AV_NOPTS_VALUE;
++    dst->pkt_dts = AV_NOPTS_VALUE;
++    return 0;
++}
++
++// Record the properties of src under a fresh sequence number and return
++// that number encoded as a timeval, for use as the V4L2 buffer timestamp.
++static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
++{
++    const uint32_t seq = pts_track_next_n(trk);
++    pts_track_el_t * const slot = trk->a + (seq & (PTS_TRACK_SIZE - 1));
++
++    pts_stats_add(&trk->stats, src->pts);
++
++    slot->n = seq;
++    // Guess that the next interval will be the same as the last one
++    slot->interval = pts_stats_interval(&trk->stats);
++    av_frame_unref(slot->props);
++    av_frame_copy_props(slot->props, src);
++
++    // The interval just measured is the real interval of the previous
++    // frame, so replace the guess stored there.  There is a better than
++    // decent chance that this happens before that slot is read.
++    if (slot->interval != 0) {
++        pts_track_el_t * const prev_slot = trk->a + ((seq - 1) & (PTS_TRACK_SIZE - 1));
++        prev_slot->interval = slot->interval;
++    }
++
++    // In case deinterlace interpolates frames use every other usec
++    {
++        const struct timeval stamp = {
++            .tv_sec = seq / 500000,
++            .tv_usec = (seq % 500000) * 2
++        };
++        return stamp;
++    }
++}
++
++// Invalidate every tracking slot and free the frame that holds its properties.
++static void pts_track_uninit(pts_track_t * const trk)
++{
++    pts_track_el_t * el = trk->a;
++    pts_track_el_t * const end = trk->a + PTS_TRACK_SIZE;
++
++    while (el != end) {
++        el->n = 0;
++        av_frame_free(&el->props);
++        ++el;
++    }
++}
++
++// Prepare a tracker for use: reset the sequence state, set up the interval
++// statistics and allocate one property-holding frame per slot.
++//
++// logctx is kept for the tracker's own error messages as well as being
++// passed to the statistics.
++//
++// Returns 0 on success or AVERROR(ENOMEM); on failure nothing is left
++// allocated.
++static int pts_track_init(pts_track_t * const trk, void *logctx)
++{
++    unsigned int i;
++
++    trk->n = 1;
++    trk->last_n = 0;
++    trk->got_2 = 0;
++    // pts_track_get_frame logs through trk->logctx, so it must be stored
++    trk->logctx = logctx;
++    pts_stats_init(&trk->stats, logctx, "track");
++
++    // Clear every slot first so the error path below never frees an
++    // uninitialised pointer, whatever state the caller's memory was in
++    for (i = 0; i != PTS_TRACK_SIZE; ++i) {
++        trk->a[i].n = 0;
++        trk->a[i].props = NULL;
++    }
++
++    for (i = 0; i != PTS_TRACK_SIZE; ++i) {
++        if ((trk->a[i].props = av_frame_alloc()) == NULL) {
++            pts_track_uninit(trk);
++            return AVERROR(ENOMEM);
++        }
++    }
++    return 0;
++}
++
++// Query the capabilities of the device open on ctx->fd and select the
++// buffer types (single- or multi-planar) for both queues.
++//
++// Returns 0 on success, AVERROR(errno) if the query fails, or
++// AVERROR(EINVAL) if the device lacks streaming or M2M capability.
++static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
++{
++    struct v4l2_capability cap;
++
++    memset(&cap, 0, sizeof(cap));
++    // ioctl only returns -1; the real reason is in errno
++    if (ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap) < 0)
++        return AVERROR(errno);
++
++    if (!(cap.capabilities & V4L2_CAP_STREAMING))
++        return AVERROR(EINVAL);
++
++    if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
++        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
++        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
++
++        return 0;
++    }
++
++    if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
++        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
++        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
++
++        return 0;
++    }
++
++    return AVERROR(EINVAL);
++}
++
++// Check whether the device accepts YUV420 at the context's width and height
++// on this queue: interlaced (top field first) on the output side,
++// progressive on the capture side.
++//
++// Returns 0 if the driver answers with YUV420 or NV12 and the requested
++// field type, AVERROR(EINVAL) otherwise.
++static int deint_v4l2m2m_try_format(V4L2Queue *queue)
++{
++    DeintV4L2M2MContextShared * const ctx = queue->ctx;
++    struct v4l2_format * const fmt = &queue->format;
++    uint32_t got_pixfmt;
++    uint32_t got_field;
++    int field;
++    int ret;
++
++    // A failure here is only reported; the TRY_FMT below decides the result
++    ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt);
++    if (ret)
++        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
++
++    field = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_NONE;
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420;
++        fmt->fmt.pix_mp.field = field;
++        fmt->fmt.pix_mp.width = ctx->width;
++        fmt->fmt.pix_mp.height = ctx->height;
++    } else {
++        fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420;
++        fmt->fmt.pix.field = field;
++        fmt->fmt.pix.width = ctx->width;
++        fmt->fmt.pix.height = ctx->height;
++    }
++
++    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
++           fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
++           fmt->fmt.pix_mp.pixelformat,
++           fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
++
++    if (ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt) != 0)
++        return AVERROR(EINVAL);
++
++    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
++           fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
++           fmt->fmt.pix_mp.pixelformat,
++           fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
++
++    // Read back what the driver settled on from the matching union member
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        got_pixfmt = fmt->fmt.pix_mp.pixelformat;
++        got_field = fmt->fmt.pix_mp.field;
++    } else {
++        got_pixfmt = fmt->fmt.pix.pixelformat;
++        got_field = fmt->fmt.pix.field;
++    }
++
++    if ((got_pixfmt != V4L2_PIX_FMT_YUV420 && got_pixfmt != V4L2_PIX_FMT_NV12) ||
++        got_field != field) {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
++
++        return AVERROR(EINVAL);
++    }
++
++    return 0;
++}
++
++// Set the format of one queue and then its crop (output queue) or compose
++// (capture queue) rectangle to width x height at the origin.
++//
++// pitch is the line stride in bytes and ysize the size in bytes of the luma
++// plane; both are used only for multi-planar buffer types, where the coded
++// height is derived as ysize / pitch.
++//
++// Returns 0 on success, AVERROR(EINVAL) for an unusable pitch or if the
++// driver substitutes another pixel format, or AVERROR(errno) if
++// VIDIOC_S_FMT fails.  Selection failures are logged as warnings only.
++static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize)
++{
++    struct v4l2_format *fmt = &queue->format;
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    int ret;
++
++    struct v4l2_selection sel = {
++        .type = fmt->type,
++        .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
++    };
++
++    // This works for most single object 4:2:0 types
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        // pitch is the divisor below: reject values that cannot be a stride
++        if (pitch <= 0) {
++            av_log(ctx->logctx, AV_LOG_ERROR, "%s: Invalid pitch %d\n", __func__, pitch);
++            return AVERROR(EINVAL);
++        }
++        fmt->fmt.pix_mp.pixelformat = pixelformat;
++        fmt->fmt.pix_mp.field = field;
++        fmt->fmt.pix_mp.width = width;
++        fmt->fmt.pix_mp.height = ysize / pitch;
++        fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
++        // Luma plus half as much again for the two chroma planes
++        fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
++    } else {
++        fmt->fmt.pix.pixelformat = pixelformat;
++        fmt->fmt.pix.field = field;
++        fmt->fmt.pix.width = width;
++        fmt->fmt.pix.height = height;
++        fmt->fmt.pix.sizeimage = 0;
++        fmt->fmt.pix.bytesperline = 0;
++    }
++
++    ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
++    if (ret) {
++        ret = AVERROR(errno);
++        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
++        return ret;
++    }
++
++    if (pixelformat != fmt->fmt.pix.pixelformat) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat));
++        return AVERROR(EINVAL);
++    }
++
++    ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
++    if (ret) {
++        ret = AVERROR(errno);
++        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret);
++    }
++
++    sel.r.width = width;
++    sel.r.height = height;
++    sel.r.left = 0;
++    sel.r.top = 0;
++    sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE;
++    sel.flags = V4L2_SEL_FLAG_LE;
++
++    ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
++    if (ret) {
++        ret = AVERROR(errno);
++        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret);
++    }
++
++    return 0;
++}
++
++// Open the device node and check that it is usable: it must report the
++// required capabilities and accept the trial format on both queues.
++//
++// On success the descriptor is left open in ctx->fd and 0 is returned.
++// On failure the descriptor is closed, ctx->fd is set to -1 and the error
++// of the failing step is returned.
++static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
++{
++    int ret;
++
++    ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0);
++    if (ctx->fd < 0)
++        return AVERROR(errno);
++
++    // Each step runs only if the previous one succeeded
++    ret = deint_v4l2m2m_prepare_context(ctx);
++    if (!ret)
++        ret = deint_v4l2m2m_try_format(&ctx->capture);
++    if (!ret)
++        ret = deint_v4l2m2m_try_format(&ctx->output);
++
++    if (ret) {
++        close(ctx->fd);
++        ctx->fd = -1;
++    }
++
++    return ret;
++}
++
++// Probe every /dev entry whose name starts with "video" and keep the first
++// one that passes deint_v4l2m2m_probe_device.
++//
++// Returns 0 with the device open in ctx->fd, AVERROR(errno) if /dev cannot
++// be opened, or the error of the last probe (AVERROR(EINVAL) if nothing was
++// probed) with ctx->fd set to -1.
++static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx)
++{
++    char node[PATH_MAX];
++    struct dirent *entry;
++    int ret = AVERROR(EINVAL);
++    DIR * const dirp = opendir("/dev");
++
++    if (!dirp)
++        return AVERROR(errno);
++
++    while ((entry = readdir(dirp)) != NULL) {
++        if (strncmp(entry->d_name, "video", 5) != 0)
++            continue;
++
++        snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
++        av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node);
++        ret = deint_v4l2m2m_probe_device(ctx, node);
++        if (ret == 0)
++            break;
++    }
++
++    closedir(dirp);
++
++    if (ret != 0) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n");
++        ctx->fd = -1;
++
++        return ret;
++    }
++
++    av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node);
++
++    return 0;
++}
++
++// Hand a buffer to the driver and mark it as enqueued.
++// Returns 0 on success or AVERROR(errno) if VIDIOC_QBUF fails.
++static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
++{
++    if (ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer) < 0)
++        return AVERROR(errno);
++
++    buf->enqueued = 1;
++    return 0;
++}
++
++// Export each plane of a V4L2 buffer as a dmabuf and record the resulting
++// descriptors in the buffer's DRM frame descriptor.
++//
++// Returns 0 on success, AVERROR(EINVAL) for a pixel format other than NV12
++// or YUV420, or AVERROR(errno) if VIDIOC_EXPBUF fails.
++//
++// NOTE(review): avbuf->fd is overwritten on every iteration, so with more
++// than one plane only the last exported descriptor is remembered there -
++// confirm that the others are closed somewhere.
++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat)
++{
++    const uint64_t mod = DRM_FORMAT_MOD_LINEAR;
++    uint32_t drm_fourcc;
++    int plane;
++
++    if (pixelformat == V4L2_PIX_FMT_NV12)
++        drm_fourcc = DRM_FORMAT_NV12;
++    else if (pixelformat == V4L2_PIX_FMT_YUV420)
++        drm_fourcc = DRM_FORMAT_YUV420;
++    else
++        return AVERROR(EINVAL);
++
++    avbuf->drm_frame.layers[0].format = drm_fourcc;
++
++    for (plane = 0; plane < avbuf->num_planes; plane++) {
++        struct v4l2_exportbuffer expbuf;
++
++        memset(&expbuf, 0, sizeof(expbuf));
++        expbuf.index = avbuf->buffer.index;
++        expbuf.type = avbuf->buffer.type;
++        expbuf.plane = plane;
++
++        if (ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf) < 0)
++            return AVERROR(errno);
++
++        avbuf->fd = expbuf.fd;
++
++        /* drm frame */
++        if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) {
++            // One DRM object per V4L2 plane
++            avbuf->drm_frame.objects[plane].size = avbuf->buffer.m.planes[plane].length;
++            avbuf->drm_frame.objects[plane].fd = expbuf.fd;
++            avbuf->drm_frame.objects[plane].format_modifier = mod;
++        } else {
++            // Single-planar buffers always describe object 0
++            avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
++            avbuf->drm_frame.objects[0].fd = expbuf.fd;
++            avbuf->drm_frame.objects[0].format_modifier = mod;
++        }
++    }
++
++    return 0;
++}
++
++// Request queue->num_buffers buffers from the driver and set up the matching
++// V4L2Buffer array.  The output queue uses DMABUF memory, the capture queue
++// MMAP memory.  Capture buffers are queued to the driver straight away and
++// exported as dmabufs.
++//
++// queue->num_buffers is updated to the count the driver actually granted.
++//
++// Returns 0 on success, AVERROR(ENOMEM) if the array cannot be allocated,
++// or the AVERROR of the failing ioctl.  On a failure after allocation the
++// array is freed, queue->buffers is reset to NULL and exported descriptors
++// recorded in the buffers are closed.
++static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
++{
++    struct v4l2_format *fmt = &queue->format;
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    struct v4l2_requestbuffers req;
++    int ret, i, j, multiplanar;
++    uint32_t memory;
++
++    memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
++        V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
++
++    multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
++
++    memset(&req, 0, sizeof(req));
++    req.count = queue->num_buffers;
++    req.memory = memory;
++    req.type = fmt->type;
++
++    ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req);
++    if (ret < 0) {
++        // Latch errno first: the logging call below may overwrite it
++        const int err = errno;
++
++        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(err));
++
++        return AVERROR(err);
++    }
++
++    queue->num_buffers = req.count;
++    // av_calloc zero-fills and checks the size multiplication for overflow
++    queue->buffers = av_calloc(queue->num_buffers, sizeof(V4L2Buffer));
++    if (!queue->buffers) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n");
++
++        return AVERROR(ENOMEM);
++    }
++
++    for (i = 0; i < queue->num_buffers; i++) {
++        V4L2Buffer *buf = &queue->buffers[i];
++
++        buf->enqueued = 0;
++        buf->fd = -1;
++        buf->q = queue;
++
++        buf->buffer.type = fmt->type;
++        buf->buffer.memory = memory;
++        buf->buffer.index = i;
++
++        if (multiplanar) {
++            buf->buffer.length = VIDEO_MAX_PLANES;
++            buf->buffer.m.planes = buf->planes;
++        }
++
++        ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
++        if (ret < 0) {
++            ret = AVERROR(errno);
++
++            goto fail;
++        }
++
++        // For multi-planar types the plane count is taken from the length
++        // field as it stands after VIDIOC_QUERYBUF
++        if (multiplanar)
++            buf->num_planes = buf->buffer.length;
++        else
++            buf->num_planes = 1;
++
++        for (j = 0; j < buf->num_planes; j++) {
++            V4L2PlaneInfo *info = &buf->plane_info[j];
++
++            if (multiplanar) {
++                info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline;
++                info->length = buf->buffer.m.planes[j].length;
++            } else {
++                info->bytesperline = fmt->fmt.pix.bytesperline;
++                info->length = buf->buffer.length;
++            }
++        }
++
++        // Capture buffers go to the driver immediately so it has somewhere
++        // to write its results
++        if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
++            ret = deint_v4l2m2m_enqueue_buffer(buf);
++            if (ret)
++                goto fail;
++
++            ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat);
++            if (ret)
++                goto fail;
++        }
++    }
++
++    return 0;
++
++fail:
++    // Buffers not reached yet are zero-filled; fd 0 is therefore closed for
++    // them exactly as before this function was tidied - see NOTE below
++    for (i = 0; i < queue->num_buffers; i++)
++        if (queue->buffers[i].fd >= 0)
++            close(queue->buffers[i].fd);
++    // NOTE(review): entries after the failing index still have fd == 0 from
++    // the zero-fill, so this loop closes descriptor 0 for them - confirm
++    // whether that is intended.
++    av_free(queue->buffers);
++    queue->buffers = NULL;
++
++    return ret;
++}
++
++// Start streaming on a queue.
++// Returns 0 on success or AVERROR(errno) if VIDIOC_STREAMON fails.
++static int deint_v4l2m2m_streamon(V4L2Queue *queue)
++{
++    DeintV4L2M2MContextShared * const ctx = queue->ctx;
++    int type = queue->format.type;
++    const int ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type);
++    // Latch the error once, straight after the ioctl: errno is only
++    // meaningful on failure and may be changed by the logging call
++    const int err = ret < 0 ? AVERROR(errno) : 0;
++
++    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, err);
++
++    return err;
++}
++
++// Stop streaming on a queue.
++// Returns 0 on success or AVERROR(errno) if VIDIOC_STREAMOFF fails.
++static int deint_v4l2m2m_streamoff(V4L2Queue *queue)
++{
++    DeintV4L2M2MContextShared * const ctx = queue->ctx;
++    int type = queue->format.type;
++    const int ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type);
++    // Latch the error once, straight after the ioctl: errno is only
++    // meaningful on failure and may be changed by the logging call
++    const int err = ret < 0 ? AVERROR(errno) : 0;
++
++    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, err);
++
++    return err;
++}
++
++// Wait up to timeout ms (0 = do not wait) for the driver to finish with a
++// buffer on this queue and dequeue it.
++//
++// Returns the matching entry of queue->buffers with its v4l2_buffer (and
++// plane array, for multi-planar types) refreshed from the driver, or NULL
++// on timeout, poll error or VIDIOC_DQBUF failure.
++static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout)
++{
++    struct v4l2_plane planes[VIDEO_MAX_PLANES];
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    struct v4l2_buffer buf = { 0 };
++    V4L2Buffer* avbuf = NULL;
++    struct pollfd pfd;
++    short events;
++    int ret;
++
++    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
++        events = POLLOUT | POLLWRNORM;
++    else
++        events = POLLIN | POLLRDNORM;
++
++    pfd.events = events;
++    pfd.fd = ctx->fd;
++    pfd.revents = 0;
++
++    for (;;) {
++        ret = poll(&pfd, 1, timeout);
++        if (ret > 0)
++            break;
++        // errno is only set when poll fails (ret < 0).  Testing it after a
++        // plain timeout (ret == 0) could see a stale EINTR from an earlier
++        // call and loop forever.
++        if (ret < 0 && errno == EINTR)
++            continue;
++        return NULL;
++    }
++
++    if (pfd.revents & POLLERR)
++        return NULL;
++
++    if (pfd.revents & events) {
++        memset(&buf, 0, sizeof(buf));
++        buf.memory = V4L2_MEMORY_MMAP;
++        buf.type = queue->format.type;
++        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
++            memset(planes, 0, sizeof(planes));
++            buf.length = VIDEO_MAX_PLANES;
++            buf.m.planes = planes;
++        }
++
++        ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf);
++        if (ret) {
++            if (errno != EAGAIN)
++                av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n",
++                       av_err2str(AVERROR(errno)));
++            return NULL;
++        }
++
++        avbuf = &queue->buffers[buf.index];
++        avbuf->enqueued = 0;
++        avbuf->buffer = buf;
++        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
++            // The stored v4l2_buffer must point at the buffer's own plane
++            // array, not at the local one that goes out of scope on return
++            memcpy(avbuf->planes, planes, sizeof(planes));
++            avbuf->buffer.m.planes = avbuf->planes;
++        }
++        return avbuf;
++    }
++
++    return NULL;
++}
++
++// Return the first buffer of the queue that is not currently enqueued with
++// the driver, or NULL if every buffer is in use.
++static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue)
++{
++    int idx;
++
++    for (idx = 0; idx < queue->num_buffers; idx++) {
++        V4L2Buffer * const candidate = &queue->buffers[idx];
++
++        if (!candidate->enqueued)
++            return candidate;
++    }
++
++    return NULL;
++}
++
++// Drop the frame reference held by every buffer that is still marked as
++// enqueued.  Does nothing for a NULL queue or one without buffers.
++static void deint_v4l2m2m_unref_queued(V4L2Queue *queue)
++{
++    int idx;
++
++    if (queue == NULL || queue->buffers == NULL)
++        return;
++
++    for (idx = 0; idx < queue->num_buffers; idx++) {
++        V4L2Buffer * const entry = &queue->buffers[idx];
++
++        if (!entry->enqueued)
++            continue;
++        av_frame_unref(&entry->frame);
++    }
++}
++
++// Without waiting, dequeue every buffer the driver has finished with and
++// release the frame each one was holding.
++static void recycle_q(V4L2Queue * const queue)
++{
++    for (;;) {
++        V4L2Buffer * const finished = deint_v4l2m2m_dequeue_buffer(queue, 0);
++
++        if (!finished)
++            break;
++        av_frame_unref(&finished->frame);
++    }
++}
++
++// Count the buffers of the queue currently enqueued with the driver.
++// A queue whose buffers have not been allocated counts as 0.
++static int count_enqueued(V4L2Queue *queue)
++{
++    int total = 0;
++    int idx;
++
++    if (!queue->buffers)
++        return 0;
++
++    for (idx = 0; idx < queue->num_buffers; idx++)
++        total += queue->buffers[idx].enqueued ? 1 : 0;
++
++    return total;
++}
++
++// Queue a DRM-prime frame to the driver.
++//
++// The dmabuf descriptors of the frame are attached to a free buffer, the
++// field order is taken from the frame's interlace flags, and the buffer is
++// stamped with a sequence number from the pts tracker.  The frame's
++// references are moved into the buffer and stay there until the buffer is
++// dequeued again.
++//
++// Returns 0 on success, AVERROR(EAGAIN) if no buffer is free, or the error
++// from queueing.  In every case the caller is left with an emptied frame or
++// its original frame and should free it as usual.
++static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame)
++{
++    DeintV4L2M2MContextShared *const ctx = queue->ctx;
++    AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0];
++    V4L2Buffer *buf;
++    int i;
++    int ret;
++
++    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
++        recycle_q(queue);
++
++    buf = deint_v4l2m2m_find_free_buf(queue);
++    if (!buf) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0);
++        return AVERROR(EAGAIN);
++    }
++    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type))
++        for (i = 0; i < drm_desc->nb_objects; i++)
++            buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd;
++    else
++        buf->buffer.m.fd = drm_desc->objects[0].fd;
++
++    buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE :
++        frame->top_field_first ? V4L2_FIELD_INTERLACED_TB :
++        V4L2_FIELD_INTERLACED_BT;
++
++    if (ctx->field_order != buf->buffer.field) {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field);
++        ctx->field_order = buf->buffer.field;
++    }
++
++    buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame);
++
++    buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd;
++
++    av_frame_move_ref(&buf->frame, frame);
++
++    ret = deint_v4l2m2m_enqueue_buffer(buf);
++    // If the driver refused the buffer it stays "free" and will be picked
++    // again; release the references now so the next av_frame_move_ref into
++    // it does not overwrite (and leak) them
++    if (ret)
++        av_frame_unref(&buf->frame);
++
++    return ret;
++}
++
++// Drop one reference to the shared context.  When the last reference goes,
++// stop both queues, close the exported capture descriptors, release frames
++// still queued on the output side, free the buffer arrays, close the device
++// and free the context itself.
++static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
++{
++    V4L2Queue *capture;
++    V4L2Queue *output;
++    int idx;
++
++    // Somebody else still holds a reference
++    if (atomic_fetch_sub(&ctx->refcount, 1) != 1)
++        return;
++
++    capture = &ctx->capture;
++    output = &ctx->output;
++
++    av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
++
++    if (ctx->fd >= 0) {
++        deint_v4l2m2m_streamoff(capture);
++        deint_v4l2m2m_streamoff(output);
++    }
++
++    if (capture->buffers) {
++        for (idx = 0; idx < capture->num_buffers; idx++) {
++            V4L2Buffer * const cbuf = &capture->buffers[idx];
++
++            cbuf->q = NULL;
++            if (cbuf->fd >= 0)
++                close(cbuf->fd);
++        }
++    }
++
++    deint_v4l2m2m_unref_queued(output);
++
++    av_buffer_unref(&ctx->hw_frames_ctx);
++
++    // av_free accepts NULL, so no need to test the arrays first
++    av_free(capture->buffers);
++    av_free(output->buffers);
++
++    if (ctx->fd >= 0) {
++        close(ctx->fd);
++        ctx->fd = -1;
++    }
++
++    av_free(ctx);
++}
++
++// AVBuffer free callback for capture buffers handed downstream: give the
++// buffer back to the driver unless the filter has been shut down, then drop
++// the reference on the shared context that the buffer was holding.
++static void v4l2_free_buffer(void *opaque, uint8_t *unused)
++{
++    V4L2Buffer * const released = opaque;
++    DeintV4L2M2MContextShared * const shared = released->q->ctx;
++
++    if (shared->done == 0)
++        deint_v4l2m2m_enqueue_buffer(released);
++
++    deint_v4l2m2m_destroy_context(shared);
++}
++
++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) // Fill in the layer/plane layout of avbuf->drm_frame and return it cast to uint8_t *; height is the line count used to place chroma planes
++{
++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
++ AVDRMLayerDescriptor *layer;
++
++ /* fill the DRM frame descriptor */
++ drm_desc->nb_objects = avbuf->num_planes;
++ drm_desc->nb_layers = 1;
++
++ layer = &drm_desc->layers[0];
++ layer->nb_planes = avbuf->num_planes;
++
++ for (int i = 0; i < avbuf->num_planes; i++) { // Default layout: one DRM plane per V4L2 plane, each at offset 0 of its own object
++ layer->planes[i].object_index = i;
++ layer->planes[i].offset = 0;
++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
++ }
++
++ switch (layer->format) { // layer->format is read here, not set; v4l2_buffer_export_drm is what writes it
++ case DRM_FORMAT_YUYV:
++ layer->nb_planes = 1;
++ break;
++
++ case DRM_FORMAT_NV12:
++ case DRM_FORMAT_NV21:
++ if (avbuf->num_planes > 1) // Separate V4L2 planes: the default layout above already applies
++ break;
++
++ layer->nb_planes = 2; // Single object: interleaved chroma follows the luma plane
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
++ break;
++
++ case DRM_FORMAT_YUV420:
++ if (avbuf->num_planes > 1) // Separate V4L2 planes: the default layout above already applies
++ break;
++
++ layer->nb_planes = 3; // Single object: Y, then U, then V
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; // Chroma pitch is half the luma pitch
++
++ layer->planes[2].object_index = 0;
++ layer->planes[2].offset = layer->planes[1].offset +
++ ((avbuf->plane_info[0].bytesperline *
++ height) >> 2); // Each chroma plane is a quarter of the luma plane size
++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
++ break;
++
++ default:
++ drm_desc->nb_layers = 0; // Unknown format: report a descriptor with no layers
++ break;
++ }
++
++ return (uint8_t *) drm_desc;
++}
++
++// Dequeue one processed buffer from the queue (waiting up to timeout ms)
++// and wrap it in frame as a DRM-prime frame.
++//
++// frame receives the properties of the source frame it was made from, with
++// timing adjusted by the pts tracker, and a buffer reference whose release
++// gives the V4L2 buffer back to the driver.
++//
++// Returns 0 on success, AVERROR(EAGAIN) if no buffer is ready, or
++// AVERROR(ENOMEM).  On error the caller should free frame as usual.
++static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
++{
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    V4L2Buffer* avbuf;
++
++    av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++    avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
++    if (!avbuf) {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
++        return AVERROR(EAGAIN);
++    }
++
++    // Fill in PTS and anciliary info from src frame
++    // we will want to overwrite some fields as only the pts/dts
++    // fields are updated with new timing in this fn
++    pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
++
++    frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
++                            sizeof(avbuf->drm_frame), v4l2_free_buffer,
++                            avbuf, AV_BUFFER_FLAG_READONLY);
++    if (!frame->buf[0]) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0);
++        // Nothing will ever release this buffer through v4l2_free_buffer,
++        // so give it straight back to the driver rather than losing it
++        deint_v4l2m2m_enqueue_buffer(avbuf);
++        return AVERROR(ENOMEM);
++    }
++
++    // The buffer reference now keeps the shared context alive
++    atomic_fetch_add(&ctx->refcount, 1);
++
++    frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height);
++    frame->format = AV_PIX_FMT_DRM_PRIME;
++    if (ctx->hw_frames_ctx) {
++        frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
++        // Freeing frame releases buf[0], which re-queues the V4L2 buffer
++        // and drops the context reference taken above
++        if (!frame->hw_frames_ctx)
++            return AVERROR(ENOMEM);
++    }
++    frame->height = ctx->height;
++    frame->width = ctx->width;
++
++    // Not interlaced now
++    frame->interlaced_frame = 0;
++    frame->top_field_first = 0;
++    // Pkt duration halved
++    frame->pkt_duration /= 2;
++
++    if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
++        frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
++    }
++
++    av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts);
++    return 0;
++}
++
++// Output link configuration: copy geometry, time base, aspect ratio and
++// format from the input link, find a usable V4L2 device and take a
++// reference to the input's hw frames context if it has one.
++//
++// Returns 0 on success, the error from the device search, or
++// AVERROR(ENOMEM).
++static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
++{
++    AVFilterContext * const avctx = outlink->src;
++    AVFilterLink * const inlink = avctx->inputs[0];
++    DeintV4L2M2MContext * const priv = avctx->priv;
++    DeintV4L2M2MContextShared * const ctx = priv->shared;
++    int ret;
++
++    ctx->height = inlink->h;
++    ctx->width = inlink->w;
++
++    av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height);
++
++    outlink->time_base = inlink->time_base;
++    outlink->w = inlink->w;
++    outlink->h = inlink->h;
++    outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
++    outlink->format = inlink->format;
++    outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate
++
++    ret = deint_v4l2m2m_find_device(ctx);
++    if (ret != 0)
++        return ret;
++
++    // No hw frames context on the input: nothing more to do
++    if (!inlink->hw_frames_ctx)
++        return 0;
++
++    ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
++    return ctx->hw_frames_ctx ? 0 : AVERROR(ENOMEM);
++}
++
++// Advertise the pixel formats the filter negotiates: DRM prime and YUV420P.
++static int deint_v4l2m2m_query_formats(AVFilterContext *avctx)
++{
++    static const enum AVPixelFormat pixel_formats[] = {
++        AV_PIX_FMT_DRM_PRIME,
++        AV_PIX_FMT_YUV420P,
++        AV_PIX_FMT_NONE,
++    };
++    AVFilterFormats * const formats = ff_make_format_list(pixel_formats);
++
++    return ff_set_common_formats(avctx, formats);
++}
++
++// Map a DRM frame descriptor to the matching V4L2 pixel format.
++// Only linear (or unspecified-modifier) YUV420 and NV12 frames held in a
++// single object are supported; anything else yields 0.
++static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
++{
++    const uint64_t modifier = drm_desc->objects[0].format_modifier;
++
++    if (modifier != DRM_FORMAT_MOD_LINEAR && modifier != DRM_FORMAT_MOD_INVALID)
++        return 0;
++
++    if (drm_desc->nb_objects != 1)
++        return 0;
++
++    if (drm_desc->layers[0].format == DRM_FORMAT_YUV420)
++        return V4L2_PIX_FMT_YUV420;
++    if (drm_desc->layers[0].format == DRM_FORMAT_NV12)
++        return V4L2_PIX_FMT_NV12;
++
++    return 0;
++}
++
++// Submit one input frame to the device.
++//
++// The first frame (field_order still V4L2_FIELD_ANY) also configures the
++// device from the frame's DRM descriptor: formats are set on both queues,
++// buffers are allocated and streaming is started.
++//
++// Returns 0 on success or a negative AVERROR.  The caller keeps ownership
++// of in and frees it; on success its references have been moved into the
++// queued buffer.
++static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
++{
++    AVFilterContext *avctx = link->dst;
++    DeintV4L2M2MContext *priv = avctx->priv;
++    DeintV4L2M2MContextShared *ctx = priv->shared;
++    V4L2Queue *capture = &ctx->capture;
++    V4L2Queue *output = &ctx->output;
++    int ret;
++
++    av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n",
++           __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
++    av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
++           avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
++
++    if (ctx->field_order == V4L2_FIELD_ANY) {
++        const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
++        const uint32_t pixelformat = desc_pixelformat(drm_desc);
++
++        if (pixelformat == 0) {
++            av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
++                   av_fourcc2str(drm_desc->layers[0].format),
++                   drm_desc->nb_objects, drm_desc->objects[0].format_modifier);
++            return AVERROR(EINVAL);
++        }
++
++        // The pitch is used as a divisor just below and again when the
++        // format is set, so a descriptor without a usable pitch must be
++        // rejected here
++        if (drm_desc->layers[0].planes[0].pitch <= 0) {
++            av_log(avctx, AV_LOG_ERROR, "%s: Invalid pitch %td in DRM descriptor\n", __func__,
++                   drm_desc->layers[0].planes[0].pitch);
++            return AVERROR(EINVAL);
++        }
++
++        ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
++        ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
++
++        av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
++           drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
++
++        ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
++        if (ret)
++            return ret;
++
++        ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
++        if (ret)
++            return ret;
++
++        ret = deint_v4l2m2m_allocate_buffers(capture);
++        if (ret)
++            return ret;
++
++        ret = deint_v4l2m2m_streamon(capture);
++        if (ret)
++            return ret;
++
++        ret = deint_v4l2m2m_allocate_buffers(output);
++        if (ret)
++            return ret;
++
++        ret = deint_v4l2m2m_streamon(output);
++        if (ret)
++            return ret;
++
++        // Leaving V4L2_FIELD_ANY marks the device as configured
++        if (in->top_field_first)
++            ctx->field_order = V4L2_FIELD_INTERLACED_TB;
++        else
++            ctx->field_order = V4L2_FIELD_INTERLACED_BT;
++
++    }
++
++    ret = deint_v4l2m2m_enqueue_frame(output, in);
++
++    av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret));
++    return ret;
++}
++
++// Filter scheduling callback.
++//
++// In order: forward status from the output back to the input; if output is
++// wanted and the device is configured, dequeue a processed frame and send
++// it downstream (looping until the device is drained once the input has
++// reached EOF or error); pass an input status on to the output; then feed
++// input frames to the device until 6 are queued, requesting more input if
++// fewer are.
++//
++// Returns 0 if anything was done, FFERROR_NOT_READY if not, or a negative
++// AVERROR.
++static int deint_v4l2m2m_activate(AVFilterContext *avctx)
++{
++    DeintV4L2M2MContext * const priv = avctx->priv;
++    DeintV4L2M2MContextShared *const s = priv->shared;
++    AVFilterLink * const outlink = avctx->outputs[0];
++    AVFilterLink * const inlink = avctx->inputs[0];
++    int n = 0;
++    int cn = 99;
++    int instatus = 0;
++    int64_t inpts = 0;
++    int did_something = 0;
++
++    av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++    FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
++
++    ff_inlink_acknowledge_status(inlink, &instatus, &inpts);
++
++    if (!ff_outlink_frame_wanted(outlink)) {
++        av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
++    }
++    else if (s->field_order != V4L2_FIELD_ANY)  // Can't DQ if no setup!
++    {
++        AVFrame * frame = av_frame_alloc();
++        int rv;
++
++again:
++        recycle_q(&s->output);
++        n = count_enqueued(&s->output);
++
++        if (frame == NULL) {
++            av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__);
++            return AVERROR(ENOMEM);
++        }
++
++        // Only wait for the device when it already has plenty to work on
++        rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0);
++        if (rv != 0) {
++            av_frame_free(&frame);
++            if (rv != AVERROR(EAGAIN)) {
++                av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
++                return rv;
++            }
++        }
++        else {
++            frame->interlaced_frame = 0;
++            // frame is always consumed by filter_frame - even on error despite
++            // a somewhat confusing comment in the header
++            rv = ff_filter_frame(outlink, frame);
++
++            if (instatus != 0) {
++                av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__);
++                // The frame just sent now belongs to the link: going round
++                // again with the same pointer would write into (or free) a
++                // frame we no longer own.  Take a fresh one; a failed
++                // allocation is caught by the NULL check after the label.
++                frame = av_frame_alloc();
++                goto again;
++            }
++
++            av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
++            did_something = 1;
++        }
++
++        cn = count_enqueued(&s->capture);
++    }
++
++    if (instatus != 0) {
++        ff_outlink_set_status(outlink, instatus, inpts);
++        av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus));
++        return 0;
++    }
++
++    recycle_q(&s->output);
++    n = count_enqueued(&s->output);
++
++    while (n < 6) {
++        AVFrame * frame;
++        int rv;
++
++        if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
++            av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
++            return rv;
++        }
++
++        if (frame == NULL) {
++            av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
++            break;
++        }
++
++        rv = deint_v4l2m2m_filter_frame(inlink, frame);
++        av_frame_free(&frame);
++
++        if (rv != 0)
++            return rv;
++
++        av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
++        ++n;
++    }
++
++    if (n < 6) {
++        ff_inlink_request_frame(inlink);
++        did_something = 1;
++        av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
++    }
++
++    if (n > 4 && ff_outlink_frame_wanted(outlink)) {
++        ff_filter_set_ready(avctx, 1);
++        did_something = 1;
++        av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
++    }
++
++    av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn);
++    return did_something ? 0 : FFERROR_NOT_READY;
++}
++
++// Filter init: allocate the shared context, give it its defaults (no
++// device, 8 output and 12 capture buffers requested, field order not yet
++// known) and set up pts tracking.
++//
++// Returns 0 on success, AVERROR(ENOMEM) if the context cannot be allocated,
++// or the error from pts_track_init.  After a pts_track_init failure the
++// context stays attached to priv->shared with its single reference, so the
++// filter's uninit callback releases it.
++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
++{
++    DeintV4L2M2MContext * const priv = avctx->priv;
++    DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
++    int ret;
++
++    if (!ctx) {
++        av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0);
++        return AVERROR(ENOMEM);
++    }
++    priv->shared = ctx;
++    ctx->logctx = priv;
++    ctx->fd = -1;
++    ctx->output.ctx = ctx;
++    ctx->output.num_buffers = 8;
++    ctx->capture.ctx = ctx;
++    ctx->capture.num_buffers = 12;
++    ctx->done = 0;
++    ctx->field_order = V4L2_FIELD_ANY;
++
++    // Must be valid before any error return below: destroy_context only
++    // frees the context when it drops the count from 1
++    atomic_init(&ctx->refcount, 1);
++
++    ret = pts_track_init(&ctx->track, priv);
++    if (ret < 0) {
++        av_log(priv, AV_LOG_ERROR, "%s: error allocating pts tracker\n", __func__);
++        return ret;
++    }
++
++    return 0;
++}
++
++// Filter uninit: mark the shared context as finished, detach it from the
++// filter's log context, free the pts tracker and drop the filter's
++// reference.  The context itself is freed once the last capture buffer
++// still held downstream has been released.
++static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
++{
++    DeintV4L2M2MContext *priv = avctx->priv;
++    DeintV4L2M2MContextShared *ctx = priv->shared;
++
++    // init may have failed before the shared context was allocated
++    if (ctx == NULL)
++        return;
++
++    ctx->done = 1;
++    ctx->logctx = NULL;  // Log to NULL works, log to missing crashes
++    pts_track_uninit(&ctx->track);
++    deint_v4l2m2m_destroy_context(ctx);
++    // The filter's reference is gone: don't leave a pointer to it behind
++    priv->shared = NULL;
++}
++
++static const AVOption deinterlace_v4l2m2m_options[] = { // The filter has no user options: the table holds only the terminator
++ { NULL },
++};
++
++AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); // Presumably defines the deinterlace_v4l2m2m_class used as .priv_class - confirm against the macro
++
++static const AVFilterPad deint_v4l2m2m_inputs[] = { // Single video input; no per-pad callbacks are set here
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ },
++ { NULL }
++};
++
++static const AVFilterPad deint_v4l2m2m_outputs[] = { // Single video output
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .config_props = deint_v4l2m2m_config_props, // Copies link properties from the input and opens the V4L2 device
++ },
++ { NULL }
++};
++
++AVFilter ff_vf_deinterlace_v4l2m2m = { // Filter definition for "deinterlace_v4l2m2m"
++ .name = "deinterlace_v4l2m2m",
++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"),
++ .priv_size = sizeof(DeintV4L2M2MContext),
++ .init = &deint_v4l2m2m_init,
++ .uninit = &deint_v4l2m2m_uninit,
++ .query_formats = &deint_v4l2m2m_query_formats,
++ .inputs = deint_v4l2m2m_inputs,
++ .outputs = deint_v4l2m2m_outputs,
++ .priv_class = &deinterlace_v4l2m2m_class,
++ .activate = deint_v4l2m2m_activate, // All frame movement is done in activate; the pads carry no filter_frame callback
++};
+--- /dev/null
++++ b/libavfilter/vf_unsand.c
+@@ -0,0 +1,234 @@
++/*
++ * Copyright (c) 2007 Bobby Bingham
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * unsand video filter: convert Raspberry Pi sand pixel formats to planar YUV
++ */
++
++#include <string.h>
++
++#include "libavutil/internal.h"
++#include "libavutil/mem.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/opt.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "video.h"
++
++typedef struct UnsandContext {
++ const AVClass *class; // only member: the filter keeps no other per-instance state
++} UnsandContext;
++
++static av_cold void uninit(AVFilterContext *ctx)
++{
++// Intentionally empty: init() allocates nothing, so there is nothing to release.
++}
++
++static av_cold int init(AVFilterContext *ctx)
++{
++// No per-instance setup is performed; always reports success.
++
++ return 0;
++}
++
++// Input pad callback: forward the frame untouched when it already has the output format, otherwise convert it into a newly allocated frame.
++static int filter_frame(AVFilterLink *link, AVFrame *in)
++{
++ AVFilterLink * const outlink = link->dst->outputs[0];
++ AVFrame *out = NULL;
++ int rv = 0;
++
++ if (outlink->format == in->format) {
++ // If nothing to do then do nothing
++ out = in;
++ }
++ else
++ {
++ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
++ {
++ rv = AVERROR(ENOMEM);
++ goto fail;
++ }
++ if (av_rpi_sand_to_planar_frame(out, in) != 0)
++ {
++ rv = -1; // NOTE(review): bare -1 rather than an AVERROR() code
++ goto fail;
++ }
++
++ av_frame_free(&in); // converted: the source frame is no longer needed
++ }
++
++ return ff_filter_frame(outlink, out);
++
++fail:
++ av_frame_free(&out); // out is still NULL here if the buffer allocation failed
++ av_frame_free(&in);
++ return rv;
++}
++
++#if 0 // debug-only helper, compiled out
++static void dump_fmts(const AVFilterFormats * fmts)
++{
++ int i;
++ if (fmts== NULL) {
++ printf("NULL\n");
++ return;
++ }
++ for (i = 0; i < fmts->nb_formats; ++i) { // print every format id on one line
++ printf(" %d", fmts->formats[i]);
++ }
++ printf("\n");
++}
++#endif
++
++static int query_formats(AVFilterContext *ctx)
++{
++// Negotiate formats: offer planar YUV on the output for every sand format the input can supply.
++ int ret;
++
++ // If we aren't connected at both ends then just do nothing
++ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
++ return 0;
++
++ // From here on: returns 0 on success, AVERROR(EAGAIN) while the source has
++ // not yet offered any formats, AVERROR(ENOMEM) when a list cannot be
++ // allocated, or the error returned by ff_formats_ref().
++ // Formats that are neither sand nor YUV420P(10) are copied to the output list unchanged.
++
++ // Our output formats depend on our input formats and we can't/don't
++ // want to convert between bit depths so we need to wait for the source
++ // to have an opinion before we do
++ if (ctx->inputs[0]->in_formats == NULL)
++ return AVERROR(EAGAIN);
++
++ // Accept anything
++ if (ctx->inputs[0]->out_formats == NULL &&
++ (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0)
++ return ret;
++
++ // Filter out sand formats
++
++ // Generate a container if we don't already have one
++ if (ctx->outputs[0]->in_formats == NULL)
++ {
++ // Somewhat rubbish way of ensuring we have a good structure
++ static const enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
++ AVFilterFormats *formats = ff_make_format_list(out_fmts);
++
++ if (formats == NULL)
++ return AVERROR(ENOMEM);
++ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0)
++ return ret;
++ }
++
++ // Replace old format list with new filtered list derived from what our
++ // input says it can do
++ {
++ const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats;
++ AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats;
++ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
++ int i;
++ int n = 0;
++ int seen_420p = 0;
++ int seen_420p10 = 0;
++
++ if (dst_fmts == NULL) // bail out before the array is written or the old list is freed
++ return AVERROR(ENOMEM);
++
++ for (i = 0; i < src_ff->nb_formats; ++i) {
++ const enum AVPixelFormat f = src_ff->formats[i];
++
++ switch (f){
++ case AV_PIX_FMT_YUV420P:
++ case AV_PIX_FMT_SAND128:
++ case AV_PIX_FMT_RPI4_8:
++ if (!seen_420p) {
++ seen_420p = 1;
++ dst_fmts[n++] = AV_PIX_FMT_YUV420P;
++ }
++ break;
++ case AV_PIX_FMT_SAND64_10:
++ case AV_PIX_FMT_YUV420P10:
++ case AV_PIX_FMT_RPI4_10:
++ if (!seen_420p10) {
++ seen_420p10 = 1;
++ dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
++ }
++ break;
++ default:
++ dst_fmts[n++] = f;
++ break;
++ }
++ }
++
++ av_freep(&dst_ff->formats);
++ dst_ff->formats = dst_fmts;
++ dst_ff->nb_formats = n;
++ }
++
++ return 0;
++}
++
++
++#define OFFSET(x) offsetof(UnsandContext, x) // NOTE(review): not referenced by the empty option table below
++static const AVOption unsand_options[] = {
++ { NULL }
++};
++
++
++AVFILTER_DEFINE_CLASS(unsand);
++
++static const AVFilterPad avfilter_vf_unsand_inputs[] = { // one video input pad
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .filter_frame = filter_frame, // every input frame goes through filter_frame()
++ },
++ { NULL }
++};
++
++static const AVFilterPad avfilter_vf_unsand_outputs[] = { // one video output pad, no callbacks
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO
++ },
++ { NULL }
++};
++
++AVFilter ff_vf_unsand = { // non-static filter definition for "unsand"
++ .name = "unsand",
++ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
++
++ .init = init,
++ .uninit = uninit,
++
++ .query_formats = query_formats,
++
++ .priv_size = sizeof(UnsandContext),
++ .priv_class = &unsand_class,
++
++ .inputs = avfilter_vf_unsand_inputs,
++ .outputs = avfilter_vf_unsand_outputs,
++};
++
+--- a/libavformat/utils.c
++++ b/libavformat/utils.c
+@@ -3051,6 +3051,40 @@ static int has_codec_parameters(AVStream
+ return 1;
+ }
+
++#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
++// Reopen avctx with a fallback decoder. This should be quite general purpose but avoid possible
++// conflicts by limiting usage to cases where we know it works. Returns 0 or a negative AVERROR code.
++static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
++{
++ // Only try fallback if we know it is supported (HEVC only)
++ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
++ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
++ int err;
++
++ // Failed to find fallback or we are already at the fallback
++ if (new_codec == NULL || new_codec == old_codec)
++ {
++ return AVERROR_DECODER_NOT_FOUND;
++ }
++
++ // * This may be dodgy - header says to not use this fn,
++ // especially if we are going to reopen the context...
++ // (but it does seem to work for our cases)
++ if (avcodec_is_open(avctx)) {
++ avcodec_close(avctx);
++ }
++
++ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0) // propagate the open failure unchanged
++ {
++ return err;
++ }
++
++ return 0;
++}
++#else // no fallback in this build: the macro always reports "decoder not found"
++#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
++#endif
++
+ /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
+ static int try_decode_frame(AVFormatContext *s, AVStream *st,
+ const AVPacket *avpkt, AVDictionary **options)
+@@ -3085,7 +3119,11 @@ static int try_decode_frame(AVFormatCont
+ av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
+ if (s->codec_whitelist)
+ av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
+- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
++ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
++ {
++ // Try fallback if it looks worth a try
++ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
++ }
+ if (!options)
+ av_dict_free(&thread_opt);
+ if (ret < 0) {
+@@ -3116,6 +3154,14 @@ static int try_decode_frame(AVFormatCont
+ if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
+ avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
+ ret = avcodec_send_packet(avctx, &pkt);
++
++ // If we are going to want to fall back we should know here
++ if (ret == AVERROR_DECODER_NOT_FOUND) {
++ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
++ break;
++ continue;
++ }
++
+ if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
+ break;
+ if (ret >= 0)
+@@ -3726,9 +3772,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
+ // Try to just open decoders, in case this is enough to get parameters.
+ if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
+ if (codec && !avctx->codec)
+- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
+- av_log(ic, AV_LOG_WARNING,
+- "Failed to open codec in %s\n",__FUNCTION__);
++ {
++ int err;
++
++ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
++ {
++ if (err == AVERROR_DECODER_NOT_FOUND) {
++ err = try_fallback_decoder(avctx, codec, options ? &options[i] : &thread_opt);
++ }
++ if (err < 0) {
++ av_log(ic, AV_LOG_WARNING,
++ "Failed to open codec in %s\n",__FUNCTION__);
++ }
++ }
++ }
+ }
+ if (!options)
+ av_dict_free(&thread_opt);
+--- a/libavutil/Makefile
++++ b/libavutil/Makefile
+@@ -68,6 +68,7 @@ HEADERS = adler32.h
+ rational.h \
+ replaygain.h \
+ ripemd.h \
++ rpi_sand_fns.h \
+ samplefmt.h \
+ sha.h \
+ sha512.h \
+@@ -86,6 +87,7 @@ HEADERS = adler32.h
+ tx.h \
+
+ HEADERS-$(CONFIG_LZO) += lzo.h
++HEADERS-$(CONFIG_RPI) += rpi_sand_fn_pw.h
+
+ ARCH_HEADERS = bswap.h \
+ intmath.h \
+@@ -180,6 +182,7 @@ OBJS-$(CONFIG_LZO)
+ OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o
+ OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o
+ OBJS-$(CONFIG_QSV) += hwcontext_qsv.o
++OBJS-$(CONFIG_SAND) += rpi_sand_fns.o
+ OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o
+ OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o
+ OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o
+--- a/libavutil/aarch64/Makefile
++++ b/libavutil/aarch64/Makefile
+@@ -1,4 +1,6 @@
+ OBJS += aarch64/cpu.o \
+ aarch64/float_dsp_init.o \
+
+-NEON-OBJS += aarch64/float_dsp_neon.o
++NEON-OBJS += aarch64/float_dsp_neon.o \
++ aarch64/rpi_sand_neon.o \
++
+--- /dev/null
++++ b/libavutil/aarch64/rpi_sand_neon.S
+@@ -0,0 +1,781 @@
++/*
++Copyright (c) 2021 Michael Eiler
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: Michael Eiler <eiler.mike@gmail.com>
++*/
++
++#include "asm.S"
++
++// void ff_rpi_sand8_lines_to_planar_y8(
++// uint8_t * dest, : x0
++// unsigned int dst_stride, : w1
++// const uint8_t * src, : x2
++// unsigned int src_stride1, : w3, always 128 (not read; w3 is reused for the dst row skip)
++// unsigned int src_stride2, : w4
++// unsigned int _x, : w5 (not read; reused as a scratch counter)
++// unsigned int y, : w6 (not read; reused as a scratch byte)
++// unsigned int _w, : w7
++// unsigned int h); : [sp, #0]
++
++function ff_rpi_sand8_lines_to_planar_y8, export=1
++ // w15 contains the number of rows we need to process
++ ldr w15, [sp, #0]
++
++ // w8 will contain the number of full blocks per row
++ // w8 = floor(_w/stride1)
++ // stride1 is assumed to always be 128
++ lsr w8, w7, #7
++
++ // in case the width of the image is not a multiple of 128, there will
++ // be an incomplete block at the end of every row
++ // w9 contains the number of pixels stored within this block
++ // w9 = _w - w8 * 128
++ lsl w9, w8, #7
++ sub w9, w7, w9
++
++ // w3 = dst_stride - _w: dst bytes to skip at the end of every row
++ sub w3, w1, w7
++
++ // this is the value we have to add to the src pointer after reading a complete block
++ // it will move the address to the start of the next block: w10 = stride2 * stride1 - stride1
++ lsl w10, w4, #7
++ sub w10, w10, #128
++
++ // w11 is the row offset, meaning the start offset of the first block of every column
++ // this will be increased with stride1 within every iteration of the row_loop
++ eor w11, w11, w11
++
++ // w12 = 0, processed row count
++ eor w12, w12, w12
++row_loop:
++ // start of the first block within the current row
++ // x13 = row offset + src
++ mov x13, x2
++ add x13, x13, x11
++
++ // w14 = 0, processed block count
++ eor w14, w14, w14
++
++ cmp w8, #0
++ beq no_main_y8
++
++block_loop:
++ // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128
++ // fortunately these aren't callee saved ones, meaning we don't need to backup them
++ ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64
++ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64
++
++ // write these registers back to the destination vector and increase the dst address by 128
++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64
++
++ // move the source register to the beginning of the next block (x13 = src + block offset)
++ add x13, x13, x10
++ // increase the block counter
++ add w14, w14, #1
++
++ // continue with the block_loop if we haven't copied all full blocks yet
++ cmp w8, w14
++ bgt block_loop
++
++ // handle the last block at the end of each row
++ // at most 127 byte values copied from src to dst
++no_main_y8:
++ eor w5, w5, w5 // i = 0
++incomplete_block_loop_y8:
++ cmp w5, w9
++ bge incomplete_block_loop_end_y8
++
++ ldrb w6, [x13]
++ strb w6, [x0]
++ add x13, x13, #1
++ add x0, x0, #1
++
++ add w5, w5, #1
++ b incomplete_block_loop_y8
++incomplete_block_loop_end_y8:
++ // step dst over the unused tail of the row so the next row starts on dst_stride
++ add x0, x0, w3, sxtw
++ // increase the row offset by 128 (stride1)
++ add w11, w11, #128
++ // increment the row counter
++ add w12, w12, #1
++
++ // process the next row if we haven't finished yet
++ cmp w15, w12
++ bgt row_loop
++
++ ret
++endfunc
++
++
++
++// void ff_rpi_sand8_lines_to_planar_c8(
++// uint8_t * dst_u, : x0
++// unsigned int dst_stride_u, : w1
++// uint8_t * dst_v, : x2
++// unsigned int dst_stride_v, : w3 (not read; dst_stride_u is applied to both planes)
++// const uint8_t * src, : x4
++// unsigned int stride1, : w5 == 128 (not read; w5 is reused as a counter)
++// unsigned int stride2, : w6
++// unsigned int _x, : w7 (not read; w7 is reloaded with _w)
++// unsigned int y, : [sp, #0] (not read)
++// unsigned int _w, : [sp, #8]
++// unsigned int h); : [sp, #16]
++
++function ff_rpi_sand8_lines_to_planar_c8, export=1
++ // w7 = width
++ ldr w7, [sp, #8]
++
++ // w15 contains the number of rows we need to process
++ // counts down
++ ldr w15, [sp, #16]
++
++ // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6
++ mov w8, w7
++ lsr w8, w8, #6
++
++ // number of pixels in block at the end of every row
++ // w9 = _w - (w8 * 64)
++ lsl w9, w8, #6
++ sub w9, w7, w9
++
++ // Skip at the end of the line to account for stride
++ sub w12, w1, w7
++
++ // address delta to the beginning of the next block
++ // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128
++ lsl w10, w6, #7
++ sub w10, w10, #128
++
++ // w11 = row address start offset = 0
++ eor w11, w11, w11
++
++row_loop_c8:
++ // start of the first block within the current row
++ // x13 = row offset + src
++ mov x13, x4
++ add x13, x13, x11
++
++ // w14 = 0, processed block count
++ eor w14, w14, w14
++
++ cmp w8, #0
++ beq no_main_c8
++
++block_loop_c8:
++ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values
++ ld2 { v0.16b, v1.16b }, [x13], #32
++ ld2 { v2.16b, v3.16b }, [x13], #32
++ ld2 { v4.16b, v5.16b }, [x13], #32
++ ld2 { v6.16b, v7.16b }, [x13], #32
++
++ // swap register so that we can write them out with a single instruction
++ mov v16.16b, v1.16b
++ mov v17.16b, v3.16b
++ mov v18.16b, v5.16b
++ mov v1.16b, v2.16b
++ mov v2.16b, v4.16b
++ mov v3.16b, v6.16b
++ mov v4.16b, v16.16b
++ mov v5.16b, v17.16b
++ mov v6.16b, v18.16b
++
++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64
++
++ // increment block counter and move src to the beginning of the next block
++ add w14, w14, #1
++ add x13, x13, x10
++
++ // jump to block_loop_c8 iff the block count is smaller than the number of full blocks
++ cmp w8, w14
++ bgt block_loop_c8
++
++no_main_c8:
++ // handle incomplete block at the end of every row
++ eor w5, w5, w5 // U/V pair counter for the incomplete block
++incomplete_block_loop_c8:
++ cmp w5, w9
++ bge incomplete_block_loop_end_c8
++
++ ldrb w1, [x13]
++ strb w1, [x0]
++ add x13, x13, #1
++
++ ldrb w1, [x13]
++ strb w1, [x2]
++ add x13, x13, #1
++
++ add x0, x0, #1
++ add x2, x2, #1
++
++ add w5, w5, #1
++ b incomplete_block_loop_c8
++incomplete_block_loop_end_c8:
++
++ // increase row_offset by stride1
++ add w11, w11, #128
++ add x0, x0, w12, sxtw
++ add x2, x2, w12, sxtw
++
++ // jump to row_loop_c8 while rows remain (w15 counts down to zero)
++ subs w15, w15, #1
++ bgt row_loop_c8
++
++ ret
++endfunc
++
++//void ff_rpi_sand30_lines_to_planar_c16(
++// uint8_t * dst_u, // [x0]
++// unsigned int dst_stride_u, // [w1] == _w*2
++// uint8_t * dst_v, // [x2]
++// unsigned int dst_stride_v, // [w3] == _w*2
++// const uint8_t * src, // [x4]
++// unsigned int stride1, // [w5] == 128
++// unsigned int stride2, // [w6]
++// unsigned int _x, // [w7] == 0
++// unsigned int y, // [sp, #0] == 0
++// unsigned int _w, // [sp, #8] -> w3
++// unsigned int h); // [sp, #16] -> w7
++
++.macro rpi_sand30_lines_to_planar_c16_block_half // unpack 64 src bytes at x13 into 48 masked 16-bit values written at [sp]
++ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64
++
++ xtn v4.4h, v0.4s // low 16 bits of each word (first 10-bit field after masking)
++ ushr v0.4s, v0.4s, #10
++ xtn v5.4h, v0.4s // second 10-bit field
++ ushr v0.4s, v0.4s, #10
++ xtn v6.4h, v0.4s // third 10-bit field
++ xtn2 v4.8h, v1.4s
++ ushr v1.4s, v1.4s, #10
++ xtn2 v5.8h, v1.4s
++ ushr v1.4s, v1.4s, #10
++ xtn2 v6.8h, v1.4s
++ and v4.16b, v4.16b, v16.16b // v16 is the per-halfword mask set up by the including function
++ and v5.16b, v5.16b, v16.16b
++ and v6.16b, v6.16b, v16.16b
++ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 // NOTE(review): sp is used as a moving write pointer, so this data is briefly below sp
++
++ xtn v4.4h, v2.4s // same unpacking for the second pair of source registers
++ ushr v2.4s, v2.4s, #10
++ xtn v5.4h, v2.4s
++ ushr v2.4s, v2.4s, #10
++ xtn v6.4h, v2.4s
++ xtn2 v4.8h, v3.4s
++ ushr v3.4s, v3.4s, #10
++ xtn2 v5.8h, v3.4s
++ ushr v3.4s, v3.4s, #10
++ xtn2 v6.8h, v3.4s
++ and v4.16b, v4.16b, v16.16b
++ and v5.16b, v5.16b, v16.16b
++ and v6.16b, v6.16b, v16.16b
++ st3 { v4.8h, v5.8h, v6.8h }, [sp]
++ sub sp, sp, #48 // restore sp to its value on macro entry
++.endm
++
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++ stp x19, x20, [sp, #-48]! // save the callee-saved registers used below
++ stp x21, x22, [sp, #16]
++ stp x23, x24, [sp, #32]
++
++ ldr w3, [sp, #48+8] // w3 = width
++ ldr w7, [sp, #48+16] // w7 = height
++
++ // reserve space on the stack for intermediate results
++ sub sp, sp, #256
++
++ // number of 128byte blocks per row, w8 = width / 48
++ mov w9, #48
++ udiv w8, w3, w9
++
++ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
++ mul w9, w8, w9
++ sub w9, w3, w9
++
++ // row offset, the beginning of the next row to process
++ eor w10, w10, w10
++
++ // offset to the beginning of the next block, w11 = stride2 * 128 - 128
++ lsl w11, w6, #7
++ sub w11, w11, #128
++
++ // decrease the height by one and in case of remaining pixels increase the block count by one
++ sub w7, w7, #1
++ cmp w9, #0
++ cset w19, ne // w19 == 1 iff remaining pixels != 0
++ add w8, w8, w19
++
++ // bytes we have to move dst back by at the end of every row
++ mov w21, #48*2
++ mul w21, w21, w8
++ sub w21, w1, w21
++
++ mov w20, #0 // w20 = flag, last row processed
++
++ mov x12, #0x03ff03ff03ff03ff
++ dup v16.2d, x12 // v16 = 10-bit mask in every halfword, used by the block_half macro
++
++ // iterate through rows, row counter = w12 = 0
++ eor w12, w12, w12
++row_loop_c16:
++ cmp w12, w7
++ bge row_loop_c16_fin
++
++ // address of row data = src + row_offset
++ mov x13, x4
++ add x13, x13, x10
++
++ eor w14, w14, w14
++block_loop_c16:
++ cmp w14, w8
++ bge block_loop_c16_fin
++
++ rpi_sand30_lines_to_planar_c16_block_half
++
++ ld2 { v0.8h, v1.8h }, [sp], #32 // de-interleave: even halfwords (U) / odd halfwords (V)
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp]
++ sub sp, sp, #64
++
++ st1 { v0.8h }, [x0], #16
++ st1 { v2.8h }, [x0], #16
++ st1 { v4.8h }, [x0], #16
++ st1 { v1.8h }, [x2], #16
++ st1 { v3.8h }, [x2], #16
++ st1 { v5.8h }, [x2], #16
++
++ rpi_sand30_lines_to_planar_c16_block_half
++
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp]
++ sub sp, sp, #64
++
++ st1 { v0.8h }, [x0], #16
++ st1 { v2.8h }, [x0], #16
++ st1 { v4.8h }, [x0], #16
++ st1 { v1.8h }, [x2], #16
++ st1 { v3.8h }, [x2], #16
++ st1 { v5.8h }, [x2], #16
++
++ add x13, x13, x11 // offset to next block
++ add w14, w14, #1
++ b block_loop_c16
++block_loop_c16_fin:
++
++ add w10, w10, #128
++ add w12, w12, #1
++ add x0, x0, w21, sxtw // step dst pointers by w21 (dst_stride minus the bytes just written)
++ add x2, x2, w21, sxtw
++ b row_loop_c16
++row_loop_c16_fin:
++
++ cmp w20, #1
++ beq row_loop_c16_fin2
++ mov w20, #1
++ sub w8, w8, w19 // decrease block count by w19
++ add w7, w7, #1 // increase height
++ b row_loop_c16
++
++row_loop_c16_fin2:
++ sub x0, x0, w21, sxtw // undo the w21 step applied after the last row
++ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
++
++ // last incomplete block to be finished
++ // read operations are fine, stride2 is more than large enough even if rem_pix is 0
++ rpi_sand30_lines_to_planar_c16_block_half
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp], #32
++ rpi_sand30_lines_to_planar_c16_block_half
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp]
++ sub sp, sp, #160
++
++ mov x4, sp
++ eor w20, w20, w20
++rem_pix_c16_loop:
++ cmp w20, w9
++ bge rem_pix_c16_fin
++
++ ldr w22, [x4], #4 // one pair from the stack buffer: U in the low half, V in the high half
++ strh w22, [x0], #2 // store only the 16-bit U sample (a 32-bit store would overrun dst_u)
++ lsr w22, w22, #16
++ strh w22, [x2], #2 // store only the 16-bit V sample
++
++ add w20, w20, #1
++ b rem_pix_c16_loop
++rem_pix_c16_fin:
++
++ add sp, sp, #256
++
++ ldp x23, x24, [sp, #32]
++ ldp x21, x22, [sp, #16]
++ ldp x19, x20, [sp], #48
++ ret
++endfunc
++
++
++
++//void ff_rpi_sand30_lines_to_planar_p010(
++// uint8_t * dest,
++// unsigned int dst_stride,
++// const uint8_t * src,
++// unsigned int src_stride1,
++// unsigned int src_stride2,
++// unsigned int _x,
++// unsigned int y,
++// unsigned int _w,
++// unsigned int h);
++
++// void ff_rpi_sand30_lines_to_planar_y16(
++// uint8_t * dest, : x0
++// unsigned int dst_stride, : w1
++// const uint8_t * src, : x2
++// unsigned int src_stride1, : w3, always 128
++// unsigned int src_stride2, : w4
++// unsigned int _x, : w5
++// unsigned int y, : w6
++// unsigned int _w, : w7
++// unsigned int h); : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for writes
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++ lsl w4, w4, #7
++ sub w4, w4, #64 // w4 = stride2 * 128 - 64: post-increment for the second load of each stripe
++ sub w1, w1, w7, lsl #1 // w1 = dst_stride - 2 * _w: dst bytes to skip after each row
++ uxtw x6, w6
++ add x8, x2, x6, lsl #7 // x8 = src + y * 128: start of the first row
++ ldr w6, [sp, #0] // w6 = h, the row counter
++
++10:
++ mov x2, x8
++ mov w5, w7 // w5 = pixels left in this row
++1:
++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++ subs w5, w5, #96 // 32 words * 3 pixels; the flags are consumed by blt/bne below
++
++ // v0, v1
++
++ shrn v18.4h, v0.4s, #14
++ xtn v16.4h, v0.4s
++ shrn v17.4h, v0.4s, #10
++
++ shrn2 v18.8h, v1.4s, #14
++ xtn2 v16.8h, v1.4s
++ shrn2 v17.8h, v1.4s, #10
++
++ ushr v18.8h, v18.8h, #6
++ bic v16.8h, #0xfc, lsl #8
++ bic v17.8h, #0xfc, lsl #8
++
++ // v2, v3
++
++ shrn v21.4h, v2.4s, #14
++ xtn v19.4h, v2.4s
++ shrn v20.4h, v2.4s, #10
++
++ shrn2 v21.8h, v3.4s, #14
++ xtn2 v19.8h, v3.4s
++ shrn2 v20.8h, v3.4s, #10
++
++ ushr v21.8h, v21.8h, #6
++ bic v19.8h, #0xfc, lsl #8
++ bic v20.8h, #0xfc, lsl #8
++
++ // v4, v5
++
++ shrn v24.4h, v4.4s, #14
++ xtn v22.4h, v4.4s
++ shrn v23.4h, v4.4s, #10
++
++ shrn2 v24.8h, v5.4s, #14
++ xtn2 v22.8h, v5.4s
++ shrn2 v23.8h, v5.4s, #10
++
++ ushr v24.8h, v24.8h, #6
++ bic v22.8h, #0xfc, lsl #8
++ bic v23.8h, #0xfc, lsl #8
++
++ // v6, v7
++
++ shrn v27.4h, v6.4s, #14
++ xtn v25.4h, v6.4s
++ shrn v26.4h, v6.4s, #10
++
++ shrn2 v27.8h, v7.4s, #14
++ xtn2 v25.8h, v7.4s
++ shrn2 v26.8h, v7.4s, #10
++
++ ushr v27.8h, v27.8h, #6
++ bic v25.8h, #0xfc, lsl #8
++ bic v26.8h, #0xfc, lsl #8
++
++ blt 2f
++
++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
++ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48
++ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48
++
++ bne 1b
++
++11:
++ subs w6, w6, #1
++ add x0, x0, w1, uxtw
++ add x8, x8, #128
++ bne 10b
++
++ ret
++
++// Partial final write
++2:
++ cmp w5, #48-96
++ blt 1f
++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
++ beq 11b
++ mov v16.16b, v22.16b
++ mov v17.16b, v23.16b
++ sub w5, w5, #48
++ mov v18.16b, v24.16b
++ mov v19.16b, v25.16b
++ mov v20.16b, v26.16b
++ mov v21.16b, v27.16b
++1:
++ cmp w5, #24-96
++ blt 1f
++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
++ beq 11b
++ mov v16.16b, v19.16b
++ mov v17.16b, v20.16b
++ sub w5, w5, #24
++ mov v18.16b, v21.16b
++1:
++ cmp w5, #12-96
++ blt 1f
++ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24
++ beq 11b
++ mov v16.2d[0], v16.2d[1]
++ sub w5, w5, #12
++ mov v17.2d[0], v17.2d[1]
++ mov v18.2d[0], v18.2d[1]
++1:
++ cmp w5, #6-96
++ blt 1f
++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
++ st3 {v16.h, v17.h, v18.h}[1], [x0], #6
++ beq 11b
++ mov v16.2s[0], v16.2s[1]
++ sub w5, w5, #6
++ mov v17.2s[0], v17.2s[1]
++ mov v18.2s[0], v18.2s[1]
++1:
++ cmp w5, #3-96
++ blt 1f
++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
++ beq 11b
++ mov v16.4h[0], v16.4h[1]
++ sub w5, w5, #3
++ mov v17.4h[0], v17.4h[1]
++1:
++ cmp w5, #2-96
++ blt 1f
++ st2 {v16.h, v17.h}[0], [x0], #4
++ b 11b
++1:
++ st1 {v16.h}[0], [x0], #2
++ b 11b
++
++endfunc
++
++// void ff_rpi_sand30_lines_to_planar_y8(
++// uint8_t * dest, : x0
++// unsigned int dst_stride, : w1
++// const uint8_t * src, : x2
++// unsigned int src_stride1, : w3, always 128
++// unsigned int src_stride2, : w4
++// unsigned int _x, : w5
++// unsigned int y, : w6
++// unsigned int _w, : w7
++// unsigned int h); : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for writes
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++ lsl w4, w4, #7
++ sub w4, w4, #64 // w4 = stride2 * 128 - 64: post-increment for the second load of each stripe
++ sub w1, w1, w7 // w1 = dst_stride - _w: dst bytes to skip after each row
++ uxtw x6, w6
++ add x8, x2, x6, lsl #7 // x8 = src + y * 128: start of the first row
++ ldr w6, [sp, #0] // w6 = h, the row counter
++
++10:
++ mov x2, x8
++ mov w5, w7 // w5 = pixels left in this row
++1:
++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++ subs w5, w5, #96 // 32 words * 3 pixels; the flags are consumed by blt/bne below
++
++ // v0, v1
++
++ shrn v18.4h, v0.4s, #16
++ xtn v16.4h, v0.4s
++ shrn v17.4h, v0.4s, #12
++
++ shrn2 v18.8h, v1.4s, #16
++ xtn2 v16.8h, v1.4s
++ shrn2 v17.8h, v1.4s, #12
++
++ shrn v18.8b, v18.8h, #6
++ shrn v16.8b, v16.8h, #2
++ xtn v17.8b, v17.8h
++
++ // v2, v3
++
++ shrn v21.4h, v2.4s, #16
++ xtn v19.4h, v2.4s
++ shrn v20.4h, v2.4s, #12
++
++ shrn2 v21.8h, v3.4s, #16
++ xtn2 v19.8h, v3.4s
++ shrn2 v20.8h, v3.4s, #12
++
++ shrn2 v18.16b, v21.8h, #6
++ shrn2 v16.16b, v19.8h, #2
++ xtn2 v17.16b, v20.8h
++
++ // v4, v5
++
++ shrn v24.4h, v4.4s, #16
++ xtn v22.4h, v4.4s
++ shrn v23.4h, v4.4s, #12
++
++ shrn2 v24.8h, v5.4s, #16
++ xtn2 v22.8h, v5.4s
++ shrn2 v23.8h, v5.4s, #12
++
++ shrn v21.8b, v24.8h, #6
++ shrn v19.8b, v22.8h, #2
++ xtn v20.8b, v23.8h
++
++ // v6, v7
++
++ shrn v27.4h, v6.4s, #16
++ xtn v25.4h, v6.4s
++ shrn v26.4h, v6.4s, #12
++
++ shrn2 v21.16b, v27.8h, #6
++ shrn2 v19.16b, v25.8h, #2
++ xtn2 v20.16b, v26.8h
++
++ blt 2f
++
++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
++ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48
++
++ bne 1b
++
++11:
++ subs w6, w6, #1
++ add x0, x0, w1, uxtw
++ add x8, x8, #128
++ bne 10b
++
++ ret
++
++// Partial final write
++2:
++ cmp w5, #48-96
++ blt 1f
++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
++ beq 11b
++ mov v16.16b, v19.16b // second 48 packed 8-bit pixels live in v19-v21 (v22-v24 are 16-bit temporaries)
++ mov v17.16b, v20.16b
++ sub w5, w5, #48
++ mov v18.16b, v21.16b
++1:
++ cmp w5, #24-96
++ blt 1f
++ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24
++ beq 11b
++ mov v16.2d[0], v16.2d[1]
++ sub w5, w5, #24
++ mov v17.2d[0], v17.2d[1]
++ mov v18.2d[0], v18.2d[1]
++1:
++ cmp w5, #12-96
++ blt 1f
++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[2], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[3], [x0], #3
++ beq 11b
++ mov v16.2s[0], v16.2s[1]
++ sub w5, w5, #12
++ mov v17.2s[0], v17.2s[1]
++ mov v18.2s[0], v18.2s[1]
++1:
++ cmp w5, #6-96
++ blt 1f
++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
++ beq 11b
++ mov v16.4h[0], v16.4h[1]
++ sub w5, w5, #6
++ mov v17.4h[0], v17.4h[1]
++ mov v18.4h[0], v18.4h[1]
++1:
++ cmp w5, #3-96
++ blt 1f
++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
++ beq 11b
++ mov v16.8b[0], v16.8b[1]
++ sub w5, w5, #3
++ mov v17.8b[0], v17.8b[1]
++1:
++ cmp w5, #2-96
++ blt 1f
++ st2 {v16.b, v17.b}[0], [x0], #2
++ b 11b
++1:
++ st1 {v16.b}[0], [x0], #1
++ b 11b
++
++endfunc
++
+--- /dev/null
++++ b/libavutil/aarch64/rpi_sand_neon.h
+@@ -0,0 +1,59 @@
++/*
++Copyright (c) 2021 Michael Eiler
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: Michael Eiler <eiler.mike@gmail.com>
++*/
++
++#pragma once
++#include <stdint.h> // uint8_t is used by every prototype below
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++// AArch64 NEON converters implemented in rpi_sand_neon.S. 8-bit sand luma -> 8-bit plane:
++void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++// 8-bit interleaved sand chroma -> separate U and V planes:
++void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u,
++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src,
++ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++// 30-bit packed sand luma (three 10-bit samples per 32-bit word) -> 16-bit samples:
++void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++// 30-bit packed sand chroma -> separate 16-bit U and V planes:
++void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u,
++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
++ unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++// 30-bit packed sand luma -> 8-bit samples (top 8 bits of each 10-bit sample):
++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++#ifdef __cplusplus
++}
++#endif
+--- a/libavutil/arm/Makefile
++++ b/libavutil/arm/Makefile
+@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o
+
+ NEON-OBJS += arm/float_dsp_init_neon.o \
+ arm/float_dsp_neon.o \
++ arm/rpi_sand_neon.o \
+--- /dev/null
++++ b/libavutil/arm/rpi_sand_neon.S
+@@ -0,0 +1,925 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#include "libavutil/arm/asm.S"
++
++
++@ General notes:
++@ Having done some timing on this in sand8->y8 (Pi4)
++@ vst1 (680fps) is a bit faster than vstm (660fps)
++@ vldm (680fps) is noticeably faster than vld1 (480fps)
++@ (or it might be that a mix is what is required)
++@
++@ At least on a Pi4 it is no more expensive to have a single auto-inc register
++@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted
++@ the latter was better)
++@
++@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless
++@ the memory is uncached.
++@ As these are Sand -> planar we can assume that src is going to be aligned but
++@ it is possible that dest isn't (converting to .yuv or other packed format).
++@ Luckily vst1 is faster than vstm :-) so all is well
++@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4
++@ .8 stores would let us do non-word aligned stores into uncached but it
++@ probably isn't worth it.
++
++
++
++
++@ void ff_rpi_sand128b_stripe_to_8_10(
++@ uint8_t * dest, // [r0]
++@ const uint8_t * src1, // [r1]
++@ const uint8_t * src2, // [r2]
++@ unsigned int lines); // [r3]
++
++.macro stripe2_to_8, bit_depth
++ vpush {q4-q7}
++1:
++ vldm r1!, {q0-q7}
++ subs r3, #1
++ vldm r2!, {q8-q15}
++ vqrshrn.u16 d0, q0, #\bit_depth - 8
++ vqrshrn.u16 d1, q1, #\bit_depth - 8
++ vqrshrn.u16 d2, q2, #\bit_depth - 8
++ vqrshrn.u16 d3, q3, #\bit_depth - 8
++ vqrshrn.u16 d4, q4, #\bit_depth - 8
++ vqrshrn.u16 d5, q5, #\bit_depth - 8
++ vqrshrn.u16 d6, q6, #\bit_depth - 8
++ vqrshrn.u16 d7, q7, #\bit_depth - 8
++ vqrshrn.u16 d8, q8, #\bit_depth - 8
++ vqrshrn.u16 d9, q9, #\bit_depth - 8
++ vqrshrn.u16 d10, q10, #\bit_depth - 8
++ vqrshrn.u16 d11, q11, #\bit_depth - 8
++ vqrshrn.u16 d12, q12, #\bit_depth - 8
++ vqrshrn.u16 d13, q13, #\bit_depth - 8
++ vqrshrn.u16 d14, q14, #\bit_depth - 8
++ vqrshrn.u16 d15, q15, #\bit_depth - 8
++ vstm r0!, {q0-q7}
++ bne 1b
++ vpop {q4-q7}
++ bx lr
++.endm
++
++function ff_rpi_sand128b_stripe_to_8_10, export=1
++ stripe2_to_8 10
++endfunc
++
++@ void ff_rpi_sand8_lines_to_planar_y8(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand8_lines_to_planar_y8, export=1
++ push {r4-r8, lr} @ +24 L
++ ldr r3, [sp, #24]
++ ldr r6, [sp, #36]
++ ldr r7, [sp, #32] @ y
++ lsl r3, #7
++ sub r1, r6
++ add r8, r2, r7, lsl #7
++ ldr r7, [sp, #40]
++
++10:
++ mov r2, r8
++ add r4, r0, #24
++ mov r5, r6
++ mov lr, #0
++1:
++ vldm r2, {q8-q15}
++ add r2, r3
++ subs r5, #128
++ blt 2f
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ vst1.8 {d20, d21, d22, d23}, [r0]!
++ vst1.8 {d24, d25, d26, d27}, [r0]!
++ vst1.8 {d28, d29, d30, d31}, [r0]!
++ bne 1b
++11:
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #64-128
++ blt 1f
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ vst1.8 {d20, d21, d22, d23}, [r0]!
++ beq 11b
++ vmov q8, q12
++ vmov q9, q13
++ sub r5, #64
++ vmov q10, q14
++ vmov q11, q15
++1:
++ cmp r5, #32-128
++ blt 1f
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ beq 11b
++ vmov q8, q10
++ sub r5, #32
++ vmov q9, q11
++1:
++ cmp r5, #16-128
++ blt 1f
++ vst1.8 {d16, d17}, [r0]!
++ beq 11b
++ sub r5, #16
++ vmov q8, q9
++1:
++ cmp r5, #8-128
++ blt 1f
++ vst1.8 {d16}, [r0]!
++ beq 11b
++ sub r5, #8
++ vmov d16, d17
++1:
++ cmp r5, #4-128
++ blt 1f
++ vst1.32 {d16[0]}, [r0]!
++ beq 11b
++ sub r5, #4
++ vshr.u64 d16, #32
++1:
++ cmp r5, #2-128
++ blt 1f
++ vst1.16 {d16[0]}, [r0]!
++ beq 11b
++ vst1.8 {d16[2]}, [r0]!
++ b 11b
++1:
++ vst1.8 {d16[0]}, [r0]!
++ b 11b
++endfunc
++
++@ void ff_rpi_sand8_lines_to_planar_c8(
++@ uint8_t * dst_u, // [r0]
++@ unsigned int dst_stride_u, // [r1]
++@ uint8_t * dst_v, // [r2]
++@ unsigned int dst_stride_v, // [r3]
++@ const uint8_t * src, // [sp, #0] -> r4, r5
++@ unsigned int stride1, // [sp, #4] 128
++@ unsigned int stride2, // [sp, #8] -> r8
++@ unsigned int _x, // [sp, #12] 0
++@ unsigned int y, // [sp, #16] (r7 in prefix)
++@ unsigned int _w, // [sp, #20] -> r12, r6
++@ unsigned int h); // [sp, #24] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand8_lines_to_planar_c8, export=1
++ push {r4-r8, lr} @ +24
++
++ ldr r5, [sp, #24]
++ ldr r8, [sp, #32]
++ ldr r7, [sp, #40]
++ ldr r6, [sp, #44]
++ lsl r8, #7
++ add r5, r5, r7, lsl #7
++ sub r1, r1, r6
++ sub r3, r3, r6
++ ldr r7, [sp, #48]
++ vpush {q4-q7}
++
++10:
++ mov r4, r5
++ mov r12, r6
++1:
++ subs r12, #64
++ vldm r4, {q0-q7}
++ add r4, r8
++ it gt
++ vldmgt r4, {q8-q15}
++ add r4, r8
++
++ vuzp.8 q0, q1
++ vuzp.8 q2, q3
++ vuzp.8 q4, q5
++ vuzp.8 q6, q7
++
++ vuzp.8 q8, q9
++ vuzp.8 q10, q11
++ vuzp.8 q12, q13
++ vuzp.8 q14, q15
++ subs r12, #64
++
++ @ Rearrange regs so we can use vst1 with 4 regs
++ vswp q1, q2
++ vswp q5, q6
++ vswp q9, q10
++ vswp q13, q14
++ blt 2f
++
++ vst1.8 {d0, d1, d2, d3 }, [r0]!
++ vst1.8 {d8, d9, d10, d11}, [r0]!
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ vst1.8 {d24, d25, d26, d27}, [r0]!
++
++ vst1.8 {d4, d5, d6, d7 }, [r2]!
++ vst1.8 {d12, d13, d14, d15}, [r2]!
++ vst1.8 {d20, d21, d22, d23}, [r2]!
++ vst1.8 {d28, d29, d30, d31}, [r2]!
++ bne 1b
++11:
++ subs r7, #1
++ add r5, #128
++ add r0, r1
++ add r2, r3
++ bne 10b
++ vpop {q4-q7}
++ pop {r4-r8,pc}
++
++2:
++ cmp r12, #64-128
++ blt 1f
++ vst1.8 {d0, d1, d2, d3 }, [r0]!
++ vst1.8 {d8, d9, d10, d11}, [r0]!
++ vst1.8 {d4, d5, d6, d7 }, [r2]!
++ vst1.8 {d12, d13, d14, d15}, [r2]!
++ beq 11b
++ sub r12, #64
++ vmov q0, q8
++ vmov q1, q9
++ vmov q2, q10
++ vmov q3, q11
++ vmov q4, q12
++ vmov q5, q13
++ vmov q6, q14
++ vmov q7, q15
++1:
++ cmp r12, #32-128
++ blt 1f
++ vst1.8 {d0, d1, d2, d3 }, [r0]!
++ vst1.8 {d4, d5, d6, d7 }, [r2]!
++ beq 11b
++ sub r12, #32
++ vmov q0, q4
++ vmov q1, q5
++ vmov q2, q6
++ vmov q3, q7
++1:
++ cmp r12, #16-128
++ blt 1f
++ vst1.8 {d0, d1 }, [r0]!
++ vst1.8 {d4, d5 }, [r2]!
++ beq 11b
++ sub r12, #16
++ vmov q0, q1
++ vmov q2, q3
++1:
++ cmp r12, #8-128
++ blt 1f
++ vst1.8 {d0}, [r0]!
++ vst1.8 {d4}, [r2]!
++ beq 11b
++ sub r12, #8
++ vmov d0, d1
++ vmov d4, d5
++1:
++ cmp r12, #4-128
++ blt 1f
++ vst1.32 {d0[0]}, [r0]!
++ vst1.32 {d4[0]}, [r2]!
++ beq 11b
++ sub r12, #4
++ vmov s0, s1
++ vmov s8, s9
++1:
++ cmp r12, #2-128
++ blt 1f
++ vst1.16 {d0[0]}, [r0]!
++ vst1.16 {d4[0]}, [r2]!
++ beq 11b
++ vst1.8 {d0[2]}, [r0]!
++ vst1.8 {d4[2]}, [r2]!
++ b 11b
++1:
++ vst1.8 {d0[0]}, [r0]!
++ vst1.8 {d4[0]}, [r2]!
++ b 11b
++endfunc
++
++
++
++@ void ff_rpi_sand30_lines_to_planar_y16(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++ push {r4-r8, lr} @ +24
++ ldr r3, [sp, #24]
++ ldr r6, [sp, #36]
++ ldr r7, [sp, #32] @ y
++ mov r12, #48
++ sub r3, #1
++ lsl r3, #7
++ sub r1, r1, r6, lsl #1
++ add r8, r2, r7, lsl #7
++ ldr r7, [sp, #40]
++
++10:
++ mov r2, r8
++ add r4, r0, #24
++ mov r5, r6
++ mov lr, #0
++1:
++ vldm r2!, {q10-q13}
++ add lr, #64
++
++ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20!
++ ands lr, #127
++ vshrn.u32 d2, q10, #10
++ vmovn.u32 d0, q10
++
++ vshrn.u32 d5, q11, #14
++ it eq
++ addeq r2, r3
++ vshrn.u32 d3, q11, #10
++ vmovn.u32 d1, q11
++
++ subs r5, #48
++ vshr.u16 q2, #6
++ vbic.u16 q0, #0xfc00
++ vbic.u16 q1, #0xfc00
++
++ vshrn.u32 d20, q12, #14
++ vshrn.u32 d18, q12, #10
++ vmovn.u32 d16, q12
++
++ vshrn.u32 d21, q13, #14
++ vshrn.u32 d19, q13, #10
++ vmovn.u32 d17, q13
++
++ vshr.u16 q10, #6
++ vbic.u16 q8, #0xfc00
++ vbic.u16 q9 , #0xfc00
++ blt 2f
++
++ vst3.16 {d0, d2, d4}, [r0], r12
++ vst3.16 {d1, d3, d5}, [r4], r12
++ vst3.16 {d16, d18, d20}, [r0], r12
++ vst3.16 {d17, d19, d21}, [r4], r12
++
++ bne 1b
++
++11:
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #24-48
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0], r12
++ vst3.16 {d1, d3, d5}, [r4]
++ beq 11b
++ vmov q0, q8
++ sub r5, #24
++ vmov q1, q9
++ vmov q2, q10
++1:
++ cmp r5, #12-48
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0]!
++ beq 11b
++ vmov d0, d1
++ sub r5, #12
++ vmov d2, d3
++ vmov d4, d5
++1:
++ cmp r5, #6-48
++ add r4, r0, #6 @ avoid [r0]! on sequential instructions
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
++ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
++ add r0, #12
++ beq 11b
++ vmov s0, s1
++ sub r5, #6
++ vmov s4, s5
++ vmov s8, s9
++1:
++ cmp r5, #3-48
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
++ beq 11b
++ sub r5, #3
++ vshr.u32 d0, #16
++ vshr.u32 d2, #16
++1:
++ cmp r5, #2-48
++ blt 1f
++ vst2.16 {d0[0], d2[0]}, [r0]!
++ b 11b
++1:
++ vst1.16 {d0[0]}, [r0]!
++ b 11b
++
++endfunc
++
++
++@ void ff_rpi_sand30_lines_to_planar_c16(
++@ uint8_t * dst_u, // [r0]
++@ unsigned int dst_stride_u, // [r1]
++@ uint8_t * dst_v, // [r2]
++@ unsigned int dst_stride_v, // [r3]
++@ const uint8_t * src, // [sp, #0] -> r4, r5
++@ unsigned int stride1, // [sp, #4] 128
++@ unsigned int stride2, // [sp, #8] -> r8
++@ unsigned int _x, // [sp, #12] 0
++@ unsigned int y, // [sp, #16] (r7 in prefix)
++@ unsigned int _w, // [sp, #20] -> r6, r9
++@ unsigned int h); // [sp, #24] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++ push {r4-r10, lr} @ +32
++ ldr r5, [sp, #32]
++ ldr r8, [sp, #40]
++ ldr r7, [sp, #48]
++ ldr r9, [sp, #52]
++ mov r12, #48
++ sub r8, #1
++ lsl r8, #7
++ add r5, r5, r7, lsl #7
++ sub r1, r1, r9, lsl #1
++ sub r3, r3, r9, lsl #1
++ ldr r7, [sp, #56]
++10:
++ mov lr, #0
++ mov r4, r5
++ mov r6, r9
++1:
++ vldm r4!, {q0-q3}
++ add lr, #64
++
++ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
++ vshrn.u32 d20, q0, #14
++ vmovn.u32 d18, q0
++ vshrn.u32 d0, q0, #10
++ ands lr, #127
++
++ vshrn.u32 d21, q1, #14
++ vmovn.u32 d19, q1
++ vshrn.u32 d1, q1, #10
++
++ vshrn.u32 d22, q2, #10
++ vmovn.u32 d2, q2
++ vshrn.u32 d4, q2, #14
++
++ add r10, r0, #24
++ vshrn.u32 d23, q3, #10
++ vmovn.u32 d3, q3
++ vshrn.u32 d5, q3, #14
++
++ it eq
++ addeq r4, r8
++ vuzp.16 q0, q11
++ vuzp.16 q9, q1
++ vuzp.16 q10, q2
++
++ @ q0 V0, V3,..
++ @ q9 U0, U3...
++ @ q10 U1, U4...
++ @ q11 U2, U5,..
++ @ q1 V1, V4,
++ @ q2 V2, V5,..
++
++ subs r6, #24
++ vbic.u16 q11, #0xfc00
++ vbic.u16 q9, #0xfc00
++ vshr.u16 q10, #6
++ vshr.u16 q2, #6
++ vbic.u16 q0, #0xfc00
++ vbic.u16 q1, #0xfc00
++
++ blt 2f
++
++ vst3.16 {d18, d20, d22}, [r0], r12
++ vst3.16 {d19, d21, d23}, [r10]
++ add r10, r2, #24
++ vst3.16 {d0, d2, d4}, [r2], r12
++ vst3.16 {d1, d3, d5}, [r10]
++
++ bne 1b
++
++11:
++ subs r7, #1
++ add r5, #128
++ add r0, r1
++ add r2, r3
++ bne 10b
++
++ pop {r4-r10, pc}
++
++@ Partial final write
++2:
++ cmp r6, #-12
++ blt 1f
++ vst3.16 {d18, d20, d22}, [r0]!
++ vst3.16 {d0, d2, d4}, [r2]!
++ beq 11b
++ vmov d18, d19
++ vmov d20, d21
++ vmov d22, d23
++ sub r6, #12
++ vmov d0, d1
++ vmov d2, d3
++ vmov d4, d5
++1:
++ cmp r6, #-18
++ @ Rezip here as it makes the remaining tail handling easier
++ vzip.16 d0, d18
++ vzip.16 d2, d20
++ vzip.16 d4, d22
++ blt 1f
++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
++ vst3.16 {d0[3], d2[3], d4[3]}, [r0]!
++ vst3.16 {d0[2], d2[2], d4[2]}, [r2]!
++ beq 11b
++ vmov d0, d18
++ vmov d2, d20
++ sub r6, #6
++ vmov d4, d22
++1:
++ cmp r6, #-21
++ blt 1f
++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
++ beq 11b
++ vmov s4, s5
++ sub r6, #3
++ vmov s0, s1
++1:
++ cmp r6, #-22
++ blt 1f
++ vst2.16 {d0[1], d2[1]}, [r0]!
++ vst2.16 {d0[0], d2[0]}, [r2]!
++ b 11b
++1:
++ vst1.16 {d0[1]}, [r0]!
++ vst1.16 {d0[0]}, [r2]!
++ b 11b
++
++endfunc
++
++@ void ff_rpi_sand30_lines_to_planar_p010(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_p010, export=1
++ push {r4-r8, lr} @ +24
++ ldr r3, [sp, #24]
++ ldr r6, [sp, #36]
++ ldr r7, [sp, #32] @ y
++ mov r12, #48
++ vmov.u16 q15, #0xffc0
++ sub r3, #1
++ lsl r3, #7
++ sub r1, r1, r6, lsl #1
++ add r8, r2, r7, lsl #7
++ ldr r7, [sp, #40]
++
++10:
++ mov r2, r8
++ add r4, r0, #24
++ mov r5, r6
++ mov lr, #0
++1:
++ vldm r2!, {q10-q13}
++ add lr, #64
++
++ vshl.u32 q14, q10, #6
++ ands lr, #127
++ vshrn.u32 d4, q10, #14
++ vshrn.u32 d2, q10, #4
++ vmovn.u32 d0, q14
++
++ vshl.u32 q14, q11, #6
++ it eq
++ addeq r2, r3
++ vshrn.u32 d5, q11, #14
++ vshrn.u32 d3, q11, #4
++ vmovn.u32 d1, q14
++
++ subs r5, #48
++ vand q2, q15
++ vand q1, q15
++ vand q0, q15
++
++ vshl.u32 q14, q12, #6
++ vshrn.u32 d20, q12, #14
++ vshrn.u32 d18, q12, #4
++ vmovn.u32 d16, q14
++
++ vshl.u32 q14, q13, #6
++ vshrn.u32 d21, q13, #14
++ vshrn.u32 d19, q13, #4
++ vmovn.u32 d17, q14
++
++ vand q10, q15
++ vand q9, q15
++ vand q8, q15
++ blt 2f
++
++ vst3.16 {d0, d2, d4}, [r0], r12
++ vst3.16 {d1, d3, d5}, [r4], r12
++ vst3.16 {d16, d18, d20}, [r0], r12
++ vst3.16 {d17, d19, d21}, [r4], r12
++
++ bne 1b
++
++11:
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #24-48
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0], r12
++ vst3.16 {d1, d3, d5}, [r4]
++ beq 11b
++ vmov q0, q8
++ sub r5, #24
++ vmov q1, q9
++ vmov q2, q10
++1:
++ cmp r5, #12-48
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0]!
++ beq 11b
++ vmov d0, d1
++ sub r5, #12
++ vmov d2, d3
++ vmov d4, d5
++1:
++ cmp r5, #6-48
++ add r4, r0, #6 @ avoid [r0]! on sequential instructions
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
++ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
++ add r0, #12
++ beq 11b
++ vmov s0, s1
++ sub r5, #6
++ vmov s4, s5
++ vmov s8, s9
++1:
++ cmp r5, #3-48
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
++ beq 11b
++ sub r5, #3
++ vshr.u32 d0, #16
++ vshr.u32 d2, #16
++1:
++ cmp r5, #2-48
++ blt 1f
++ vst2.16 {d0[0], d2[0]}, [r0]!
++ b 11b
++1:
++ vst1.16 {d0[0]}, [r0]!
++ b 11b
++
++endfunc
++
++
++@ void ff_rpi_sand30_lines_to_planar_y8(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++ push {r4-r8, lr} @ +24
++ ldr r3, [sp, #24] @ src_stride2
++ ldr r6, [sp, #36] @ _w
++ ldr r7, [sp, #32] @ y
++ mov r12, #48 @ post-increment for the paired r0/r4 stores
++ lsl r3, #7 @ src_stride2 * 128
++ sub r1, r1, r6 @ dst_stride - _w, added to r0 after each line
++ add r8, r2, r7, lsl #7 @ r8 = src + y * 128
++ ldr r7, [sp, #40] @ h
++
++10:
++ mov r2, r8 @ r2 = source pointer for this line
++ add r4, r0, #24 @ r4 = second store pointer, 24 bytes ahead of r0
++ mov r5, r6 @ r5 = pixels still to write on this line
++1:
++ vldm r2, {q8-q15}
++
++ subs r5, #96 @ 96 pixels per 128 source bytes; flags used by blt/bne below
++
++ vmovn.u32 d0, q8
++ vshrn.u32 d2, q8, #12
++ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20!
++
++ add r2, r3 @ advance the source pointer by src_stride2 * 128
++
++ vmovn.u32 d1, q9
++ vshrn.u32 d3, q9, #12
++ vshrn.u32 d5, q9, #16
++
++ pld [r2, #0]
++
++ vshrn.u16 d0, q0, #2
++ vmovn.u16 d1, q1
++ vshrn.u16 d2, q2, #6
++
++ vmovn.u32 d16, q10
++ vshrn.u32 d18, q10, #12
++ vshrn.u32 d20, q10, #16
++
++ vmovn.u32 d17, q11
++ vshrn.u32 d19, q11, #12
++ vshrn.u32 d21, q11, #16
++
++ pld [r2, #64]
++
++ vshrn.u16 d4, q8, #2
++ vmovn.u16 d5, q9
++ vshrn.u16 d6, q10, #6
++
++ vmovn.u32 d16, q12
++ vshrn.u32 d18, q12, #12
++ vshrn.u32 d20, q12, #16
++
++ vmovn.u32 d17, q13
++ vshrn.u32 d19, q13, #12
++ vshrn.u32 d21, q13, #16
++
++ vshrn.u16 d16, q8, #2
++ vmovn.u16 d17, q9
++ vshrn.u16 d18, q10, #6
++
++ vmovn.u32 d20, q14
++ vshrn.u32 d22, q14, #12
++ vshrn.u32 d24, q14, #16
++
++ vmovn.u32 d21, q15
++ vshrn.u32 d23, q15, #12
++ vshrn.u32 d25, q15, #16
++
++ vshrn.u16 d20, q10, #2
++ vmovn.u16 d21, q11
++ vshrn.u16 d22, q12, #6
++
++ blt 2f
++
++ vst3.8 {d0, d1, d2}, [r0], r12
++ vst3.8 {d4, d5, d6}, [r4], r12
++ vst3.8 {d16, d17, d18}, [r0], r12
++ vst3.8 {d20, d21, d22}, [r4], r12
++
++ bne 1b
++
++11:
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #48-96
++ blt 1f
++ vst3.8 {d0, d1, d2}, [r0], r12
++ vst3.8 {d4, d5, d6}, [r4], r12
++ beq 11b
++ vmov q0, q8
++ vmov q2, q10
++ sub r5, #48
++ vmov d2, d18
++ vmov d6, d22
++1:
++ cmp r5, #24-96
++ blt 1f
++ vst3.8 {d0, d1, d2}, [r0]!
++ beq 11b
++ vmov q0, q2
++ sub r5, #24
++ vmov d2, d6
++1:
++ cmp r5, #12-96
++ blt 1f
++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
++ vst3.8 {d0[2], d1[2], d2[2]}, [r0]!
++ vst3.8 {d0[3], d1[3], d2[3]}, [r0]!
++ beq 11b
++ vmov s0, s1
++ sub r5, #12
++ vmov s2, s3
++ vmov s4, s5
++1:
++ cmp r5, #6-96
++ blt 1f
++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
++ @ r0 already advanced 6 bytes by the two writeback stores above
++ beq 11b
++ vshr.u32 d0, #16
++ sub r5, #6
++ vshr.u32 d1, #16
++ vshr.u32 d2, #16
++1:
++ cmp r5, #3-96
++ blt 1f
++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
++ beq 11b
++ sub r5, #3
++ vshr.u32 d0, #8
++ vshr.u32 d1, #8
++1:
++ cmp r5, #2-96
++ blt 1f
++ vst2.8 {d0[0], d1[0]}, [r0]!
++ b 11b
++1:
++ vst1.8 {d0[0]}, [r0]!
++ b 11b
++
++endfunc
++
++
+--- /dev/null
++++ b/libavutil/arm/rpi_sand_neon.h
+@@ -0,0 +1,110 @@
++/*
++Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef AVUTIL_ARM_SAND_NEON_H
++#define AVUTIL_ARM_SAND_NEON_H
++
++void ff_rpi_sand128b_stripe_to_8_10(
++ uint8_t * dest, // [r0]
++ const uint8_t * src1, // [r1]
++ const uint8_t * src2, // [r2]
++ unsigned int lines); // [r3]
++
++void ff_rpi_sand8_lines_to_planar_y8(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++void ff_rpi_sand8_lines_to_planar_c8(
++ uint8_t * dst_u, // [r0]
++ unsigned int dst_stride_u, // [r1]
++ uint8_t * dst_v, // [r2]
++ unsigned int dst_stride_v, // [r3]
++ const uint8_t * src, // [sp, #0] -> r4, r5
++ unsigned int stride1, // [sp, #4] 128
++ unsigned int stride2, // [sp, #8] -> r8
++ unsigned int _x, // [sp, #12] 0
++ unsigned int y, // [sp, #16] (r7 in prefix)
++ unsigned int _w, // [sp, #20] -> r12, r6
++ unsigned int h); // [sp, #24] -> r7
++
++void ff_rpi_sand30_lines_to_planar_y16(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++void ff_rpi_sand30_lines_to_planar_c16(
++ uint8_t * dst_u, // [r0]
++ unsigned int dst_stride_u, // [r1]
++ uint8_t * dst_v, // [r2]
++ unsigned int dst_stride_v, // [r3]
++ const uint8_t * src, // [sp, #0] -> r4, r5
++ unsigned int stride1, // [sp, #4] 128
++ unsigned int stride2, // [sp, #8] -> r8
++ unsigned int _x, // [sp, #12] 0
++ unsigned int y, // [sp, #16] (r7 in prefix)
++ unsigned int _w, // [sp, #20] -> r6, r9
++ unsigned int h); // [sp, #24] -> r7
++
++void ff_rpi_sand30_lines_to_planar_p010(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++void ff_rpi_sand30_lines_to_planar_y8(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++#endif // AVUTIL_ARM_SAND_NEON_H
++
+--- a/libavutil/frame.c
++++ b/libavutil/frame.c
+@@ -16,6 +16,8 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include "config.h"
++
+ #include "channel_layout.h"
+ #include "avassert.h"
+ #include "buffer.h"
+@@ -26,6 +28,9 @@
+ #include "mem.h"
+ #include "samplefmt.h"
+ #include "hwcontext.h"
++#if CONFIG_SAND
++#include "rpi_sand_fns.h"
++#endif
+
+ #if FF_API_FRAME_GET_SET
+ MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp)
+@@ -902,6 +907,12 @@ int av_frame_apply_cropping(AVFrame *fra
+ (frame->crop_top + frame->crop_bottom) >= frame->height)
+ return AVERROR(ERANGE);
+
++#if CONFIG_SAND
++ // Sand cannot be cropped - do not try
++ if (av_rpi_is_sand_format(frame->format))
++ return 0;
++#endif
++
+ desc = av_pix_fmt_desc_get(frame->format);
+ if (!desc)
+ return AVERROR_BUG;
+--- a/libavutil/frame.h
++++ b/libavutil/frame.h
+@@ -968,6 +968,16 @@ int av_frame_apply_cropping(AVFrame *fra
+ */
+ const char *av_frame_side_data_name(enum AVFrameSideDataType type);
+
++
++static inline int av_frame_cropped_width(const AVFrame * const frame)
++{
++ return frame->width - (frame->crop_left + frame->crop_right);
++}
++static inline int av_frame_cropped_height(const AVFrame * const frame)
++{
++ return frame->height - (frame->crop_top + frame->crop_bottom);
++}
++
+ /**
+ * @}
+ */
+--- a/libavutil/hwcontext_drm.c
++++ b/libavutil/hwcontext_drm.c
+@@ -19,8 +19,10 @@
+ #include <fcntl.h>
+ #include <sys/mman.h>
+ #include <unistd.h>
++#include <sys/ioctl.h>
+
+ #include <drm.h>
++#include <libdrm/drm_fourcc.h>
+ #include <xf86drm.h>
+
+ #include "avassert.h"
+@@ -28,6 +30,11 @@
+ #include "hwcontext_drm.h"
+ #include "hwcontext_internal.h"
+ #include "imgutils.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include <linux/mman.h>
++#include <linux/dma-buf.h>
++#include <linux/dma-heap.h>
+
+
+ static void drm_device_free(AVHWDeviceContext *hwdev)
+@@ -43,6 +50,11 @@ static int drm_device_create(AVHWDeviceC
+ AVDRMDeviceContext *hwctx = hwdev->hwctx;
+ drmVersionPtr version;
+
++ if (device == NULL) {
++ hwctx->fd = -1;
++ return 0;
++ }
++
+ hwctx->fd = open(device, O_RDWR);
+ if (hwctx->fd < 0)
+ return AVERROR(errno);
+@@ -85,18 +97,37 @@ static int drm_get_buffer(AVHWFramesCont
+ typedef struct DRMMapping {
+ // Address and length of each mmap()ed region.
+ int nb_regions;
++ unsigned int dmaflags;
+ void *address[AV_DRM_MAX_PLANES];
+ size_t length[AV_DRM_MAX_PLANES];
++ int fds[AV_DRM_MAX_PLANES];
+ } DRMMapping;
+
++static int dmasync(const int fd, const unsigned int flags) // issue DMA_BUF_IOCTL_SYNC on fd; returns 0 or -errno
++{
++ struct dma_buf_sync sync = {
++ .flags = flags
++ };
++ while (ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
++ const int err = errno; // capture errno before the av_log call below
++ if (errno == EINTR) // interrupted: retry the ioctl
++ continue;
++ av_log(NULL, AV_LOG_WARNING, "%s: ioctl failed: flags=%#x\n", __func__, flags);
++ return -err; // negated errno value from the failed ioctl
++ }
++ return 0;
++}
++
+ static void drm_unmap_frame(AVHWFramesContext *hwfc,
+ HWMapDescriptor *hwmap)
+ {
+ DRMMapping *map = hwmap->priv;
+ int i;
+
+- for (i = 0; i < map->nb_regions; i++)
++ for (i = 0; i < map->nb_regions; i++) {
+ munmap(map->address[i], map->length[i]);
++ dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags);
++ }
+
+ av_free(map);
+ }
+@@ -114,15 +145,28 @@ static int drm_map_frame(AVHWFramesConte
+ if (!map)
+ return AVERROR(ENOMEM);
+
++ for (i = 0; i < AV_DRM_MAX_PLANES; i++)
++ map->fds[i] = -1;
++
+ mmap_prot = 0;
+- if (flags & AV_HWFRAME_MAP_READ)
++ if (flags & AV_HWFRAME_MAP_READ) {
++ map->dmaflags |= DMA_BUF_SYNC_READ;
+ mmap_prot |= PROT_READ;
+- if (flags & AV_HWFRAME_MAP_WRITE)
++ }
++ if (flags & AV_HWFRAME_MAP_WRITE) {
++ map->dmaflags |= DMA_BUF_SYNC_WRITE;
+ mmap_prot |= PROT_WRITE;
++ }
++
++ if (dst->format == AV_PIX_FMT_NONE)
++ dst->format = hwfc->sw_format;
+
+ av_assert0(desc->nb_objects <= AV_DRM_MAX_PLANES);
+ for (i = 0; i < desc->nb_objects; i++) {
+- addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED,
++ dmasync(desc->objects[i].fd, DMA_BUF_SYNC_START | map->dmaflags);
++ map->fds[i] = desc->objects[i].fd;
++
++ addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED | MAP_POPULATE,
+ desc->objects[i].fd, 0);
+ if (addr == MAP_FAILED) {
+ err = AVERROR(errno);
+@@ -151,6 +195,23 @@ static int drm_map_frame(AVHWFramesConte
+
+ dst->width = src->width;
+ dst->height = src->height;
++ dst->crop_top = src->crop_top;
++ dst->crop_bottom = src->crop_bottom;
++ dst->crop_left = src->crop_left;
++ dst->crop_right = src->crop_right;
++
++#if CONFIG_SAND
++ // Rework for sand frames
++ if (av_rpi_is_sand_frame(dst)) {
++ // As it stands the sand formats hold stride2 in linesize[3]
++ // linesize[0] & [1] contain stride1 which is always 128 for everything we do
++ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1]
++ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier);
++ dst->linesize[0] = 128;
++ dst->linesize[1] = 128;
++ // *** Are we sure src->height is actually what we want ???
++ }
++#endif
+
+ err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
+ &drm_unmap_frame, map);
+@@ -160,7 +221,9 @@ static int drm_map_frame(AVHWFramesConte
+ return 0;
+
+ fail:
+- for (i = 0; i < desc->nb_objects; i++) {
++ for (i = 0; i < AV_DRM_MAX_PLANES; i++) {
++ if (map->fds[i] != -1)
++ dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags);
+ if (map->address[i])
+ munmap(map->address[i], map->length[i]);
+ }
+@@ -172,16 +235,29 @@ static int drm_transfer_get_formats(AVHW
+ enum AVHWFrameTransferDirection dir,
+ enum AVPixelFormat **formats)
+ {
+- enum AVPixelFormat *pix_fmts;
++ enum AVPixelFormat *p;
+
+- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
+- if (!pix_fmts)
++ p = *formats = av_malloc_array(3, sizeof(*p));
++ if (!p)
+ return AVERROR(ENOMEM);
+
+- pix_fmts[0] = ctx->sw_format;
+- pix_fmts[1] = AV_PIX_FMT_NONE;
++ // **** Offer native sand too ????
++ *p++ =
++#if CONFIG_SAND
++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
++ AV_PIX_FMT_YUV420P :
++ ctx->sw_format == AV_PIX_FMT_RPI4_10 ?
++ AV_PIX_FMT_YUV420P10LE :
++#endif
++ ctx->sw_format;
++
++#if CONFIG_SAND
++ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
++ *p++ = AV_PIX_FMT_NV12;
++#endif
+
+- *formats = pix_fmts;
++ *p = AV_PIX_FMT_NONE;
+ return 0;
+ }
+
+@@ -197,18 +273,63 @@ static int drm_transfer_data_from(AVHWFr
+ map = av_frame_alloc();
+ if (!map)
+ return AVERROR(ENOMEM);
+- map->format = dst->format;
+
++ // Map to default
++ map->format = AV_PIX_FMT_NONE;
+ err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ);
+ if (err)
+ goto fail;
+
+- map->width = dst->width;
+- map->height = dst->height;
++#if 0
++ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__,
++ map->hwfc_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE,
++ map->width, map->height,
++ map->linesize[0],
++ map->linesize[1],
++ map->linesize[2],
++ map->linesize[3],
++ dst->width, dst->height,
++ dst->linesize[0],
++ dst->linesize[1],
++ dst->linesize[2]);
++#endif
++#if CONFIG_SAND
++ if (av_rpi_is_sand_frame(map)) {
++ // Preserve crop - later ffmpeg code assumes that we have in that it
++ // overwrites any crop that we create with the old values
++ unsigned int stride2 = map->linesize[3];
++ const unsigned int w = FFMIN(dst->width, map->width);
++ const unsigned int h = FFMIN(dst->height, map->height);
++
++ map->crop_top = 0;
++ map->crop_bottom = 0;
++ map->crop_left = 0;
++ map->crop_right = 0;
++
++ if (av_rpi_sand_to_planar_frame(dst, map) != 0)
++ {
++ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
++ err = AVERROR(EINVAL);
++ goto fail;
++ }
++
++ dst->width = w;
++ dst->height = h;
++ }
++ else
++#endif
++ {
++ // Kludge mapped h/w s.t. frame_copy works
++ map->width = dst->width;
++ map->height = dst->height;
++ err = av_frame_copy(dst, map);
++ }
+
+- err = av_frame_copy(dst, map);
+ if (err)
++ {
++ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__);
+ goto fail;
++ }
+
+ err = 0;
+ fail:
+@@ -223,7 +344,10 @@ static int drm_transfer_data_to(AVHWFram
+ int err;
+
+ if (src->width > hwfc->width || src->height > hwfc->height)
++ { // Log the src dimensions that failed the check above against the hwfc limits
++ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, src->width, hwfc->width, src->height, hwfc->height);
+ return AVERROR(EINVAL);
++ }
+
+ map = av_frame_alloc();
+ if (!map)
+--- a/libavutil/pixdesc.c
++++ b/libavutil/pixdesc.c
+@@ -2371,6 +2371,50 @@ static const AVPixFmtDescriptor av_pix_f
+ .name = "vulkan",
+ .flags = AV_PIX_FMT_FLAG_HWACCEL,
+ },
++ [AV_PIX_FMT_SAND128] = {
++ .name = "sand128",
++ .nb_components = 3,
++ .log2_chroma_w = 1,
++ .log2_chroma_h = 1,
++ .comp = {
++ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */
++ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */
++ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */
++ },
++ .flags = 0,
++ },
++ [AV_PIX_FMT_SAND64_10] = {
++ .name = "sand64_10",
++ .nb_components = 3,
++ .log2_chroma_w = 1,
++ .log2_chroma_h = 1,
++ .comp = {
++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */
++ { 1, 4, 0, 0, 10, 3, 9, 1 }, /* U */
++ { 1, 4, 2, 0, 10, 3, 9, 3 }, /* V */
++ },
++ .flags = 0,
++ },
++ [AV_PIX_FMT_SAND64_16] = {
++ .name = "sand64_16",
++ .nb_components = 3,
++ .log2_chroma_w = 1,
++ .log2_chroma_h = 1,
++ .comp = {
++ { 0, 2, 0, 0, 16, 0, 15, 1 }, /* Y */
++ { 1, 4, 0, 0, 16, 3, 15, 1 }, /* U */
++ { 1, 4, 2, 0, 16, 3, 15, 3 }, /* V */
++ },
++ .flags = 0,
++ },
++ [AV_PIX_FMT_RPI4_8] = {
++ .name = "rpi4_8",
++ .flags = AV_PIX_FMT_FLAG_HWACCEL,
++ },
++ [AV_PIX_FMT_RPI4_10] = {
++ .name = "rpi4_10",
++ .flags = AV_PIX_FMT_FLAG_HWACCEL,
++ },
+ };
+ #if FF_API_PLUS1_MINUS1
+ FF_ENABLE_DEPRECATION_WARNINGS
+--- a/libavutil/pixfmt.h
++++ b/libavutil/pixfmt.h
+@@ -357,6 +357,12 @@ enum AVPixelFormat {
+
+ AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
+ AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
++// RPI - not on ifdef so can be got at by calling progs
++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
++ AV_PIX_FMT_RPI4_8,
++ AV_PIX_FMT_RPI4_10,
+
+ AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
+ };
+--- /dev/null
++++ b/libavutil/rpi_sand_fn_pw.h
+@@ -0,0 +1,227 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++// * Included twice from rpi_sand_fn with different PW
++
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++// Fetches a single patch - offscreen fixup not done here
++// w <= stride1
++// unclipped
++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x = _x;
++ const unsigned int w = _w;
++ const unsigned int mask = stride1 - 1;
++
++#if PW == 1 && HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
++ src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe
++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
++ memcpy(dst, p, w);
++ }
++ }
++ else
++ {
++ // Two+ stripe
++ const unsigned int sstride = stride1 * stride2;
++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ const uint8_t * p2 = p1 + sstride - (x & mask);
++ const unsigned int w1 = stride1 - (x & mask);
++ const unsigned int w3 = (x + w) & mask;
++ const unsigned int w2 = w - (w1 + w3);
++
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const uint8_t * p = p2;
++ uint8_t * d = dst;
++ memcpy(d, p1, w1);
++ d += w1;
++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
++ memcpy(d, p, stride1);
++ }
++ memcpy(d, p, w3);
++ }
++ }
++}
++
++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
++
++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x = _x * 2;
++ const unsigned int w = _w * 2;
++ const unsigned int mask = stride1 - 1;
++
++#if PW == 1 && HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
++ src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe
++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
++ pixel * du = (pixel *)dst_u;
++ pixel * dv = (pixel *)dst_v;
++ const pixel * p = (const pixel *)p1;
++ for (unsigned int k = 0; k < w; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ }
++ }
++ else
++ {
++ // Two+ stripe
++ const unsigned int sstride = stride1 * stride2;
++ const unsigned int sstride_p = (sstride - stride1) / PW;
++
++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ const uint8_t * p2 = p1 + sstride - (x & mask);
++ const unsigned int w1 = stride1 - (x & mask);
++ const unsigned int w3 = (x + w) & mask;
++ const unsigned int w2 = w - (w1 + w3);
++
++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const pixel * p = (const pixel *)p1;
++ pixel * du = (pixel *)dst_u;
++ pixel * dv = (pixel *)dst_v;
++ for (unsigned int k = 0; k < w1; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
++ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ }
++ for (unsigned int k = 0; k < w3; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ }
++ }
++}
++
++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, // Interleaves planar U & V rows into the sand chroma buffer dst_c
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x = _x * 2; // Byte offset in the interleaved U/V row
++ const unsigned int w = _w * 2; // Byte width of the interleaved U/V row
++ const unsigned int mask = stride1 - 1; // assumes stride1 is a power of 2 - TODO confirm
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe
++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
++ const pixel * su = (const pixel *)src_u;
++ const pixel * sv = (const pixel *)src_v;
++ pixel * p = (pixel *)p1;
++ for (unsigned int k = 0; k < w; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ }
++ else
++ {
++ // Two+ stripe: leading partial stripe (w1), whole stripes (w2), trailing partial stripe (w3)
++ const unsigned int sstride = stride1 * stride2;
++ const unsigned int sstride_p = (sstride - stride1) / PW;
++
++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; // Written through, so not const
++ uint8_t * p2 = p1 + sstride - (x & mask); // Same row at the start of the next stripe
++ const unsigned int w1 = stride1 - (x & mask);
++ const unsigned int w3 = (x + w) & mask;
++ const unsigned int w2 = w - (w1 + w3);
++
++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const pixel * su = (const pixel *)src_u;
++ const pixel * sv = (const pixel *)src_v;
++ pixel * p = (pixel *)p1;
++ for (unsigned int k = 0; k < w1; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
++ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ for (unsigned int k = 0; k < w3; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ }
++}
++
++
++#undef pixel
++#undef STRCAT
++#undef FUNC
++
+--- /dev/null
++++ b/libavutil/rpi_sand_fns.c
+@@ -0,0 +1,445 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#include "config.h"
++#include <stdint.h>
++#include <string.h>
++#include "rpi_sand_fns.h"
++#include "avassert.h"
++#include "frame.h"
++
++#if ARCH_ARM && HAVE_NEON
++#include "arm/rpi_sand_neon.h"
++#define HAVE_SAND_ASM 1
++#elif ARCH_AARCH64 && HAVE_NEON
++#include "aarch64/rpi_sand_neon.h"
++#define HAVE_SAND_ASM 1
++#else
++#define HAVE_SAND_ASM 0
++#endif
++
++#define PW 1
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#define PW 2
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#if 1
++// Simple round
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++ const unsigned int rnd = (1 << shr) >> 1;
++ const uint16_t * src = (const uint16_t *)_src;
++
++ for (; n != 0; --n) {
++ *dst++ = (*src++ + rnd) >> shr;
++ }
++}
++#else
++// Dithered variation
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++ unsigned int rnd = (1 << shr) >> 1;
++ const unsigned int mask = ((1 << shr) - 1);
++ const uint16_t * src = (const uint16_t *)_src;
++
++ for (; n != 0; --n) {
++ rnd = *src++ + (rnd & mask);
++ *dst++ = rnd >> shr;
++ }
++}
++#endif
++
++// Fetches a single patch - offscreen fixup not done here
++// w <= stride1
++// unclipped
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; // _x % 3: pixels to skip in the first word
++ const unsigned int x1 = ((_x + _w) / 3) * 4; // Byte offset of the word holding the end pixel
++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; // (_x + _w) % 3: pixels wanted from the last word
++ const unsigned int mask = stride1 - 1;
++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if (x0 == x1) {
++ // TODO: partial single word xfer is not implemented -
++ // this returns without writing anything to dst
++ return;
++ }
++
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++ {
++ unsigned int x = x0;
++ const uint32_t * p = (const uint32_t *)p0;
++ uint16_t * d = (uint16_t *)dst;
++
++ if (xskip0 != 0) {
++ const uint32_t p3 = *p++;
++
++ if (xskip0 == 1)
++ *d++ = (p3 >> 10) & 0x3ff;
++ *d++ = (p3 >> 20) & 0x3ff;
++
++ if (((x += 4) & mask) == 0)
++ p += slice_inc;
++ }
++
++ while (x != x1) {
++ const uint32_t p3 = *p++; // Three 10-bit pixels per 32-bit word
++ *d++ = p3 & 0x3ff;
++ *d++ = (p3 >> 10) & 0x3ff;
++ *d++ = (p3 >> 20) & 0x3ff;
++
++ if (((x += 4) & mask) == 0)
++ p += slice_inc;
++ }
++
++ if (xrem1 != 0) {
++ const uint32_t p3 = *p;
++
++ *d++ = p3 & 0x3ff;
++ if (xrem1 == 2)
++ *d++ = (p3 >> 10) & 0x3ff;
++ }
++ }
++}
++
++
++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, // De-interleaves 10-bit U/V into 16-bit planes
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word
++ const unsigned int xskip0 = _x - (x0 >> 3) * 3; // _x % 3: U/V pairs to skip in the first word pair
++ const unsigned int x1 = ((_x + _w) / 3) * 8;
++ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; // (_x + _w) % 3: U/V pairs wanted from the last word pair
++ const unsigned int mask = stride1 - 1;
++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
++ src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if (x0 == x1) {
++ // TODO: partial single word xfer is not implemented -
++ // this returns without writing anything to dst_u / dst_v
++ return;
++ }
++
++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
++ {
++ unsigned int x = x0;
++ const uint32_t * p = (const uint32_t *)p0;
++ uint16_t * du = (uint16_t *)dst_u;
++ uint16_t * dv = (uint16_t *)dst_v;
++
++ if (xskip0 != 0) {
++ const uint32_t p3a = *p++;
++ const uint32_t p3b = *p++;
++
++ if (xskip0 == 1)
++ {
++ *du++ = (p3a >> 20) & 0x3ff;
++ *dv++ = (p3b >> 0) & 0x3ff;
++ }
++ *du++ = (p3b >> 10) & 0x3ff;
++ *dv++ = (p3b >> 20) & 0x3ff;
++
++ if (((x += 8) & mask) == 0)
++ p += slice_inc;
++ }
++
++ while (x != x1) {
++ const uint32_t p3a = *p++; // Holds U0 V0 U1
++ const uint32_t p3b = *p++; // Holds V1 U2 V2
++
++ *du++ = p3a & 0x3ff;
++ *dv++ = (p3a >> 10) & 0x3ff;
++ *du++ = (p3a >> 20) & 0x3ff;
++ *dv++ = p3b & 0x3ff;
++ *du++ = (p3b >> 10) & 0x3ff;
++ *dv++ = (p3b >> 20) & 0x3ff;
++
++ if (((x += 8) & mask) == 0)
++ p += slice_inc;
++ }
++
++ if (xrem1 != 0) {
++ const uint32_t p3a = *p++;
++ const uint32_t p3b = *p++; // NOTE(review): read even when xrem1 == 1, where it is unused
++
++ *du++ = p3a & 0x3ff;
++ *dv++ = (p3a >> 10) & 0x3ff;
++ if (xrem1 == 2)
++ {
++ *du++ = (p3a >> 20) & 0x3ff;
++ *dv++ = p3b & 0x3ff;
++ }
++ }
++ }
++}
++
++// Fetches a single patch - offscreen fixup not done here
++// w <= stride1
++// 10-bit samples are reduced to 8 bits by dropping the bottom 2 bits (truncation, no rounding)
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; // _x % 3: pixels to skip in the first word
++ const unsigned int x1 = ((_x + _w) / 3) * 4;
++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; // (_x + _w) % 3: pixels wanted from the last word
++ const unsigned int mask = stride1 - 1;
++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if (x0 == x1) {
++ // TODO: partial single word xfer is not implemented -
++ // this returns without writing anything to dst
++ return;
++ }
++
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++ {
++ unsigned int x = x0;
++ const uint32_t * p = (const uint32_t *)p0;
++ uint8_t * d = dst;
++
++ if (xskip0 != 0) {
++ const uint32_t p3 = *p++;
++
++ if (xskip0 == 1)
++ *d++ = (p3 >> 12) & 0xff;
++ *d++ = (p3 >> 22) & 0xff;
++
++ if (((x += 4) & mask) == 0)
++ p += slice_inc;
++ }
++
++ while (x != x1) {
++ const uint32_t p3 = *p++;
++ *d++ = (p3 >> 2) & 0xff; // Top 8 bits of each 10-bit field
++ *d++ = (p3 >> 12) & 0xff;
++ *d++ = (p3 >> 22) & 0xff;
++
++ if (((x += 4) & mask) == 0)
++ p += slice_inc;
++ }
++
++ if (xrem1 != 0) {
++ const uint32_t p3 = *p;
++
++ *d++ = (p3 >> 2) & 0xff;
++ if (xrem1 == 2)
++ *d++ = (p3 >> 12) & 0xff;
++ }
++ }
++}
++
++
++
++// w/h in pixels
++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
++ unsigned int w, unsigned int h, const unsigned int shr)
++{
++ const unsigned int n = dst_stride1 / 2;
++ unsigned int j;
++
++ // This is true for our current layouts
++ av_assert0(dst_stride1 == src_stride1);
++
++ // As we have the same stride1 for src & dest and src is wider than dest
++ // then if we loop on src we can always write contiguously to dest
++ // We make no effort to copy an exact width - round up to nearest src stripe
++ // as we will always have storage in dest for that
++
++#if ARCH_ARM && HAVE_NEON
++ if (shr == 3 && src_stride1 == 128) {
++ for (j = 0; j + n < w; j += dst_stride1) {
++ uint8_t * d = dst + j * dst_stride2;
++ const uint8_t * s1 = src + j * 2 * src_stride2;
++ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
++
++ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
++ }
++ }
++ else
++#endif
++ {
++ for (j = 0; j + n < w; j += dst_stride1) {
++ uint8_t * d = dst + j * dst_stride2;
++ const uint8_t * s1 = src + j * 2 * src_stride2;
++ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
++
++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
++ cpy16_to_8(d, s1, n, shr);
++ cpy16_to_8(d + n, s2, n, shr);
++ }
++ }
++ }
++
++ // Fix up a trailing dest half stripe
++ if (j < w) {
++ uint8_t * d = dst + j * dst_stride2;
++ const uint8_t * s1 = src + j * 2 * src_stride2;
++
++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
++ cpy16_to_8(d, s1, n, shr);
++ }
++ }
++}
++
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) // Returns -1 on unsupported format pair, else av_frame_copy_props() result
++{
++ const int w = av_frame_cropped_width(src);
++ const int h = av_frame_cropped_height(src);
++ const int x = src->crop_left;
++ const int y = src->crop_top;
++
++ // We will crop as part of the conversion, so dst carries no crop of its own
++ dst->crop_top = 0;
++ dst->crop_left = 0;
++ dst->crop_bottom = 0;
++ dst->crop_right = 0;
++
++ switch (src->format){
++ case AV_PIX_FMT_SAND128:
++ case AV_PIX_FMT_RPI4_8:
++ switch (dst->format){
++ case AV_PIX_FMT_YUV420P:
++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
++ dst->data[2], dst->linesize[2],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w/2, h/2);
++ break;
++ case AV_PIX_FMT_NV12:
++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], // Chroma copied as interleaved bytes
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x & ~1, y/2, w, h/2); // Offset of chroma pel x/2 in interleaved U/V is (x/2)*2, not x/2
++ break;
++ default:
++ return -1;
++ }
++ break;
++ case AV_PIX_FMT_SAND64_10:
++ switch (dst->format){
++ case AV_PIX_FMT_YUV420P10:
++ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x*2, y, w*2, h);
++ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
++ dst->data[2], dst->linesize[2],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y/2, w, h/2);
++ break;
++ default:
++ return -1;
++ }
++ break;
++ case AV_PIX_FMT_RPI4_10:
++ switch (dst->format){
++ case AV_PIX_FMT_YUV420P10:
++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
++ dst->data[2], dst->linesize[2],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w/2, h/2);
++ break;
++ case AV_PIX_FMT_NV12:
++ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], // Chroma copied as interleaved samples
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x & ~1, y/2, w, h/2); // Sample index of chroma pel x/2 in interleaved U/V is (x/2)*2
++ break;
++ default:
++ return -1;
++ }
++ break;
++ default:
++ return -1;
++ }
++
++ return av_frame_copy_props(dst, src);
++}
+--- /dev/null
++++ b/libavutil/rpi_sand_fns.h
+@@ -0,0 +1,188 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef AVUTIL_RPI_SAND_FNS
++#define AVUTIL_RPI_SAND_FNS
++
++#include "libavutil/frame.h"
++
++// For all these fns _x & _w are measured as coord * PW
++// For the C fns coords are in chroma pels (so luma / 2)
++// Strides are in bytes
++
++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++// w/h in pixels
++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
++ unsigned int w, unsigned int h, const unsigned int shr);
++
++
++// dst must contain required pixel format & allocated data buffers
++// Cropping on the src buffer will be honoured and dst crop will be set to zero
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
++
++
++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
++{
++#ifdef RPI_ZC_SAND128_ONLY
++ // If we are sure we only support 128 byte sand formats replace the
++ // var with a constant which should allow for better optimisation
++ return 128;
++#else
++ return frame->linesize[0]; // stride1 is taken from linesize[0] of the frame
++#endif
++}
++
++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
++{
++ return frame->linesize[3];
++}
++
++
++static inline int av_rpi_is_sand_format(const int format)
++{
++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10);
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++ return av_rpi_is_sand_format(frame->format);
++}
++
++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
++{
++ return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8);
++}
++
++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
++{
++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
++{
++ return (frame->format == AV_PIX_FMT_RPI4_10);
++}
++
++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
++{
++ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
++}
++
++// If x is measured in bytes (not pixels) then this works for sand64_16 as
++// well as sand128 - but in the general case we work that out
++
++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
++}
++
++#endif
++
+--- /dev/null
++++ b/pi-util/BUILD.txt
+@@ -0,0 +1,59 @@
++Building Pi FFmpeg
++==================
++
++Currently only building on a Pi is supported.
++This builds ffmpeg the way I've tested it
++
++Get all dependencies - the current package dependencies are good enough
++
++$ sudo apt-get build-dep ffmpeg
++
++Configure using the pi-util/conf_native.sh script
++-------------------------------------------------
++
++This sets the normal release options and creates an output dir to build into
++The directory name will depend on system and options but will be under out/
++
++There are a few choices here
++ --mmal build including the legacy mmal-based decoders and zero-copy code
++ this requires appropriate libraries which currently will exist for
++ armv7 but not arm64
++ --noshared
++ Build a static image rather than a shared library one. Static is
++ easier for testing as there is no need to worry about library
++ paths being confused and therefore running the wrong code, Shared
++ is what is needed, in most cases, when building for use by other
++ programs.
++
++So for a static build
++---------------------
++
++$ pi-util/conf_native.sh --noshared
++
++$ make -j8 -C out/<wherever the script said it was building to>
++
++You can now run ffmpeg directly from where it was built
++
++For a shared build
++------------------
++
++$ pi-util/conf_native.sh
++
++You will normally want an install target if shared. Note that the script has
++set this up to be generated in out/<builddir>/install, you don't have to worry
++about overwriting your system libs.
++
++$ make -j8 -C out/<builddir> install
++
++You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
++built or install the image on the system - you have to be careful to get rid
++of all other ffmpeg libs or confusion may result. There is a little script
++that wipes all other versions - obviously use with care!
++
++$ sudo pi-util/clean_usr_libs.sh
++
++Then simply copying from the install to /usr works
++
++$ sudo cp -r out/<builddir>/install/* /usr
++
++
+--- /dev/null
++++ b/pi-util/NOTES.txt
+@@ -0,0 +1,69 @@
++Notes on the hevc_rpi decoder & associated support code
++-------------------------------------------------------
++
++There are 3 main parts to the existing code:
++
++1) The decoder - this is all in libavcodec as rpi_hevc*.
++
++2) A few filters to deal with Sand frames and a small patch to
++automatically select the sand->i420 converter when required.
++
++3) A kludge in ffmpeg.c to display the decoded video. This could & should
++be converted into a proper ffmpeg display module.
++
++
++Decoder
++-------
++
++The decoder is a modified version of the existing ffmpeg hevc decoder.
++Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder.
++More complex bitstreams can be up to ~200% faster but particularly easy
++streams can cut its advantage down to ~50%. This means that a Pi3+ can
++display nearly all 8-bit 1080p30 streams and with some overclocking it can
++display most lower bitrate 10-bit 1080p30 streams - this latter case is
++not helped by the requirement to downsample to 8-bit before display on a
++Pi.
++
++It has had co-processor offload added for inter-pred and large block
++residual transform. Various parts have had optimized ARM NEON assembler
++added and the existing ARM asm sections have been profiled and
++re-optimized for A53. The main C code has been substantially reworked at
++its lower levels in an attempt to optimize it and minimize memory
++bandwidth. To some extent code paths that deal with frame types that it
++doesn't support have been pruned.
++
++It outputs frames in Broadcom Sand format. This is a somewhat annoying
++layout that doesn't fit into ffmpeg's standard frame descriptions. It has
++vertical stripes of 128 horizontal pixels (64 in 10 bit forms) with Y for
++the stripe followed by interleaved U & V, that is then followed by the Y
++for the next stripe, etc. The final stripe is always padded to
++stripe-width. This is used in an attempt to help with cache locality and
++cut down on the number of dram bank switches. It is annoying to use for
++inter-pred with conventional processing but the way the Pi QPU (which is
++used for inter-pred) works means that it has negligible downsides here and
++the improved memory performance exceeds the overhead of the increased
++complexity in the rest of the code.
++
++Frames must be allocated out of GPU memory (as otherwise they can't be
++accessed by the co-processors). Utility functions (in rpi_zc.c) have been
++written to make this easier. As the frames are already in GPU memory they
++can be displayed by the Pi h/w without any further copying.
++
++
++Known non-features
++------------------
++
++Frame allocation should probably be done in some other way in order to fit
++into the standard framework better.
++
++Sand frames are currently declared as software frames, there is an
++argument that they should be hardware frames but they aren't really.
++
++There must be a better way of auto-selecting the hevc_rpi decoder over the
++normal s/w hevc decoder, but I became confused by the existing h/w
++acceleration framework and what I wanted to do didn't seem to fit in
++neatly.
++
++Display should be a proper device rather than a kludge in ffmpeg.c
++
++
+--- /dev/null
++++ b/pi-util/TESTMESA.txt
+@@ -0,0 +1,82 @@
++# Setup & Build instructions for testing Argon30 mesa support (on Pi4)
++
++# These assume that the drm_mmal test for Sand8 has been built on this Pi
++# as build relies on many of the same files
++
++# 1st get everything required to build ffmpeg
++# If sources aren't already enabled on your Pi then enable them
++sudo su
++sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list
++sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list
++mv /tmp/sources.list /etc/apt/
++mv /tmp/raspi.list /etc/apt/sources.list.d/
++apt update
++
++# Get dependencies
++sudo apt build-dep ffmpeg
++
++sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev
++
++# Enable H265 V4L2 request decoder
++sudo su
++echo dtoverlay=rpivid-v4l2 >> /boot/config.txt
++# You may also want to add more CMA if you are going to try 4k videos
++# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read
++# dtoverlay=vc4-fkms-v3d,cma-512
++reboot
++# Check it has turned up
++ls -la /dev/video*
++# This should include video19
++# crw-rw----+ 1 root video 81, 7 Aug 4 17:25 /dev/video19
++
++# Currently on the Pi the linux headers from the debian distro don't match
++# the kernel that we ship and we need to update them - hopefully this step
++# will be unneeded in the future
++sudo apt install git bc bison flex libssl-dev make
++git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y
++cd linux
++KERNEL=kernel7l
++make bcm2711_defconfig
++make headers_install
++sudo cp -r usr/include/linux /usr/include
++cd ..
++
++# Config - this builds a statically linked ffmpeg which is easier for testing
++pi-util/conf_native.sh --noshared
++
++# Build (this is a bit dull)
++# If you want to poke the source then libavdevice/egl_vout.c contains the
++# output code -
++cd out/armv7-static-rel
++
++# Check that you have actually configured V4L2 request
++grep HEVC_V4L2REQUEST config.h
++# You are hoping for
++# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1
++# if you get 0 then the config has failed
++
++make -j6
++
++# Grab test streams
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv
++
++# Test i420 output (works currently)
++./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl -
++
++# Test Sand8 output - doesn't currently work but should once you have
++# Sand8 working in drm_mmal. I can't guarantee that this will work as
++# I can't test this path with a known working format, but the debug looks
++# good. If this doesn't work & drm_mmal does with sand8 then come back to me
++# The "show_all 1" forces vout to display every frame otherwise it drops any
++# frame that would cause it to block
++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl -
++
++# Test Sand30 - doesn't currently work
++# (Beware that when FFmpeg errors out it often leaves your terminal window
++# in a state where you need to reset it)
++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl -
++
++
++
+--- /dev/null
++++ b/pi-util/clean_usr_libs.sh
+@@ -0,0 +1,26 @@
++set -e
++U=/usr/lib/arm-linux-gnueabihf
++rm -f $U/libavcodec.*
++rm -f $U/libavdevice.*
++rm -f $U/libavfilter.*
++rm -f $U/libavformat.*
++rm -f $U/libavutil.*
++rm -f $U/libswresample.*
++rm -f $U/libswscale.*
++U=/usr/lib/arm-linux-gnueabihf/neon/vfp
++rm -f $U/libavcodec.*
++rm -f $U/libavdevice.*
++rm -f $U/libavfilter.*
++rm -f $U/libavformat.*
++rm -f $U/libavutil.*
++rm -f $U/libswresample.*
++rm -f $U/libswscale.*
++U=/usr/lib/aarch64-linux-gnu
++rm -f $U/libavcodec.*
++rm -f $U/libavdevice.*
++rm -f $U/libavfilter.*
++rm -f $U/libavformat.*
++rm -f $U/libavutil.*
++rm -f $U/libswresample.*
++rm -f $U/libswscale.*
++
+--- /dev/null
++++ b/pi-util/conf_arm64_native.sh
+@@ -0,0 +1,45 @@
++echo "Configure for ARM64 native build"
++
++#RPI_KEEPS="-save-temps=obj"
++
++SHARED_LIBS="--enable-shared"
++if [ "$1" == "--noshared" ]; then
++ SHARED_LIBS="--disable-shared"
++ echo Static libs
++ OUT=out/arm64-static-rel
++else
++ echo Shared libs
++ OUT=out/arm64-shared-rel
++fi
++
++mkdir -p $OUT
++cd $OUT
++
++A=aarch64-linux-gnu
++USR_PREFIX=`pwd`/install
++LIB_PREFIX=$USR_PREFIX/lib/$A
++INC_PREFIX=$USR_PREFIX/include/$A
++
++../../configure \
++ --prefix=$USR_PREFIX\
++ --libdir=$LIB_PREFIX\
++ --incdir=$INC_PREFIX\
++ --disable-stripping\
++ --disable-thumb\
++ --disable-mmal\
++ --enable-sand\
++ --enable-v4l2-request\
++ --enable-libdrm\
++ --enable-epoxy\
++ --enable-libudev\
++ --enable-vout-drm\
++ --enable-vout-egl\
++ $SHARED_LIBS\
++ --extra-cflags="-ggdb"
++
++# --enable-decoder=hevc_rpi\
++# --enable-extra-warnings\
++# --arch=armv71\
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+--- /dev/null
++++ b/pi-util/conf_h265.2016.csv
+@@ -0,0 +1,195 @@
++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8
++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8
++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8
++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8
++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8
++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8
++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8
++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8
++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8
++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8
++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8
++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8
++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8
++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10
++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8
++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8
++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8
++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8
++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8
++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8
++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8
++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8
++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8
++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8
++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8
++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8
++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10
++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8
++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8
++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8
++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8
++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8
++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8
++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8
++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8
++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8
++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8
++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8
++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8
++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8
++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8
++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8
++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8
++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8
++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8
++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8
++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8
++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8
++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8
++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8
++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8
++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8
++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8
++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8
++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8
++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8
++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8
++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8
++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8
++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8
++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8
++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8
++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8
++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8
++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8
++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8
++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8
++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8
++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8
++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8
++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8
++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8
++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8
++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8
++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8
++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8
++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8
++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8
++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8
++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8
++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8
++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8
++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8
++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8
++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8
++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8
++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8
++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8
++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8
++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8
++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8
++1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8
++1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8
++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8
++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8
++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8
++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8
++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8
++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8
++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8
++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8
++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8
++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8
++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8
++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8
++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8
++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8
++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8
++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10
++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8
++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8
++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8
++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10
++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8
++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8
++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10
++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8
++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0
++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8
++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8
++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10
++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8
++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8
++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0
++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10
++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0
++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0
++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0
++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0
++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8
++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8
++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0
++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8
++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0
++1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0
++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0
++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0
++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0
++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0
++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0
++0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8
++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10
++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10
++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8
++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8
++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8
++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8
++1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8
++1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8
++1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8
+--- /dev/null
++++ b/pi-util/conf_h265.2016_HEVC_v1.csv
+@@ -0,0 +1,147 @@
++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+--- /dev/null
++++ b/pi-util/conf_h265.csv
+@@ -0,0 +1,144 @@
++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+--- /dev/null
++++ b/pi-util/conf_native.sh
+@@ -0,0 +1,106 @@
++echo "Configure for native build"
++# Run from the top of the source tree; the build goes into a subdirectory of ./out.
++FFSRC=`pwd`
++MC=`dpkg --print-architecture`
++BUILDBASE=$FFSRC/out
++
++#RPI_KEEPS="-save-temps=obj"
++RPI_KEEPS=""
++
++NOSHARED=
++MMAL=
++
++while [ "$1" != "" ] ; do
++  case "$1" in
++    --noshared)
++      NOSHARED=1
++      ;;
++    --mmal)
++      MMAL=1
++      ;;
++    *)
++      echo "Usage $0: [--noshared] [--mmal]"
++      exit 1
++      ;;
++  esac
++  shift
++done
++
++
++MCOPTS=
++RPI_INCLUDES=
++RPI_LIBDIRS=
++RPI_DEFINES=
++RPI_EXTRALIBS=
++# POSIX "=" (not "==") so the tests also work when this is run by a non-bash sh.
++if [ "$MC" = "arm64" ]; then
++  echo "M/C aarch64"
++  A=aarch64-linux-gnu
++  B=arm64
++elif [ "$MC" = "armhf" ]; then
++  echo "M/C armv7"
++  A=arm-linux-gnueabihf
++  B=armv7
++  MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
++  RPI_DEFINES=-mfpu=neon-vfpv4
++else
++  echo "Unexpected architecture $MC"
++  exit 1
++fi
++
++if [ -n "$MMAL" ]; then
++  RPI_OPT_VC=/opt/vc
++  RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++  RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
++  RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
++  RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
++  RPIOPTS="--enable-mmal --enable-rpi"
++else
++  RPIOPTS="--disable-mmal --enable-sand"
++fi
++
++C=`lsb_release -sc`
++V=`cat RELEASE`
++
++SHARED_LIBS="--enable-shared"
++if [ -n "$NOSHARED" ]; then
++  SHARED_LIBS="--disable-shared"
++  OUT=$BUILDBASE/$B-$C-$V-static-rel
++  echo Static libs
++else
++  echo Shared libs
++  OUT=$BUILDBASE/$B-$C-$V-shared-rel
++fi
++
++USR_PREFIX=$OUT/install
++LIB_PREFIX=$USR_PREFIX/lib/$A
++INC_PREFIX=$USR_PREFIX/include/$A
++
++echo "Destination directory: $OUT"
++mkdir -p "$OUT" || exit 1
++# Nothing under here need worry git - including this .gitignore!
++echo "**" > "$BUILDBASE/.gitignore"
++cd "$OUT" || exit 1  # never run configure from the wrong directory
++
++"$FFSRC/configure" \
++  --prefix="$USR_PREFIX"\
++  --libdir="$LIB_PREFIX"\
++  --incdir="$INC_PREFIX"\
++  $MCOPTS\
++  --disable-stripping\
++  --disable-thumb\
++  --enable-v4l2-request\
++  --enable-libdrm\
++  --enable-vout-egl\
++  --enable-vout-drm\
++  $SHARED_LIBS\
++  $RPIOPTS\
++  --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
++  --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
++  --extra-ldflags="$RPI_LIBDIRS"\
++  --extra-libs="$RPI_EXTRALIBS"\
++  --extra-version="rpi"
++
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+--- /dev/null
++++ b/pi-util/ffconf.py
+@@ -0,0 +1,215 @@
++#!/usr/bin/env python3
++
++import string
++import os
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++CODEC_HEVC_RPI = 1
++HWACCEL_RPI = 2
++HWACCEL_DRM = 3
++HWACCEL_VAAPI = 4
++
++def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec, ffmpeg_exec):
++ hwaccel = ""
++ if dectype == HWACCEL_RPI:
++ hwaccel = "rpi"
++ elif dectype == HWACCEL_DRM:
++ hwaccel = "drm"
++ elif dectype == HWACCEL_VAAPI:
++ hwaccel = "vaapi"
++
++ pix_fmt = []
++ if pix == "8":
++ pix_fmt = ["-pix_fmt", "yuv420p"]
++ elif pix == "10":
++ pix_fmt = ["-pix_fmt", "yuv420p10le"]
++ elif pix == "12":
++ pix_fmt = ["-pix_fmt", "yuv420p12le"]
++
++ tmp_root = "/tmp"
++
++ names = srcname.split('/')
++ while len(names) > 1:
++ tmp_root = os.path.join(tmp_root, names[0])
++ del names[0]
++ name = names[0]
++
++ if not os.path.exists(tmp_root):
++ os.makedirs(tmp_root)
++
++ dec_file = os.path.join(tmp_root, name + ".dec.md5")
++ try:
++ os.remove(dec_file)
++ except:
++ pass
++
++ flog = open(os.path.join(tmp_root, name + ".log"), "wt")
++
++ ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file)] + pix_fmt + ["-f", "md5", dec_file]
++
++ # Unaligned needed for cropping conformance
++ if hwaccel:
++ rstr = subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT)
++ else:
++ rstr = subprocess.call(
++ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
++ stdout=flog, stderr=subprocess.STDOUT)
++
++ try:
++ m1 = None
++ m2 = None
++ with open(os.path.join(fileroot, md5_file)) as f:
++ for line in f:
++ m1 = re.search("[0-9a-f]{32}", line.lower())
++ if m1:
++ break
++
++ with open(dec_file) as f:
++ m2 = re.search("[0-9a-f]{32}", f.readline())
++ except:
++ pass
++
++ if m1 and m2 and m1.group() == m2.group():
++ print("Match: " + m1.group(), file=flog)
++ rv = 0
++ elif not m1:
++ print("****** Cannot find m1", file=flog)
++ rv = 3
++ elif not m2:
++ print("****** Cannot find m2", file=flog)
++ rv = 2
++ else:
++ print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog)
++ rv = 1
++ flog.close()
++ return rv
++
++def scandir(root):
++ aconf = []
++ ents = os.listdir(root)
++ ents.sort(key=str.lower)
++ for name in ents:
++ test_path = os.path.join(root, name)
++ if S_ISDIR(os.stat(test_path).st_mode):
++ files = os.listdir(test_path)
++ es_file = "?"
++ md5_file = "?"
++ for f in files:
++ (base, ext) = os.path.splitext(f)
++ if base[0] == '.':
++ pass
++ elif ext == ".bit" or ext == ".bin":
++ es_file = f
++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
++ if md5_file == "?":
++ md5_file = f
++ elif base[-3:] == "yuv":
++ md5_file = f
++ aconf.append((1, name, es_file, md5_file))
++ return aconf
++
++def runtest(name, tests):
++ if not tests:
++ return True
++ for t in tests:
++ if name[0:len(t)] == t or name.find("/" + t) != -1:
++ return True
++ return False
++
++def doconf(csva, tests, test_root, vcodec, dectype, ffmpeg_exec):
++ unx_failures = []
++ unx_success = []
++ failures = 0
++ successes = 0
++ for a in csva:
++ exp_test = int(a[0])
++ if (exp_test and runtest(a[1], tests)):
++ name = a[1]
++ print ("==== ", name, end="")
++ sys.stdout.flush()
++
++ rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4], dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec)
++ if (rv == 0):
++ successes += 1
++ else:
++ failures += 1
++
++ if (rv == 0):
++ if exp_test == 2:
++ print(": * OK *")
++ unx_success.append(name)
++ else:
++ print(": ok")
++ elif exp_test == 2 and rv == 1:
++ print(": fail")
++ elif exp_test == 3 and rv == 2:
++ # Call an expected "crash" an abort
++ print(": abort")
++ else:
++ unx_failures.append(name)
++ if rv == 1:
++ print(": * FAIL *")
++ elif (rv == 2) :
++ print(": * CRASH *")
++ elif (rv == 3) :
++ print(": * MD5 MISSING *")
++ else :
++ print(": * BANG *")
++
++ if unx_failures or unx_success:
++ print("Unexpected Failures:", unx_failures)
++ print("Unexpected Success: ", unx_success)
++ else:
++ print("All tests normal:", successes, "ok,", failures, "failed")
++
++
++class ConfCSVDialect(csv.Dialect):  # dialect passed to csv.reader when loading the test list
++ delimiter = ','
++ doublequote = True
++ lineterminator = '\n'
++ quotechar='"'
++ quoting = csv.QUOTE_MINIMAL
++ skipinitialspace = True  # ignore blanks after a comma, as in rows ending ", # note"
++ strict = True  # raise csv.Error on malformed input
++
++if __name__ == '__main__':
++
++ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
++ argp.add_argument("tests", nargs='*')
++ argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line")
++ argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line")
++ argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line")
++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
++ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
++ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
++ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
++ args = argp.parse_args()
++
++ if args.csvgen:
++ csv.writer(sys.stdout).writerows(scandir(args.test_root))
++ exit(0)
++
++ with open(args.csv, 'rt') as csvfile:
++ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
++
++ dectype = CODEC_HEVC_RPI
++ if os.path.exists("/dev/rpivid-hevcmem"):
++ dectype = HWACCEL_RPI
++ if args.drm or os.path.exists("/sys/module/rpivid_hevc"):
++ dectype = HWACCEL_DRM
++
++ if args.pi4:
++ dectype = HWACCEL_RPI
++ elif args.drm:
++ dectype = HWACCEL_DRM
++ elif args.vaapi:
++ dectype = HWACCEL_VAAPI
++
++ doconf(csva, args.tests, args.test_root, args.vcodec, dectype, args.ffmpeg)
++
+--- /dev/null
++++ b/pi-util/ffperf.py
+@@ -0,0 +1,128 @@
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:
++ close_threshold = 0.01
++
++ def __init__(self, stats_dict=None):
++ if stats_dict != None:
++ self.name = stats_dict["name"]
++ self.elapsed = float(stats_dict["elapsed"])
++ self.user = float(stats_dict["user"])
++ self.sys = float(stats_dict["sys"])
++
++ def times_str(self):
++ ctime = self.sys + self.user
++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
++
++ def dict(self):
++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++ def is_close(self, other):
++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++ def __lt__(self, other):
++ return self.elapsed < other.elapsed
++ def __gt__(self, other):
++ return self.elapsed > other.elapsed
++
++ def time_file(name, prefix, ffmpeg="./ffmpeg"):
++ stats = tstats()
++ stats.name = name
++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++ cproc = subprocess.Popen([ffmpeg, "-no_cvt_hw",
++ "-vcodec", "hevc_rpi",
++ "-t", "30", "-i", prefix + name,
++ "-f", "vout_rpi", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
++ pinfo = os.wait4(cproc.pid, 0)
++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++ stats.elapsed = end_time - start_time
++ stats.user = pinfo[2].ru_utime
++ stats.sys = pinfo[2].ru_stime
++ return stats
++
++
++def common_prefix(s1, s2):
++ for i in range(min(len(s1),len(s2))):
++ if s1[i] != s2[i]:
++ return s1[:i]
++ return s1[:i+1]
++
++def main():
++ global flog
++
++ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
++To blank the screen before starting use "xdg-screensaver activate"
++(For some reason this doesn't seem to work from within python).
++""")
++
++ argp.add_argument("streams", nargs='*')
++ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
++ argp.add_argument("--csv_in", help="CSV input filename")
++ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
++ argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
++ argp.add_argument("--ffmpeg", default="./ffmpeg", help="FFmpeg executable")
++
++ args = argp.parse_args()
++
++ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
++ csv_out.writeheader()
++
++ stats_in = {}
++ if args.csv_in != None:
++ with open(args.csv_in, 'r', newline='') as f_in:
++ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++
++ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
++
++ streams = args.streams
++ if not streams:
++ if not stats_in:
++ print ("No source streams specified")
++ return 1
++ prefix = "" if args.prefix == None else args.prefix
++ streams = [k for k in stats_in]
++ elif args.prefix != None:
++ prefix = args.prefix
++ else:
++ prefix = streams[0]
++ for f in streams[1:]:
++ prefix = common_prefix(prefix, f)
++ pp = prefix.rpartition(os.sep)
++ prefix = pp[0] + pp[1]
++ streams = [s[len(prefix):] for s in streams]
++
++ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
++ print ("====", f)
++
++ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
++ for i in range(args.repeat):
++ t = tstats.time_file(f, prefix, args.ffmpeg)
++ print ("...", t.times_str())
++ if t0 > t:
++ t0 = t
++
++ if t0.name in stats_in:
++ pstat = stats_in[t0.name]
++ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
++
++ csv_out.writerow(t0.dict())
++
++ print ()
++
++ return 0
++
++
++if __name__ == '__main__':  # run only when executed as a script
++ exit(main())  # main() returns the exit status: 0, or 1 when no streams are given
++
+--- /dev/null
++++ b/pi-util/genpatch.sh
+@@ -0,0 +1,35 @@
++set -e
++
++NOPATCH=
++if [ "$1" == "--notag" ]; then
++ shift
++ NOPATCH=1
++fi
++
++if [ "$1" == "" ]; then
++ echo Usage: $0 [--notag] \<patch_tag\>
++ echo e.g.: $0 mmal_4
++ exit 1
++fi
++
++VERSION=`cat RELEASE`
++if [ "$VERSION" == "" ]; then
++ echo Can\'t find version RELEASE
++ exit 1
++fi
++
++PATCHFILE=../ffmpeg-$VERSION-$1.patch
++
++if [ $NOPATCH ]; then
++ echo Not tagged
++else
++ # Only continue if we are all comitted
++ git diff --name-status --exit-code
++
++ PATCHTAG=pi/$VERSION/$1
++ echo Tagging: $PATCHTAG
++
++ git tag $PATCHTAG
++fi
++echo Generating patch: $PATCHFILE
++git diff n$VERSION -- > $PATCHFILE
+--- /dev/null
++++ b/pi-util/make_array.py
+@@ -0,0 +1,23 @@
++#!/usr/bin/env python
++
++# Usage
++# make_array file.bin
++# Produces file.h with array of bytes.
++#
++import sys
++for file in sys.argv[1:]:
++ prefix,suffix = file.split('.')
++ assert suffix=='bin'
++ name=prefix.split('/')[-1]
++ print 'Converting',file
++ with open(prefix+'.h','wb') as out:
++ print >>out, 'static const unsigned char',name,'[] = {'
++ with open(file,'rb') as fd:
++ i = 0
++ for byte in fd.read():
++ print >>out, '0x%02x, ' % ord(byte),
++ i = i + 1
++ if i % 8 == 0:
++ print >>out, ' // %04x' % (i - 8)
++ print >>out,'};'
++
+--- /dev/null
++++ b/pi-util/mkinst.sh
+@@ -0,0 +1,5 @@
++set -e
++
++make install
++
++cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr
+--- /dev/null
++++ b/pi-util/patkodi.sh
+@@ -0,0 +1,9 @@
++set -e
++KODIBASE=${KODIBASE:-/home/jc/rpi/kodi/xbmc}  # Kodi checkout; set KODIBASE in the environment to override
++JOBS=-j20
++make $JOBS
++git diff xbmc/release/4.3-kodi > "$KODIBASE/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch"
++make -C "$KODIBASE/tools/depends/target/ffmpeg" $JOBS
++make -C "$KODIBASE/build" install
++
++
+--- /dev/null
++++ b/pi-util/perfcmp.py
+@@ -0,0 +1,101 @@
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:
++ close_threshold = 0.01
++
++ def __init__(self, stats_dict=None):
++ if stats_dict != None:
++ self.name = stats_dict["name"]
++ self.elapsed = float(stats_dict["elapsed"])
++ self.user = float(stats_dict["user"])
++ self.sys = float(stats_dict["sys"])
++
++ def times_str(self):
++ ctime = self.sys + self.user
++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
++
++ def dict(self):
++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++ def is_close(self, other):
++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++ def __lt__(self, other):
++ return self.elapsed < other.elapsed
++ def __gt__(self, other):
++ return self.elapsed > other.elapsed
++
++ def time_file(name, prefix):
++ stats = tstats()
++ stats.name = name
++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
++ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
++ pinfo = os.wait4(cproc.pid, 0)
++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++ stats.elapsed = end_time - start_time
++ stats.user = pinfo[2].ru_utime
++ stats.sys = pinfo[2].ru_stime
++ return stats
++
++
++def common_prefix(s1, s2):
++ for i in range(min(len(s1),len(s2))):
++ if s1[i] != s2[i]:
++ return s1[:i]
++ return s1[:i+1]
++
++def main():
++ argp = argparse.ArgumentParser(description="FFmpeg performance compare")
++
++ argp.add_argument("stream0", help="CSV to compare")
++ argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
++
++ args = argp.parse_args()
++
++ with open(args.stream0, 'r', newline='') as f_in:
++ stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++ with open(args.stream1, 'r', newline='') as f_in:
++ stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++
++ print (args.stream0, "<<-->>", args.stream1)
++ print ()
++
++ for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()):
++ if not (f in stats0) :
++ print (" XX :", f)
++ continue
++ if not (f in stats1) :
++ print (" XX :", f)
++ continue
++
++ s0 = stats0[f]
++ s1 = stats1[f]
++
++ pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
++ thresh = 0.3
++ tc = 6
++
++ nchar = min(tc - 1, int(abs(pcent) / thresh))
++ cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar
++
++ print ("%6.2f %s%6.2f (%+5.2f) : %s" %
++ (s0.elapsed, cc, s1.elapsed, pcent, f))
++
++ return 0
++
++
++if __name__ == '__main__':  # run only when executed as a script
++ exit(main())  # exit status is main()'s return value
++
+--- /dev/null
++++ b/pi-util/qem.sh
+@@ -0,0 +1,9 @@
++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
++QASM=python\ ../local/bin/qasm.py
++SRC_FILE=libavcodec/rpi_hevc_shader.qasm
++DST_BASE=shader
++# Copy the command header, then run qasm.py twice on SRC_FILE to write $DST_BASE.c and $DST_BASE.h into TARGET_DIR.
++cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
++
+--- /dev/null
++++ b/pi-util/v3dusage.py
+@@ -0,0 +1,128 @@
++#!/usr/bin/env python
++
++import sys
++import argparse
++import re
++
++def do_logparse(logname):
++ # Read the log file "logname" and print per-unit busy time plus QPU cycle and L2 cache totals.
++ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
++ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
++ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
++ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
++ # Running state; the dicts are keyed by unit name, with qpu1 entries keyed "qpu1.<qpu_op_no>".
++ ttotal = {'idle':0.0}
++ tstart = {}
++ qctotal = {}
++ qtstotal = {}
++ l2hits = {}
++ l2total = {}
++ time0 = None
++ idle_start = None
++ qpu_op_no = 0
++ op_count = 0
++
++ with open(logname, "rt") as infile:
++ for line in infile:
++ match = rmatch.match(line)
++ if match:
++# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
++ time = float(match.group(1))
++ unit = match.group(3)
++ opstart = not match.group(2)  # a line without "done " marks the start of an op
++ optype = match.group(7)
++ hascb = match.group(8) != "0"  # the cb: field is something other than "0"
++
++ if unit == 'qpu1':
++ unit = unit + "." + str(qpu_op_no)
++ if not opstart:
++ if hascb or optype == 'EXECUTE_SYNC':
++ qpu_op_no = 0
++ else:
++ qpu_op_no += 1
++
++ # Ignore sync type
++ if optype == 'EXECUTE_SYNC':
++ continue
++
++ if not time0:
++ time0 = time
++
++ if opstart:
++ tstart[unit] = time;
++ elif unit in tstart:
++ op_count += 1
++ if not unit in ttotal:
++ ttotal[unit] = 0.0
++ ttotal[unit] += time - tstart[unit]
++ del tstart[unit]
++
++ if not idle_start and not tstart:
++ idle_start = time
++ elif idle_start and tstart:
++ ttotal['idle'] += time - idle_start
++ idle_start = None
++ # The three counter line types below are added to the totals for "qpu1.<qpu_op_no>".
++ match = rqcycle.match(line)
++ if match:
++ unit = "qpu1." + str(qpu_op_no)
++ if not unit in qctotal:
++ qctotal[unit] = 0
++ qctotal[unit] += int(match.group(2))
++
++ match = rqtscycle.match(line)
++ if match:
++ unit = "qpu1." + str(qpu_op_no)
++ if not unit in qtstotal:
++ qtstotal[unit] = 0
++ qtstotal[unit] += int(match.group(2))
++
++ match = rl2hits.match(line)
++ if match:
++ unit = "qpu1." + str(qpu_op_no)
++ if not unit in l2total:
++ l2total[unit] = 0
++ l2hits[unit] = 0
++ l2total[unit] += int(match.group(3))
++ if match.group(2) == "hits":
++ l2hits[unit] += int(match.group(3))
++
++ # NOTE(review): the report below uses Python 2 print statements, so it will not run under Python 3.
++ if not time0:
++ print "No v3d profile records found"
++ else:
++ tlogged = time - time0  # last matched op line's time minus the first counted op's time
++
++ print "Logged time:", tlogged, " Op count:", op_count
++ for unit in sorted(ttotal):
++ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++ print
++ for unit in sorted(qctotal):
++ if not unit in qtstotal:
++ qtstotal[unit] = 0;
++ print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
++ if unit in l2total:
++ print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
++
++
++
++if __name__ == '__main__':
++ argp = argparse.ArgumentParser(
++ formatter_class=argparse.RawDescriptionHelpFormatter,
++ description="QPU/VPU perf summary from VC logging",
++ epilog = """
++Will also summarise TMU stalls if logging requests set in qpu noflush param
++in the profiled code.
++
++Example use:
++ vcgencmd set_logging level=0xc0
++ <command to profile>
++ sudo vcdbg log msg >& t.log
++ v3dusage.py t.log
++""")
++
++ argp.add_argument("logfile")
++ args = argp.parse_args()
++
++ do_logparse(args.logfile)
++
+--- a/tests/checkasm/Makefile
++++ b/tests/checkasm/Makefile
+@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP)
+ AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o
+ AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o
+ AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o
++AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o
+ AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o
+ AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o
++AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o
+ AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o
+ AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o
+
+--- a/tests/checkasm/checkasm.c
++++ b/tests/checkasm/checkasm.c
+@@ -121,6 +121,9 @@ static const struct {
+ #if CONFIG_HUFFYUV_DECODER
+ { "huffyuvdsp", checkasm_check_huffyuvdsp },
+ #endif
++ #if CONFIG_IDCTDSP
++ { "idctdsp", checkasm_check_idctdsp },
++ #endif
+ #if CONFIG_JPEG2000_DECODER
+ { "jpeg2000dsp", checkasm_check_jpeg2000dsp },
+ #endif
+@@ -145,6 +148,9 @@ static const struct {
+ #if CONFIG_V210_ENCODER
+ { "v210enc", checkasm_check_v210enc },
+ #endif
++ #if CONFIG_VC1DSP
++ { "vc1dsp", checkasm_check_vc1dsp },
++ #endif
+ #if CONFIG_VP8DSP
+ { "vp8dsp", checkasm_check_vp8dsp },
+ #endif
+--- a/tests/checkasm/checkasm.h
++++ b/tests/checkasm/checkasm.h
+@@ -60,6 +60,7 @@ void checkasm_check_hevc_add_res(void);
+ void checkasm_check_hevc_idct(void);
+ void checkasm_check_hevc_sao(void);
+ void checkasm_check_huffyuvdsp(void);
++void checkasm_check_idctdsp(void);
+ void checkasm_check_jpeg2000dsp(void);
+ void checkasm_check_llviddsp(void);
+ void checkasm_check_llviddspenc(void);
+@@ -73,6 +74,7 @@ void checkasm_check_sw_scale(void);
+ void checkasm_check_utvideodsp(void);
+ void checkasm_check_v210dec(void);
+ void checkasm_check_v210enc(void);
++void checkasm_check_vc1dsp(void);
+ void checkasm_check_vf_eq(void);
+ void checkasm_check_vf_gblur(void);
+ void checkasm_check_vf_hflip(void);
+--- /dev/null
++++ b/tests/checkasm/idctdsp.c
+@@ -0,0 +1,98 @@
++/*
++ * Copyright (c) 2022 Ben Avison
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++
++#include "checkasm.h"
++
++#include "libavcodec/idctdsp.h"
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) },
++
++typedef struct {
++ const char *name;
++ size_t offset;
++} test;
++
++#define RANDOMIZE_BUFFER16(name, size) \
++ do { \
++ int i; \
++ for (i = 0; i < size; ++i) { \
++ uint16_t r = rnd() % 0x201 - 0x100; \
++ AV_WN16A(name##0 + i, r); \
++ AV_WN16A(name##1 + i, r); \
++ } \
++ } while (0)
++
++#define RANDOMIZE_BUFFER8(name, size) \
++ do { \
++ int i; \
++ for (i = 0; i < size; ++i) { \
++ uint8_t r = rnd(); \
++ name##0[i] = r; \
++ name##1[i] = r; \
++ } \
++ } while (0)
++
++static void check_add_put_clamped(void)
++{
++    /* Compares each *_pixels_clamped entry of IDCTDSPContext with its reference. Source buffers are only as big as needed, since any over-read won't affect results */
++    LOCAL_ALIGNED_16(int16_t, src0, [64]);
++    LOCAL_ALIGNED_16(int16_t, src1, [64]);
++    /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */
++    LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]);
++    LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]);
++
++    AVCodecContext avctx = { 0 };
++    IDCTDSPContext h;
++
++    const test tests[] = {
++        IDCTDSP_TEST(add_pixels_clamped)
++        IDCTDSP_TEST(put_pixels_clamped)
++        IDCTDSP_TEST(put_signed_pixels_clamped)
++    };
++
++    ff_idctdsp_init(&h, &avctx);
++
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(const int16_t *, uint8_t *, ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset); /* member fetched by offset; type must match declare_func_emms below */
++        if (check_func(func, "idctdsp.%s", tests[t].name)) {
++            declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t);
++            RANDOMIZE_BUFFER16(src, 64);
++            RANDOMIZE_BUFFER8(dst, 10 * 24);
++            call_ref(src0, dst0 + 24 + 8, 24); /* + 24 + 8 skips the top border row and left border columns; stride is 24 */
++            call_new(src1, dst1 + 24 + 8, 24);
++            if (memcmp(dst0, dst1, 10 * 24)) /* whole buffer, borders included, so stray writes are detected */
++                fail();
++            bench_new(src1, dst1 + 24 + 8, 24);
++        }
++    }
++}
++
++void checkasm_check_idctdsp(void)
++{
++ check_add_put_clamped();
++ report("idctdsp");
++}
+--- /dev/null
++++ b/tests/checkasm/vc1dsp.c
+@@ -0,0 +1,452 @@
++/*
++ * Copyright (c) 2022 Ben Avison
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++
++#include "checkasm.h"
++
++#include "libavcodec/vc1dsp.h"
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
++#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
++
++typedef struct {
++ const char *name;
++ size_t offset;
++ int width;
++ int height;
++} test;
++
++typedef struct matrix {
++ size_t width;
++ size_t height;
++ float d[];
++} matrix;
++
++static const matrix T8 = { 8, 8, {
++ 12, 12, 12, 12, 12, 12, 12, 12,
++ 16, 15, 9, 4, -4, -9, -15, -16,
++ 16, 6, -6, -16, -16, -6, 6, 16,
++ 15, -4, -16, -9, 9, 16, 4, -15,
++ 12, -12, -12, 12, 12, -12, -12, 12,
++ 9, -16, 4, 15, -15, -4, 16, -9,
++ 6, -16, 16, -6, -6, 16, -16, 6,
++ 4, -9, 15, -16, 16, -15, 9, -4
++} };
++
++static const matrix T4 = { 4, 4, {
++ 17, 17, 17, 17,
++ 22, 10, -10, -22,
++ 17, -17, -17, 17,
++ 10, -22, 22, -10
++} };
++
++static const matrix T8t = { 8, 8, {
++ 12, 16, 16, 15, 12, 9, 6, 4,
++ 12, 15, 6, -4, -12, -16, -16, -9,
++ 12, 9, -6, -16, -12, 4, 16, 15,
++ 12, 4, -16, -9, 12, 15, -6, -16,
++ 12, -4, -16, 9, 12, -15, -6, 16,
++ 12, -9, -6, 16, -12, -4, 16, -15,
++ 12, -15, 6, 4, -12, 16, -16, 9,
++ 12, -16, 16, -15, 12, -9, 6, -4
++} };
++
++static const matrix T4t = { 4, 4, {
++ 17, 22, 17, 10,
++ 17, 10, -17, -22,
++ 17, -10, -17, 22,
++ 17, -22, 17, -10
++} };
++
++static matrix *new_matrix(size_t width, size_t height)
++{
++ matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
++ if (out == NULL) {
++ fprintf(stderr, "Memory allocation failure\n");
++ exit(EXIT_FAILURE);
++ }
++ out->width = width;
++ out->height = height;
++ return out;
++}
++
++static matrix *multiply(const matrix *a, const matrix *b)
++{
++ matrix *out;
++ if (a->width != b->height) {
++ fprintf(stderr, "Incompatible multiplication\n");
++ exit(EXIT_FAILURE);
++ }
++ out = new_matrix(b->width, a->height);
++ for (int j = 0; j < out->height; ++j)
++ for (int i = 0; i < out->width; ++i) {
++ float sum = 0;
++ for (int k = 0; k < a->width; ++k)
++ sum += a->d[j * a->width + k] * b->d[k * b->width + i];
++ out->d[j * out->width + i] = sum;
++ }
++ return out;
++}
++
++static void normalise(matrix *a)
++{
++ for (int j = 0; j < a->height; ++j)
++ for (int i = 0; i < a->width; ++i) {
++ float *p = a->d + j * a->width + i;
++ *p *= 64;
++ if (a->height == 4)
++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
++ else
++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
++ if (a->width == 4)
++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
++ else
++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
++ }
++}
++
++static void divide_and_round_nearest(matrix *a, float by)
++{
++ for (int j = 0; j < a->height; ++j)
++ for (int i = 0; i < a->width; ++i) {
++ float *p = a->d + j * a->width + i;
++ *p = rintf(*p / by);
++ }
++}
++
++static void tweak(matrix *a)
++{
++ for (int j = 4; j < a->height; ++j)
++ for (int i = 0; i < a->width; ++i) {
++ float *p = a->d + j * a->width + i;
++ *p += 1;
++ }
++}
++
++/* The VC-1 spec places restrictions on the values permitted at three
++ * different stages:
++ * - D: the input coefficients in frequency domain
++ * - E: the intermediate coefficients, inverse-transformed only horizontally
++ * - R: the fully inverse-transformed coefficients
++ *
++ * To fully cater for the ranges specified requires various intermediate
++ * values to be held to 17-bit precision; yet these conditions do not appear
++ * to be utilised in real-world streams. At least some assembly
++ * implementations have chosen to restrict these values to 16-bit precision,
++ * to accelerate the decoding of real-world streams at the cost of strict
++ * adherence to the spec. To avoid our test marking these as failures,
++ * reduce our random inputs.
++ */
++#define ATTENUATION 4
++
++static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
++{
++ matrix *raw, *tmp, *D, *E, *R;
++ raw = new_matrix(width, height);
++ for (int i = 0; i < width * height; ++i)
++ raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
++ tmp = multiply(height == 8 ? &T8 : &T4, raw);
++ D = multiply(tmp, width == 8 ? &T8t : &T4t);
++ normalise(D);
++ divide_and_round_nearest(D, 1);
++ for (int i = 0; i < width * height; ++i) {
++ if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
++ /* Rare, so simply try again */
++ av_free(raw);
++ av_free(tmp);
++ av_free(D);
++ return generate_inverse_quantized_transform_coefficients(width, height);
++ }
++ }
++ E = multiply(D, width == 8 ? &T8 : &T4);
++ divide_and_round_nearest(E, 8);
++ for (int i = 0; i < width * height; ++i)
++ if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
++ /* Rare, so simply try again */
++ av_free(raw);
++ av_free(tmp);
++ av_free(D);
++ av_free(E);
++ return generate_inverse_quantized_transform_coefficients(width, height);
++ }
++ R = multiply(height == 8 ? &T8t : &T4t, E);
++ tweak(R);
++ divide_and_round_nearest(R, 128);
++ for (int i = 0; i < width * height; ++i)
++ if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
++ /* Rare, so simply try again */
++ av_free(raw);
++ av_free(tmp);
++ av_free(D);
++ av_free(E);
++ av_free(R);
++ return generate_inverse_quantized_transform_coefficients(width, height);
++ }
++ av_free(raw);
++ av_free(tmp);
++ av_free(E);
++ av_free(R);
++ return D;
++}
++
++#define RANDOMIZE_BUFFER16(name, size) \
++ do { \
++ int i; \
++ for (i = 0; i < size; ++i) { \
++ uint16_t r = rnd(); \
++ AV_WN16A(name##0 + i, r); \
++ AV_WN16A(name##1 + i, r); \
++ } \
++ } while (0)
++
++#define RANDOMIZE_BUFFER8(name, size) \
++ do { \
++ int i; \
++ for (i = 0; i < size; ++i) { \
++ uint8_t r = rnd(); \
++ name##0[i] = r; \
++ name##1[i] = r; \
++ } \
++ } while (0)
++
++#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \
++ do { \
++ uint8_t *p##0 = name##0, *p##1 = name##1; \
++ int i = (size); \
++ while (i-- > 0) { \
++ int x = 0x80 | (rnd() & 0x7F); \
++ x >>= rnd() % 9; \
++ if (rnd() & 1) \
++ x = -x; \
++ *p##1++ = *p##0++ = 0x80 + x; \
++ } \
++ } while (0)
++
++static void check_inv_trans_inplace(void)
++{
++ /* Inverse transform input coefficients are stored in a 16-bit buffer
++ * with row stride of 8 coefficients irrespective of transform size.
++ * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
++ * are stored in column-major order, and the outputs are written back
++ * to the input buffer, so we oversize it slightly to catch overruns. */
++ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
++ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
++
++ VC1DSPContext h;
++
++ ff_vc1dsp_init(&h);
++
++ if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
++ matrix *coeffs;
++ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
++ RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
++ coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
++ for (int j = 0; j < 8; ++j)
++ for (int i = 0; i < 8; ++i) {
++ int idx = 8 + i * 8 + j;
++ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
++ }
++ call_ref(inv_trans_in0 + 8);
++ call_new(inv_trans_in1 + 8);
++ if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t)))
++ fail();
++ bench_new(inv_trans_in1 + 8);
++ av_free(coeffs);
++ }
++}
++
++static void check_inv_trans_adding(void)
++{
++    /* Inverse transform input coefficients are stored in a 16-bit buffer
++     * with row stride of 8 coefficients irrespective of transform size. */
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
++
++    /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
++     * added with saturation to an array of unsigned 8-bit values. Oversize
++     * this by 8 samples left and right and one row above and below. */
++    LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
++    LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
++
++    VC1DSPContext h;
++
++    const test tests[] = {
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
++    };
++
++    ff_vc1dsp_init(&h);
++
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset); /* member fetched by offset */
++        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++            matrix *coeffs;
++            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
++            RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
++            RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
++            coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
++            for (int j = 0; j < tests[t].height; ++j) /* copy width x height coefficients into the stride-8 input */
++                for (int i = 0; i < tests[t].width; ++i) {
++                    int idx = j * 8 + i;
++                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
++                }
++            call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0); /* + 24 + 8 skips the output border; stride is 24 */
++            call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
++            if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24)) /* whole buffer, borders included */
++                fail();
++            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); /* no + 8: this input has no guard row, so an offset would over-read on 8-row transforms */
++            av_free(coeffs);
++        }
++    }
++}
++
++static void check_loop_filter(void)
++{
++ /* Deblocking filter buffers are big enough to hold a 16x16 block,
++ * plus 16 columns left and 4 rows above to hold filter inputs
++ * (depending on whether v or h neighbouring block edge, oversized
++ * horizontally to maintain 16-byte alignment) plus 16 columns and
++ * 4 rows below to catch write overflows */
++ LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
++ LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
++
++ VC1DSPContext h;
++
++ const test tests[] = {
++ VC1DSP_TEST(vc1_v_loop_filter4)
++ VC1DSP_TEST(vc1_h_loop_filter4)
++ VC1DSP_TEST(vc1_v_loop_filter8)
++ VC1DSP_TEST(vc1_h_loop_filter8)
++ VC1DSP_TEST(vc1_v_loop_filter16)
++ VC1DSP_TEST(vc1_h_loop_filter16)
++ };
++
++ ff_vc1dsp_init(&h);
++
++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++ void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
++ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
++ if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++ for (int count = 1000; count > 0; --count) {
++ int pq = rnd() % 31 + 1;
++ RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
++ call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
++ call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
++ if (memcmp(filter_buf0, filter_buf1, 24 * 48))
++ fail();
++ }
++ }
++ for (int j = 0; j < 24; ++j)
++ for (int i = 0; i < 48; ++i)
++ filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
++ if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
++ if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
++ }
++}
++
++#define TEST_UNESCAPE \
++ do { \
++ for (int count = 100; count > 0; --count) { \
++ escaped_offset = rnd() & 7; \
++ unescaped_offset = rnd() & 7; \
++ escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \
++ RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \
++ len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
++ len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
++ if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \
++ fail(); \
++ } \
++ } while (0)
++
++static void check_unescape(void)
++{
++ /* This appears to be a typical length of buffer in use */
++#define LOG2_UNESCAPE_BUF_SIZE 17
++#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
++ LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
++ LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
++ LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
++ LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
++
++ VC1DSPContext h;
++
++ ff_vc1dsp_init(&h);
++
++ if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
++ int len0, len1, escaped_offset, unescaped_offset, escaped_len;
++ declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
++
++ /* Test data which consists of escapes sequences packed as tightly as possible */
++ for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
++ escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
++ TEST_UNESCAPE;
++
++ /* Test random data */
++ RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
++ TEST_UNESCAPE;
++
++ /* Test data with escape sequences at random intervals */
++ for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
++ int gap, gap_msb;
++ escaped1[x+0] = escaped0[x+0] = 0;
++ escaped1[x+1] = escaped0[x+1] = 0;
++ escaped1[x+2] = escaped0[x+2] = 3;
++ escaped1[x+3] = escaped0[x+3] = rnd() & 3;
++ gap_msb = 2u << (rnd() % 8);
++ gap = (rnd() &~ -gap_msb) | gap_msb;
++ x += gap;
++ }
++ TEST_UNESCAPE;
++
++ /* Test data which is known to contain no escape sequences */
++ memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
++ memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
++ TEST_UNESCAPE;
++
++ /* Benchmark the no-escape-sequences case */
++ bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
++ }
++}
++
++void checkasm_check_vc1dsp(void)
++{
++ check_inv_trans_inplace();
++ check_inv_trans_adding();
++ report("inv_trans");
++
++ check_loop_filter();
++ report("loop_filter");
++
++ check_unescape();
++ report("unescape_buffer");
++}
+--- a/tests/fate/checkasm.mak
++++ b/tests/fate/checkasm.mak
+@@ -16,6 +16,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
+ fate-checkasm-hevc_add_res \
+ fate-checkasm-hevc_idct \
+ fate-checkasm-hevc_sao \
++ fate-checkasm-idctdsp \
+ fate-checkasm-jpeg2000dsp \
+ fate-checkasm-llviddsp \
+ fate-checkasm-llviddspenc \
+@@ -27,6 +28,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
+ fate-checkasm-sw_scale \
+ fate-checkasm-v210dec \
+ fate-checkasm-v210enc \
++ fate-checkasm-vc1dsp \
+ fate-checkasm-vf_blend \
+ fate-checkasm-vf_colorspace \
+ fate-checkasm-vf_eq \
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0005-fix-flags.diff b/recipes-multimedia/rpidistro-ffmpeg/files/0005-fix-flags.diff
new file mode 100644
index 0000000..ab6f139
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0005-fix-flags.diff
@@ -0,0 +1,22 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+--- a/configure
++++ b/configure
+@@ -6471,11 +6471,9 @@ enabled mbedtls && { check_pkg
+ die "ERROR: mbedTLS not found"; }
+ enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
+ ( enabled rpi ||
+- enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
+- { ! enabled cross_compile &&
+- add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
+- add_ldflags -L/opt/vc/lib/ &&
+- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } ||
++ enabled mmal ) && { { add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
++ add_ldflags -L/opt/vc/lib/ &&
++ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcsm -lvchostif -lvchiq_arm -lvcos; } ||
+ die "ERROR: mmal not found" &&
+ check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
+ enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch
new file mode 100644
index 0000000..f153827
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch
@@ -0,0 +1,82 @@
+From 01e738a8f1414acd0102e432bbc15b4e603fd956 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Thu, 8 Dec 2022 10:34:20 -0600
+Subject: [PATCH] configure: setup for OE-core usage
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+Add global CFLAGS and LDFLAGS so that when
+./configure runs its tests it is able to locate the proper
+headers and libs in a cross-compile environment.
+
+Add new check to opengl. None of the above headers
+exists and we also should be using GLESv2.
+
+Update where compiler finds OMX_Core.h
+
+Only check that sdl2 version greater than 2.0.1
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ configure | 16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+diff --git a/configure b/configure
+index 723b81f1..0c7f2654 100755
+--- a/configure
++++ b/configure
+@@ -5746,6 +5746,9 @@ enable_weak_pic() {
+ }
+
+ enabled pic && enable_weak_pic
++# Set CFLAGS and LDFLAGS globally
++add_cflags -I${sysroot}/usr/include/ -I${sysroot}/usr/include/IL -I${sysroot}/usr/include/drm
++add_ldflags -L${sysroot}/usr/lib/
+
+ test_cc <<EOF || die "Symbol mangling check failed."
+ int ff_extern;
+@@ -6471,8 +6474,7 @@ enabled mbedtls && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt
+ die "ERROR: mbedTLS not found"; }
+ enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
+ ( enabled rpi ||
+- enabled mmal ) && { { add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
+- add_ldflags -L/opt/vc/lib/ &&
++ enabled mmal ) && { { add_cflags -I${sysroot}/usr/include/interface/vmcs_host/linux -I${sysroot}/usr/include/interface/vcos/pthreads -fgnu89-inline &&
+ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcsm -lvchostif -lvchiq_arm -lvcos; } ||
+ die "ERROR: mmal not found" &&
+ check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
+@@ -6492,15 +6494,15 @@ enabled opengl && { check_lib opengl GL/glx.h glXGetProcAddress "-lGL
+ check_lib opengl windows.h wglGetProcAddress "-lopengl32 -lgdi32" ||
+ check_lib opengl OpenGL/gl3.h glGetError "-Wl,-framework,OpenGL" ||
+ check_lib opengl ES2/gl.h glGetError "-isysroot=${sysroot} -Wl,-framework,OpenGLES" ||
++ check_lib opengl GLES2/gl2.h glGetError "-lGLESv2" ||
+ die "ERROR: opengl not found."
+ }
+-enabled omx_rpi && { test_code cc OMX_Core.h OMX_IndexConfigBrcmVideoRequestIFrame ||
++enabled omx_rpi && { test_code cc IL/OMX_Core.h OMX_IndexConfigBrcmVideoRequestIFrame ||
+ { ! enabled cross_compile &&
+- add_cflags -isystem/opt/vc/include/IL &&
+- test_code cc OMX_Core.h OMX_IndexConfigBrcmVideoRequestIFrame; } ||
++ test_code cc IL/OMX_Core.h OMX_IndexConfigBrcmVideoRequestIFrame; } ||
+ die "ERROR: OpenMAX IL headers from raspberrypi/firmware not found"; } &&
+ enable omx
+-enabled omx && require_headers OMX_Core.h
++enabled omx && require_headers IL/OMX_Core.h
+ enabled openssl && { check_pkg_config openssl openssl openssl/ssl.h OPENSSL_init_ssl ||
+ check_pkg_config openssl openssl openssl/ssl.h SSL_library_init ||
+ check_lib openssl openssl/ssl.h OPENSSL_init_ssl -lssl -lcrypto ||
+@@ -6540,7 +6542,7 @@ fi
+
+ if enabled sdl2; then
+ SDL2_CONFIG="${cross_prefix}sdl2-config"
+- test_pkg_config sdl2 "sdl2 >= 2.0.1 sdl2 < 2.1.0" SDL_events.h SDL_PollEvent
++ test_pkg_config sdl2 "sdl2 >= 2.0.1" SDL_events.h SDL_PollEvent
+ if disabled sdl2 && "${SDL2_CONFIG}" --version > /dev/null 2>&1; then
+ sdl2_cflags=$("${SDL2_CONFIG}" --cflags)
+ sdl2_extralibs=$("${SDL2_CONFIG}" --libs)
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch
new file mode 100644
index 0000000..43a9191
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch
@@ -0,0 +1,111 @@
+From be426ad76c3e486f1364dd292cf8e1c633c80e91 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Thu, 8 Dec 2022 10:39:47 -0600
+Subject: [PATCH] libavdevice: opengl_enc.c update dynamic function loader
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+For meta-raspberrypi ffmpeg builds, when opengl
+is enabled do_compile will fail. The reason is that
+glGetProcAddress is undefined in either GLES2/gl2.h
+or GLES2/gl2ext.h.
+
+Define SelectedGetProcAddress as SDL_GL_GetProcAddress
+if sdl2 is included. If it is not included, assign the function
+pointers at compile time rather than at runtime.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ libavdevice/opengl_enc.c | 44 ++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 40 insertions(+), 4 deletions(-)
+
+diff --git a/libavdevice/opengl_enc.c b/libavdevice/opengl_enc.c
+index 2bdb8da7..eabc1bf8 100644
+--- a/libavdevice/opengl_enc.c
++++ b/libavdevice/opengl_enc.c
+@@ -37,12 +37,13 @@
+ #include <OpenGL/gl3.h>
+ #elif HAVE_ES2_GL_H
+ #include <ES2/gl.h>
+-#else
+-#include <GL/gl.h>
+-#include <GL/glext.h>
+ #endif
+ #if HAVE_GLXGETPROCADDRESS
+ #include <GL/glx.h>
++#else
++#define GL_GLEXT_PROTOTYPES
++#include <GLES2/gl2.h>
++#include <GLES2/gl2ext.h>
+ #endif
+
+ #if CONFIG_SDL2
+@@ -493,8 +494,14 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+
+ #if HAVE_GLXGETPROCADDRESS
+ #define SelectedGetProcAddress glXGetProcAddress
++#define CAN_DYNAMIC_LOAD 1
+ #elif HAVE_WGLGETPROCADDRESS
+ #define SelectedGetProcAddress wglGetProcAddress
++#elif CONFIG_SDL2
++#define SelectedGetProcAddress SDL_GL_GetProcAddress
++#define CAN_DYNAMIC_LOAD 1
++#else
++#define CAN_DYNAMIC_LOAD 0
+ #endif
+
+ #define LOAD_OPENGL_FUN(name, type) \
+@@ -504,7 +511,8 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+ return AVERROR(ENOSYS); \
+ }
+
+-#if CONFIG_SDL2
++#if CAN_DYNAMIC_LOAD
++#if CONFIG_SDL2
+ if (!opengl->no_window)
+ return opengl_sdl_load_procedures(opengl);
+ #endif
+@@ -534,9 +542,37 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+ LOAD_OPENGL_FUN(glGetShaderInfoLog, FF_PFNGLGETSHADERINFOLOGPROC)
+ LOAD_OPENGL_FUN(glEnableVertexAttribArray, FF_PFNGLENABLEVERTEXATTRIBARRAYPROC)
+ LOAD_OPENGL_FUN(glVertexAttribPointer, FF_PFNGLVERTEXATTRIBPOINTERPROC)
++#else
++ procs->glActiveTexture = glActiveTexture;
++ procs->glGenBuffers = glGenBuffers;
++ procs->glDeleteBuffers = glDeleteBuffers;
++ procs->glBufferData = glBufferData;
++ procs->glBindBuffer = glBindBuffer;
++ procs->glGetAttribLocation = glGetAttribLocation;
++ procs->glGetUniformLocation = glGetUniformLocation;
++ procs->glUniform1f = glUniform1f;
++ procs->glUniform1i = glUniform1i;
++ procs->glUniformMatrix4fv = glUniformMatrix4fv;
++ procs->glCreateProgram = glCreateProgram;
++ procs->glDeleteProgram = glDeleteProgram;
++ procs->glUseProgram = glUseProgram;
++ procs->glLinkProgram = glLinkProgram;
++ procs->glGetProgramiv = glGetProgramiv;
++ procs->glGetProgramInfoLog = glGetProgramInfoLog;
++ procs->glAttachShader = glAttachShader;
++ procs->glCreateShader = glCreateShader;
++ procs->glDeleteShader = glDeleteShader;
++ procs->glCompileShader = glCompileShader;
++ procs->glShaderSource = glShaderSource;
++ procs->glGetShaderiv = glGetShaderiv;
++ procs->glGetShaderInfoLog = glGetShaderInfoLog;
++ procs->glEnableVertexAttribArray = glEnableVertexAttribArray;
++ procs->glVertexAttribPointer = (FF_PFNGLVERTEXATTRIBPOINTERPROC) glVertexAttribPointer;
++#endif
+
+ return 0;
+
++#undef CAN_DYNAMIC_LOAD
+ #undef SelectedGetProcAddress
+ #undef LOAD_OPENGL_FUN
+ }
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch
new file mode 100644
index 0000000..2232c48
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch
@@ -0,0 +1,45 @@
+From 62c2f041890a6e20770350721a0a2138d0b38634 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Sat, 3 Dec 2022 23:35:51 -0600
+Subject: [PATCH] libavcodec: fix v4l2_req_devscan.h
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+Fixes minor differences between v4l2_req_devscan.c
+and v4l2_req_devscan.h after all patches have been
+applied.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ libavcodec/v4l2_req_devscan.h | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
+index 0baef365..cd9c49ac 100644
+--- a/libavcodec/v4l2_req_devscan.h
++++ b/libavcodec/v4l2_req_devscan.h
+@@ -1,6 +1,8 @@
+ #ifndef _DEVSCAN_H_
+ #define _DEVSCAN_H_
+
++#include <stdint.h>
++
+ struct devscan;
+ struct decdev;
+ enum v4l2_buf_type;
+@@ -13,7 +15,8 @@ const char *decdev_video_path(const struct decdev *const dev);
+ enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
+ uint32_t decdev_src_pixelformat(const struct decdev *const dev);
+
+-const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
++const struct decdev *devscan_find(struct devscan *const scan,
++ const uint32_t src_fmt_v4l2);
+
+ int devscan_build(void * const dc, struct devscan **pscan);
+ void devscan_delete(struct devscan **const pScan);
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
new file mode 100644
index 0000000..02c07de
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
@@ -0,0 +1,35 @@
+From 0dfb56e12fa709794525cda1471091f6699905d5 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Thu, 8 Dec 2022 10:49:03 -0600
+Subject: [PATCH] libavcodec: omx replace /opt/vc path with /usr/lib
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+Configures omx.c for OE usage, as libbcm_host.so
+and libopenmaxil.so are installed in /usr/lib
+rather than /opt/vc/lib.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ libavcodec/omx.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/libavcodec/omx.c b/libavcodec/omx.c
+index 0a6a3083..8c6e9193 100644
+--- a/libavcodec/omx.c
++++ b/libavcodec/omx.c
+@@ -141,7 +141,7 @@ static av_cold OMXContext *omx_init(void *logctx, const char *libname, const cha
+ {
+ static const char * const libnames[] = {
+ #if CONFIG_OMX_RPI
+- "/opt/vc/lib/libopenmaxil.so", "/opt/vc/lib/libbcm_host.so",
++ "/usr/lib/libopenmaxil.so", "/usr/lib/libbcm_host.so",
+ #else
+ "libOMX_Core.so", NULL,
+ "libOmxCore.so", NULL,
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb b/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
new file mode 100644
index 0000000..5a8ff8f
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
@@ -0,0 +1,198 @@
+SUMMARY = "A complete, cross-platform solution to record, convert and stream audio and video."
+DESCRIPTION = "FFmpeg is the leading multimedia framework, able to decode, encode, transcode, \
+ mux, demux, stream, filter and play pretty much anything that humans and machines \
+ have created. It supports the most obscure ancient formats up to the cutting edge."
+HOMEPAGE = "https://www.ffmpeg.org/"
+SECTION = "libs"
+
+LICENSE = "GPL-2.0-or-later & LGPL-2.1-or-later & ISC & MIT & BSD-2-Clause & BSD-3-Clause & IJG"
+LICENSE:${PN} = "GPL-2.0-or-later"
+LICENSE:libavcodec = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libavdevice = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libavfilter = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libavformat = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libavutil = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libpostproc = "GPL-2.0-or-later"
+LICENSE:libswresample = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libswscale = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE_FLAGS = "commercial"
+
+LIC_FILES_CHKSUM = "file://COPYING.GPLv2;md5=b234ee4d69f5fce4486a80fdaf4a4263 \
+ file://COPYING.GPLv3;md5=d32239bcb673463ab874e80d47fae504 \
+ file://COPYING.LGPLv2.1;md5=bd7a443320af8c812e4c18d1b79df004 \
+ file://COPYING.LGPLv3;md5=e6a600fd5e1d9cbde2d983680233ad02"
+
+# Build fails when thumb is enabled: https://bugzilla.yoctoproject.org/show_bug.cgi?id=7717
+ARM_INSTRUCTION_SET:armv4 = "arm"
+ARM_INSTRUCTION_SET:armv5 = "arm"
+ARM_INSTRUCTION_SET:armv6 = "arm"
+# Should be API compatible with libav (which was a fork of ffmpeg)
+# libpostproc was previously packaged from a separate recipe
+PROVIDES = "ffmpeg libav libpostproc"
+RPROVIDES:${PN} = "${PROVIDES}"
+DEPENDS = "nasm-native"
+
+inherit autotools pkgconfig
+PACKAGECONFIG ??= "avdevice avfilter avcodec avformat swresample swscale postproc avresample ffplay \
+ v4l2 drm udev alsa bzlib lzma pic pthreads shared theora zlib libvorbis x264 gpl \
+ ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mmal rpi sand vout-drm', d)} \
+ ${@bb.utils.contains('AVAILTUNES', 'mips32r2', 'mips32r2', '', d)} \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', 'opengl', '', d)} \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'xv xcb vout-egl epoxy', '', d)}"
+
+SRC_URI = "\
+ git://git@github.com/RPi-Distro/ffmpeg;protocol=https;branch=pios/bullseye \
+ file://0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch \
+ file://0002-Fix-build-on-powerpc-and-ppc64.patch \
+ file://0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch \
+ file://0004-ffmpeg-4.3.4-rpi_14.patch \
+ file://0005-fix-flags.diff \
+ file://2001-configure-setup-for-OE-core-usage.patch \
+ file://2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch \
+ file://2003-libavcodec-fix-v4l2_req_devscan.patch \
+ file://2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch \
+ "
+
+SRCREV = "246e1a55a0eca931537d8706acd8b133c07beb05"
+
+S = "${WORKDIR}/git"
+
+# libraries to build in addition to avutil
+PACKAGECONFIG[avdevice] = "--enable-avdevice,--disable-avdevice"
+PACKAGECONFIG[avfilter] = "--enable-avfilter,--disable-avfilter"
+PACKAGECONFIG[avcodec] = "--enable-avcodec,--disable-avcodec"
+PACKAGECONFIG[avformat] = "--enable-avformat,--disable-avformat"
+PACKAGECONFIG[swresample] = "--enable-swresample,--disable-swresample"
+PACKAGECONFIG[swscale] = "--enable-swscale,--disable-swscale"
+PACKAGECONFIG[postproc] = "--enable-postproc,--disable-postproc"
+PACKAGECONFIG[avresample] = "--enable-avresample,--disable-avresample"
+
+# features to support
+PACKAGECONFIG[ffplay] = "--enable-ffplay,--disable-ffplay"
+PACKAGECONFIG[alsa] = "--enable-alsa,--disable-alsa,alsa-lib"
+PACKAGECONFIG[altivec] = "--enable-altivec,--disable-altivec,"
+PACKAGECONFIG[bzlib] = "--enable-bzlib,--disable-bzlib,bzip2"
+PACKAGECONFIG[fdk-aac] = "--enable-libfdk-aac --enable-nonfree,--disable-libfdk-aac,fdk-aac"
+PACKAGECONFIG[gpl] = "--enable-gpl,--disable-gpl"
+PACKAGECONFIG[opengl] = "--enable-opengl,--disable-opengl,virtual/libgles2"
+PACKAGECONFIG[gsm] = "--enable-libgsm,--disable-libgsm,libgsm"
+PACKAGECONFIG[jack] = "--enable-indev=jack,--disable-indev=jack,jack"
+PACKAGECONFIG[libvorbis] = "--enable-libvorbis,--disable-libvorbis,libvorbis"
+PACKAGECONFIG[libopus] = "--enable-libopus,--disable-libopus,libopus"
+PACKAGECONFIG[lzma] = "--enable-lzma,--disable-lzma,xz"
+PACKAGECONFIG[mfx] = "--enable-libmfx,--disable-libmfx,intel-mediasdk"
+PACKAGECONFIG[mp3lame] = "--enable-libmp3lame,--disable-libmp3lame,lame"
+PACKAGECONFIG[openssl] = "--enable-openssl,--disable-openssl,openssl"
+PACKAGECONFIG[sdl2] = "--enable-sdl2,--disable-sdl2,virtual/libsdl2"
+PACKAGECONFIG[speex] = "--enable-libspeex,--disable-libspeex,speex"
+PACKAGECONFIG[srt] = "--enable-libsrt,--disable-libsrt,srt"
+PACKAGECONFIG[theora] = "--enable-libtheora,--disable-libtheora,libtheora libogg"
+PACKAGECONFIG[vaapi] = "--enable-vaapi,--disable-vaapi,libva"
+PACKAGECONFIG[vdpau] = "--enable-vdpau,--disable-vdpau,libvdpau"
+PACKAGECONFIG[vpx] = "--enable-libvpx,--disable-libvpx,libvpx"
+PACKAGECONFIG[x264] = "--enable-libx264,--disable-libx264,x264"
+PACKAGECONFIG[xcb] = "--enable-libxcb,--disable-libxcb,libxcb"
+PACKAGECONFIG[xv] = "--enable-outdev=xv,--disable-outdev=xv,libxv"
+PACKAGECONFIG[zlib] = "--enable-zlib,--disable-zlib,zlib"
+PACKAGECONFIG[snappy] = "--enable-libsnappy,--disable-libsnappy,snappy"
+PACKAGECONFIG[udev] = "--enable-libudev,--disable-libudev,udev"
+PACKAGECONFIG[drm] = "--enable-libdrm,--disable-libdrm,libdrm"
+PACKAGECONFIG[epoxy] = "--enable-epoxy,--disable-epoxy,libepoxy"
+PACKAGECONFIG[v4l2] = "--enable-libv4l2 --enable-v4l2-m2m,,v4l-utils"
+PACKAGECONFIG[mmal] = "--enable-omx --enable-omx-rpi --enable-mmal,,userland"
+PACKAGECONFIG[sand] = "--enable-sand,,"
+PACKAGECONFIG[rpi] = "--enable-rpi,,"
+PACKAGECONFIG[vout-drm] = "--enable-vout-drm,,libdrm"
+PACKAGECONFIG[vout-egl] = "--enable-vout-egl,,virtual/egl"
+
+# other configuration options
+PACKAGECONFIG[mips32r2] = ",--disable-mipsdsp --disable-mipsdspr2"
+PACKAGECONFIG[pic] = "--enable-pic"
+PACKAGECONFIG[pthreads] = "--enable-pthreads,--disable-pthreads"
+PACKAGECONFIG[shared] = "--enable-shared"
+PACKAGECONFIG[strip] = ",--disable-stripping"
+
+# Check codecs that require --enable-nonfree
+USE_NONFREE = "${@bb.utils.contains_any('PACKAGECONFIG', [ 'openssl' ], 'yes', '', d)}"
+
+def cpu(d):
+    # Report the value of the first -mcpu= flag in TUNE_CCARGS, or 'generic' if none is set.
+    tune_flags = (d.getVar('TUNE_CCARGS') or '').split()
+    mcpu_values = (flag[len('-mcpu='):] for flag in tune_flags if flag.startswith('-mcpu='))
+    return next(mcpu_values, 'generic')
+
+EXTRA_OECONF = " \
+ ${@bb.utils.contains('USE_NONFREE', 'yes', '--enable-nonfree', '', d)} \
+ \
+ --cross-prefix=${TARGET_PREFIX} \
+ \
+ --ld="${CCLD}" \
+ --cc="${CC}" \
+ --cxx="${CXX}" \
+ --arch=${TARGET_ARCH} \
+ --target-os="linux" \
+ --enable-cross-compile \
+ --extra-cflags="${CFLAGS} ${HOST_CC_ARCH}${TOOLCHAIN_OPTIONS}" \
+ --extra-ldflags="${LDFLAGS}" \
+ --sysroot="${STAGING_DIR_TARGET}" \
+ ${EXTRA_FFCONF} \
+ --libdir=${libdir} \
+ --shlibdir=${libdir} \
+ --datadir=${datadir}/ffmpeg \
+ --cpu=${@cpu(d)} \
+ --pkg-config=pkg-config \
+"
+EXTRA_OECONF:append:linux-gnux32 = " --disable-asm"
+
+# Some patches introduce assembly files which need preprocessing with
+# gcc, e.g. src/libavutil/aarch64/rpi_sand_neon.S
+TOOLCHAIN = "gcc"
+# gold crashes on x86; another solution is --disable-asm, but that's more hacky
+# ld.gold: internal error in relocate_section, at ../../gold/i386.cc:3684
+LDFLAGS:append:x86 = "${@bb.utils.contains('DISTRO_FEATURES', 'ld-is-gold', ' -fuse-ld=bfd ', '', d)}"
+EXTRA_OEMAKE = "V=1"
+
+do_configure() {
+ ${S}/configure ${EXTRA_OECONF}
+}
+
+# patch out build host paths for reproducibility
+do_compile:prepend:class-target() {
+ sed -i -e "s,${WORKDIR},,g" ${B}/config.h
+}
+
+PACKAGES =+ "libavcodec \
+ libavdevice \
+ libavfilter \
+ libavformat \
+ libavresample \
+ libavutil \
+ libpostproc \
+ libswresample \
+ libswscale"
+
+FILES:${PN}:append = " ${datadir}/ffmpeg"
+FILES:libavcodec = "${libdir}/libavcodec${SOLIBS}"
+FILES:libavdevice = "${libdir}/libavdevice${SOLIBS}"
+FILES:libavfilter = "${libdir}/libavfilter${SOLIBS}"
+FILES:libavformat = "${libdir}/libavformat${SOLIBS}"
+FILES:libavresample = "${libdir}/libavresample${SOLIBS}"
+FILES:libavutil = "${libdir}/libavutil${SOLIBS}"
+FILES:libpostproc = "${libdir}/libpostproc${SOLIBS}"
+FILES:libswresample = "${libdir}/libswresample${SOLIBS}"
+FILES:libswscale = "${libdir}/libswscale${SOLIBS}"
+# ffmpeg disables PIC on some platforms (e.g. x86-32)
+INSANE_SKIP:${MLPREFIX}libavcodec = "textrel"
+INSANE_SKIP:${MLPREFIX}libavdevice = "textrel"
+INSANE_SKIP:${MLPREFIX}libavfilter = "textrel"
+INSANE_SKIP:${MLPREFIX}libavformat = "textrel"
+INSANE_SKIP:${MLPREFIX}libavutil = "textrel"
+INSANE_SKIP:${MLPREFIX}libavresample = "textrel"
+INSANE_SKIP:${MLPREFIX}libswscale = "textrel"
+INSANE_SKIP:${MLPREFIX}libswresample = "textrel"
+INSANE_SKIP:${MLPREFIX}libpostproc = "textrel"
+
+# Only enable it for rpi class of machines
+COMPATIBLE_HOST = "null"
+COMPATIBLE_HOST:rpi = "(.*)"
+
diff --git a/wic/sdimage-raspberrypi.wks b/wic/sdimage-raspberrypi.wks
index 01fbaea..bb41e0f 100644
--- a/wic/sdimage-raspberrypi.wks
+++ b/wic/sdimage-raspberrypi.wks
@@ -2,5 +2,5 @@
# long-description: Creates a partitioned SD card image for use with
# Raspberry Pi. Boot files are located in the first vfat partition.
-part /boot --source bootimg-partition --ondisk mmcblk0 --fstype=vfat --label boot --active --align 4096 --size 20
+part /boot --source bootimg-partition --ondisk mmcblk0 --fstype=vfat --label boot --active --align 4096 --size 100
part / --source rootfs --ondisk mmcblk0 --fstype=ext4 --label root --align 4096